From 30bfce109420912f201d4f295f9130ff44f04b41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Jan 2021 13:06:36 -0800 Subject: net: remove ndo_udp_tunnel_* callbacks All UDP tunnel port management is now routed via udp_tunnel_nic infra directly. Remove the old callbacks. Reviewed-by: Alexander Duyck Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 259be67644e3..1ec3ac5d5bbf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1213,19 +1213,6 @@ struct netdev_net_notifier { * struct netdev_phys_item_id *ppid) * Called to get the parent ID of the physical port of this device. * - * void (*ndo_udp_tunnel_add)(struct net_device *dev, - * struct udp_tunnel_info *ti); - * Called by UDP tunnel to notify a driver about the UDP port and socket - * address family that a UDP tunnel is listnening to. It is called only - * when a new port starts listening. The operation is protected by the - * RTNL. - * - * void (*ndo_udp_tunnel_del)(struct net_device *dev, - * struct udp_tunnel_info *ti); - * Called by UDP tunnel to notify the driver about a UDP port and socket - * address family that the UDP tunnel is not listening to anymore. The - * operation is protected by the RTNL. - * * void* (*ndo_dfwd_add_station)(struct net_device *pdev, * struct net_device *dev) * Called by upper layer devices to accelerate switching or other @@ -1464,10 +1451,6 @@ struct net_device_ops { struct netdev_phys_item_id *ppid); int (*ndo_get_phys_port_name)(struct net_device *dev, char *name, size_t len); - void (*ndo_udp_tunnel_add)(struct net_device *dev, - struct udp_tunnel_info *ti); - void (*ndo_udp_tunnel_del)(struct net_device *dev, - struct udp_tunnel_info *ti); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, -- cgit v1.2.3 From b9ef3fecd1403fc1c2dd146e67b366691394b80f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Jan 2021 13:06:37 -0800 Subject: udp_tunnel: reshuffle NETIF_F_RX_UDP_TUNNEL_PORT checks Move the NETIF_F_RX_UDP_TUNNEL_PORT feature check into udp_tunnel_nic_*_port() helpers, since they're always done right before the call. Add similar checks before calling the notifier. udp_tunnel_nic invokes the notifier without checking features which could result in some wasted cycles. Reviewed-by: Alexander Duyck Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 8 ++++++++ net/ipv4/udp_tunnel_core.c | 10 ---------- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 2ea453dac876..282d10ee60e1 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -129,12 +129,16 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); static inline void udp_tunnel_get_rx_info(struct net_device *dev) { ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); } static inline void udp_tunnel_drop_rx_info(struct net_device *dev) { ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); } @@ -323,6 +327,8 @@ udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, static inline void udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->add_port(dev, ti); } @@ -330,6 +336,8 @@ udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) static inline void udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->del_port(dev, ti); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 376a085be7ed..b97e3635acf5 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,9 +90,6 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; @@ -107,9 +104,6 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; @@ -132,8 +126,6 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; udp_tunnel_nic_add_port(dev, &ti); } rcu_read_unlock(); @@ -154,8 +146,6 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; udp_tunnel_nic_del_port(dev, &ti); } rcu_read_unlock(); -- cgit v1.2.3 From 8b86850bf9ef259e194ce49c776aded3b8c281fb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 6 Jan 2021 09:09:44 -0800 Subject: net: phy: bcm7xxx: Add an entry for BCM72116 BCM72116 features a 28nm integrated EPHY, add an entry to match this PHY OUI. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210106170944.1253046-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm7xxx.c | 2 ++ include/linux/brcmphy.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 15812001b3ff..e79297a4bae8 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -612,6 +612,7 @@ static void bcm7xxx_28nm_remove(struct phy_device *phydev) static struct phy_driver bcm7xxx_driver[] = { BCM7XXX_28NM_EPHY(PHY_ID_BCM72113, "Broadcom BCM72113"), + BCM7XXX_28NM_EPHY(PHY_ID_BCM72116, "Broadcom BCM72116"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7250, "Broadcom BCM7250"), BCM7XXX_28NM_EPHY(PHY_ID_BCM7255, "Broadcom BCM7255"), BCM7XXX_28NM_EPHY(PHY_ID_BCM7260, "Broadcom BCM7260"), @@ -633,6 +634,7 @@ static struct phy_driver bcm7xxx_driver[] = { static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { { PHY_ID_BCM72113, 0xfffffff0 }, + { PHY_ID_BCM72116, 0xfffffff0, }, { PHY_ID_BCM7250, 0xfffffff0, }, { PHY_ID_BCM7255, 0xfffffff0, }, { PHY_ID_BCM7260, 0xfffffff0, }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index d0bd226d6bd9..de9430d55c90 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -31,6 +31,7 @@ #define PHY_ID_BCM89610 0x03625cd0 #define PHY_ID_BCM72113 0x35905310 +#define PHY_ID_BCM72116 0x35905350 #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7255 0xae025120 #define PHY_ID_BCM7260 0xae025190 -- cgit v1.2.3 From d5f19486cee79d04c054427577ac96ed123706db Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 6 Jan 2021 11:51:35 +0200 Subject: net: dsa: listen for SWITCHDEV_{FDB,DEL}_ADD_TO_DEVICE on foreign bridge neighbors Some DSA switches (and not only) cannot learn source MAC addresses from packets injected from the CPU. They only perform hardware address learning from inbound traffic. This can be problematic when we have a bridge spanning some DSA switch ports and some non-DSA ports (which we'll call "foreign interfaces" from DSA's perspective). There are 2 classes of problems created by the lack of learning on CPU-injected traffic: - excessive flooding, due to the fact that DSA treats those addresses as unknown - the risk of stale routes, which can lead to temporary packet loss To illustrate the second class, consider the following situation, which is common in production equipment (wireless access points, where there is a WLAN interface and an Ethernet switch, and these form a single bridging domain). AP 1: +------------------------------------------------------------------------+ | br0 | +------------------------------------------------------------------------+ +------------+ +------------+ +------------+ +------------+ +------------+ | swp0 | | swp1 | | swp2 | | swp3 | | wlan0 | +------------+ +------------+ +------------+ +------------+ +------------+ | ^ ^ | | | | | | | Client A Client B | | | +------------+ +------------+ +------------+ +------------+ +------------+ | swp0 | | swp1 | | swp2 | | swp3 | | wlan0 | +------------+ +------------+ +------------+ +------------+ +------------+ +------------------------------------------------------------------------+ | br0 | +------------------------------------------------------------------------+ AP 2 - br0 of AP 1 will know that Clients A and B are reachable via wlan0 - the hardware fdb of a DSA switch driver today is not kept in sync with the software entries on other bridge ports, so it will not know that clients A and B are reachable via the CPU port UNLESS the hardware switch itself performs SA learning from traffic injected from the CPU. Nonetheless, a substantial number of switches don't. - the hardware fdb of the DSA switch on AP 2 may autonomously learn that Client A and B are reachable through swp0. Therefore, the software br0 of AP 2 also may or may not learn this. In the example we're illustrating, some Ethernet traffic has been going on, and br0 from AP 2 has indeed learnt that it can reach Client B through swp0. One of the wireless clients, say Client B, disconnects from AP 1 and roams to AP 2. The topology now looks like this: AP 1: +------------------------------------------------------------------------+ | br0 | +------------------------------------------------------------------------+ +------------+ +------------+ +------------+ +------------+ +------------+ | swp0 | | swp1 | | swp2 | | swp3 | | wlan0 | +------------+ +------------+ +------------+ +------------+ +------------+ | ^ | | | Client A | | | Client B | | | v +------------+ +------------+ +------------+ +------------+ +------------+ | swp0 | | swp1 | | swp2 | | swp3 | | wlan0 | +------------+ +------------+ +------------+ +------------+ +------------+ +------------------------------------------------------------------------+ | br0 | +------------------------------------------------------------------------+ AP 2 - br0 of AP 1 still knows that Client A is reachable via wlan0 (no change) - br0 of AP 1 will (possibly) know that Client B has left wlan0. There are cases where it might never find out though. Either way, DSA today does not process that notification in any way. - the hardware FDB of the DSA switch on AP 1 may learn autonomously that Client B can be reached via swp0, if it receives any packet with Client 1's source MAC address over Ethernet. - the hardware FDB of the DSA switch on AP 2 still thinks that Client B can be reached via swp0. It does not know that it has roamed to wlan0, because it doesn't perform SA learning from the CPU port. Now Client A contacts Client B. AP 1 routes the packet fine towards swp0 and delivers it on the Ethernet segment. AP 2 sees a frame on swp0 and its fdb says that the destination is swp0. Hairpinning is disabled => drop. This problem comes from the fact that these switches have a 'blind spot' for addresses coming from software bridging. The generic solution is not to assume that hardware learning can be enabled somehow, but to listen to more bridge learning events. It turns out that the bridge driver does learn in software from all inbound frames, in __br_handle_local_finish. A proper SWITCHDEV_FDB_ADD_TO_DEVICE notification is emitted for the addresses serviced by the bridge on 'foreign' interfaces. The software bridge also does the right thing on migration, by notifying that the old entry is deleted, so that does not need to be special-cased in DSA. When it is deleted, we just need to delete our static FDB entry towards the CPU too, and wait. The problem is that DSA currently only cares about SWITCHDEV_FDB_ADD_TO_DEVICE events received on its own interfaces, such as static FDB entries. Luckily we can change that, and DSA can listen to all switchdev FDB add/del events in the system and figure out if those events were emitted by a bridge that spans at least one of DSA's own ports. In case that is true, DSA will also offload that address towards its own CPU port, in the eventuality that there might be bridge clients attached to the DSA switch who want to talk to the station connected to the foreign interface. In terms of implementation, we need to keep the fdb_info->added_by_user check for the case where the switchdev event was targeted directly at a DSA switch port. But we don't need to look at that flag for snooped events. So the check is currently too late, we need to move it earlier. This also simplifies the code a bit, since we avoid uselessly allocating and freeing switchdev_work. We could probably do some improvements in the future. For example, multi-bridge support is rudimentary at the moment. If there are two bridges spanning a DSA switch's ports, and both of them need to service the same MAC address, then what will happen is that the migration of one of those stations will trigger the deletion of the FDB entry from the CPU port while it is still used by other bridge. That could be improved with reference counting but is left for another time. This behavior needs to be enabled at driver level by setting ds->assisted_learning_on_cpu_port = true. This is because we don't want to inflict a potential performance penalty (accesses through MDIO/I2C/SPI are expensive) to hardware that really doesn't need it because address learning on the CPU port works there. Reported-by: DENG Qingfang Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 +++++ net/dsa/slave.c | 66 +++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4e60d2610f20..6b74690bd8d4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -319,6 +319,11 @@ struct dsa_switch { */ bool untag_bridge_pvid; + /* Let DSA manage the FDB entries towards the CPU, based on the + * software bridge database. + */ + bool assisted_learning_on_cpu_port; + /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 37dffe5bc46f..456576f75a50 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2109,6 +2109,28 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) dev_put(dp->slave); } +static int dsa_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + if (dsa_slave_dev_check(lower_dev)) { + priv->data = (void *)netdev_priv(lower_dev); + return 1; + } + + return 0; +} + +static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .data = NULL, + }; + + netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv); + + return (struct dsa_slave_priv *)priv.data; +} + /* Called under rcu_read_lock() */ static int dsa_slave_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) @@ -2127,10 +2149,37 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: - if (!dsa_slave_dev_check(dev)) - return NOTIFY_DONE; + fdb_info = ptr; - dp = dsa_slave_to_port(dev); + if (dsa_slave_dev_check(dev)) { + if (!fdb_info->added_by_user) + return NOTIFY_OK; + + dp = dsa_slave_to_port(dev); + } else { + /* Snoop addresses learnt on foreign interfaces + * bridged with us, for switches that don't + * automatically learn SA from CPU-injected traffic + */ + struct net_device *br_dev; + struct dsa_slave_priv *p; + + br_dev = netdev_master_upper_dev_get_rcu(dev); + if (!br_dev) + return NOTIFY_DONE; + + if (!netif_is_bridge_master(br_dev)) + return NOTIFY_DONE; + + p = dsa_slave_dev_lower_find(br_dev); + if (!p) + return NOTIFY_DONE; + + dp = p->dp->cpu_dp; + + if (!dp->ds->assisted_learning_on_cpu_port) + return NOTIFY_DONE; + } if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del) return NOTIFY_DONE; @@ -2145,18 +2194,13 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, switchdev_work->port = dp->index; switchdev_work->event = event; - fdb_info = ptr; - - if (!fdb_info->added_by_user) { - kfree(switchdev_work); - return NOTIFY_OK; - } - ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; - dev_hold(dev); + /* Hold a reference on the slave for dsa_fdb_offload_notify */ + if (dsa_is_user_port(dp->ds, dp->index)) + dev_hold(dev); dsa_schedule_work(&switchdev_work->work); break; default: -- cgit v1.2.3 From f46b9b8ee89b52f6ee1f4da49d392e609746ab10 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 7 Jan 2021 03:24:00 +0200 Subject: net: dsa: move the Broadcom tag information in a separate header file It is a bit strange to see something as specific as Broadcom SYSTEMPORT bits in the main DSA include file. Move these away into a separate header, and have the tagger and the SYSTEMPORT driver include them. Signed-off-by: Vladimir Oltean Acked-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + drivers/net/ethernet/broadcom/bcmsysport.c | 1 + include/linux/dsa/brcm.h | 16 ++++++++++++++++ include/net/dsa.h | 6 ------ net/dsa/tag_brcm.c | 1 + 5 files changed, 19 insertions(+), 6 deletions(-) create mode 100644 include/linux/dsa/brcm.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 7c1e45c416b1..3854aca806f5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3400,6 +3400,7 @@ L: openwrt-devel@lists.openwrt.org (subscribers-only) S: Supported F: Documentation/devicetree/bindings/net/dsa/brcm,b53.yaml F: drivers/net/dsa/b53/* +F: include/linux/dsa/brcm.h F: include/linux/platform_data/b53.h BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index b1ae9eb8f247..9ef743d6ba67 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/dsa/brcm.h b/include/linux/dsa/brcm.h new file mode 100644 index 000000000000..47545a948784 --- /dev/null +++ b/include/linux/dsa/brcm.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * Copyright (C) 2014 Broadcom Corporation + */ + +/* Included by drivers/net/ethernet/broadcom/bcmsysport.c and + * net/dsa/tag_brcm.c + */ +#ifndef _NET_DSA_BRCM_H +#define _NET_DSA_BRCM_H + +/* Broadcom tag specific helpers to insert and extract queue/port number */ +#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) +#define BRCM_TAG_GET_PORT(v) ((v) >> 8) +#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + +#endif diff --git a/include/net/dsa.h b/include/net/dsa.h index 6b74690bd8d4..e92a09c77f96 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -878,12 +878,6 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, } #endif -/* Broadcom tag specific helpers to insert and extract queue/port number */ -#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) -#define BRCM_TAG_GET_PORT(v) ((v) >> 8) -#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) - - netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e934dace3922..e2577a7dcbca 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -5,6 +5,7 @@ * Copyright (C) 2014 Broadcom Corporation */ +#include #include #include #include -- cgit v1.2.3 From a5e3c9ba9258dbe55cba0cc3804675c9f1eaa169 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 7 Jan 2021 03:24:01 +0200 Subject: net: dsa: export dsa_slave_dev_check Using the NETDEV_CHANGEUPPER notifications, drivers can be aware when they are enslaved to e.g. a bridge by calling netif_is_bridge_master(). Export this helper from DSA to get the equivalent functionality of determining whether the upper interface of a CHANGEUPPER notifier is a DSA switch interface or not. Signed-off-by: Vladimir Oltean Acked-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 6 ++++++ net/dsa/dsa_priv.h | 1 - net/dsa/slave.c | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e92a09c77f96..e218eca128d1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -860,6 +860,7 @@ int register_dsa_notifier(struct notifier_block *nb); int unregister_dsa_notifier(struct notifier_block *nb); int call_dsa_notifiers(unsigned long val, struct net_device *dev, struct dsa_notifier_info *info); +bool dsa_slave_dev_check(const struct net_device *dev); #else static inline int register_dsa_notifier(struct notifier_block *nb) { @@ -876,6 +877,11 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, { return NOTIFY_DONE; } + +static inline bool dsa_slave_dev_check(const struct net_device *dev) +{ + return false; +} #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c04225f74929..4f1bbaab72f2 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -184,7 +184,6 @@ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); -bool dsa_slave_dev_check(const struct net_device *dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 456576f75a50..d7f9fbb93589 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1924,6 +1924,7 @@ bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } +EXPORT_SYMBOL_GPL(dsa_slave_dev_check); static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) -- cgit v1.2.3 From 1dbb130281c447fdd061475931e1eb7baf475f53 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 7 Jan 2021 03:24:03 +0200 Subject: net: dsa: remove the DSA specific notifiers This effectively reverts commit 60724d4bae14 ("net: dsa: Add support for DSA specific notifiers"). The reason is that since commit 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings"), it appears that there is a generic way to achieve the same purpose. The only user thus far, the Broadcom SYSTEMPORT driver, was converted to use the generic notifiers. Signed-off-by: Vladimir Oltean Acked-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 42 ------------------------------------------ net/dsa/dsa.c | 22 ---------------------- net/dsa/slave.c | 17 ----------------- 3 files changed, 81 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e218eca128d1..9a0cab5fb7c4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -833,51 +833,9 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) } #endif /* CONFIG_PM_SLEEP */ -enum dsa_notifier_type { - DSA_PORT_REGISTER, - DSA_PORT_UNREGISTER, -}; - -struct dsa_notifier_info { - struct net_device *dev; -}; - -struct dsa_notifier_register_info { - struct dsa_notifier_info info; /* must be first */ - struct net_device *master; - unsigned int port_number; - unsigned int switch_number; -}; - -static inline struct net_device * -dsa_notifier_info_to_dev(const struct dsa_notifier_info *info) -{ - return info->dev; -} - #if IS_ENABLED(CONFIG_NET_DSA) -int register_dsa_notifier(struct notifier_block *nb); -int unregister_dsa_notifier(struct notifier_block *nb); -int call_dsa_notifiers(unsigned long val, struct net_device *dev, - struct dsa_notifier_info *info); bool dsa_slave_dev_check(const struct net_device *dev); #else -static inline int register_dsa_notifier(struct notifier_block *nb) -{ - return 0; -} - -static inline int unregister_dsa_notifier(struct notifier_block *nb) -{ - return 0; -} - -static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, - struct dsa_notifier_info *info) -{ - return NOTIFY_DONE; -} - static inline bool dsa_slave_dev_check(const struct net_device *dev) { return false; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a1b1dc8a4d87..df75481b12ed 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -309,28 +309,6 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } -static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain); - -int register_dsa_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&dsa_notif_chain, nb); -} -EXPORT_SYMBOL_GPL(register_dsa_notifier); - -int unregister_dsa_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&dsa_notif_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_dsa_notifier); - -int call_dsa_notifiers(unsigned long val, struct net_device *dev, - struct dsa_notifier_info *info) -{ - info->dev = dev; - return atomic_notifier_call_chain(&dsa_notif_chain, val, info); -} -EXPORT_SYMBOL_GPL(call_dsa_notifiers); - int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d7f9fbb93589..c8b842ac2600 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1764,20 +1764,6 @@ int dsa_slave_resume(struct net_device *slave_dev) return 0; } -static void dsa_slave_notify(struct net_device *dev, unsigned long val) -{ - struct net_device *master = dsa_slave_to_master(dev); - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_notifier_register_info rinfo = { - .switch_number = dp->ds->index, - .port_number = dp->index, - .master = master, - .info.dev = dev, - }; - - call_dsa_notifiers(val, dev, &rinfo.info); -} - int dsa_slave_create(struct dsa_port *port) { const struct dsa_port *cpu_dp = port->cpu_dp; @@ -1863,8 +1849,6 @@ int dsa_slave_create(struct dsa_port *port) goto out_gcells; } - dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); - rtnl_lock(); ret = register_netdevice(slave_dev); @@ -1913,7 +1897,6 @@ void dsa_slave_destroy(struct net_device *slave_dev) phylink_disconnect_phy(dp->pl); rtnl_unlock(); - dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); phylink_destroy(dp->pl); gro_cells_destroy(&p->gcells); free_percpu(slave_dev->tstats); -- cgit v1.2.3 From 424f481f06dc7f1528447cc3224f5fd3c5e11f65 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:29 -0800 Subject: skbuff: remove unused skb_zcopy_abort function skb_zcopy_abort() has no in-tree consumers, remove it. Signed-off-by: Jonathan Lemon Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 333bcdc39635..3ca8d7c7b30c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1490,17 +1490,6 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) } } -/* Abort a zerocopy operation and revert zckey on error in send syscall */ -static inline void skb_zcopy_abort(struct sk_buff *skb) -{ - struct ubuf_info *uarg = skb_zcopy(skb); - - if (uarg) { - sock_zerocopy_put_abort(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; - } -} - static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; -- cgit v1.2.3 From 75518851a2a0c047def521aaf87db401de098352 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:31 -0800 Subject: skbuff: Push status and refcounts into sock_zerocopy_callback Before this change, the caller of sock_zerocopy_callback would need to save the zerocopy status, decrement and check the refcount, and then call the callback function - the callback was only invoked when the refcount reached zero. Now, the caller just passes the status into the callback function, which saves the status and handles its own refcounts. This makes the behavior of the sock_zerocopy_callback identical to the tpacket and vhost callbacks. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 --- net/core/skbuff.c | 14 +++++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ca8d7c7b30c..52e96c35f5af 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1479,9 +1479,6 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) if (uarg) { if (skb_zcopy_is_nouarg(skb)) { /* no notification callback */ - } else if (uarg->callback == sock_zerocopy_callback) { - uarg->zerocopy = uarg->zerocopy && zerocopy; - sock_zerocopy_put(uarg); } else { uarg->callback(uarg, zerocopy); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d88963f47f7d..8c18940723ff 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1194,7 +1194,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +static void __sock_zerocopy_callback(struct ubuf_info *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1222,7 +1222,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; - if (!success) + if (!uarg->zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; @@ -1241,11 +1241,19 @@ release: consume_skb(skb); sock_put(sk); } + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +{ + uarg->zerocopy = uarg->zerocopy & success; + + if (refcount_dec_and_test(&uarg->refcnt)) + __sock_zerocopy_callback(uarg); +} EXPORT_SYMBOL_GPL(sock_zerocopy_callback); void sock_zerocopy_put(struct ubuf_info *uarg) { - if (uarg && refcount_dec_and_test(&uarg->refcnt)) + if (uarg) uarg->callback(uarg, uarg->zerocopy); } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -- cgit v1.2.3 From 59776362b14b81dc45c6bbd4e7cb3871f390fc1b Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:32 -0800 Subject: skbuff: replace sock_zerocopy_put() with skb_zcopy_put() Replace sock_zerocopy_put with the generic skb_zcopy_put() function. Pass 'true' as the success argument, as this is identical to no change. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 7 ++++++- net/core/skbuff.c | 9 +-------- net/ipv4/tcp.c | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 52e96c35f5af..a6c86839035b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -496,7 +496,6 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg) refcount_inc(&uarg->refcnt); } -void sock_zerocopy_put(struct ubuf_info *uarg); void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); @@ -1471,6 +1470,12 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } +static inline void skb_zcopy_put(struct ubuf_info *uarg) +{ + if (uarg) + uarg->callback(uarg, true); +} + /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8c18940723ff..0e028825367a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1251,13 +1251,6 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) } EXPORT_SYMBOL_GPL(sock_zerocopy_callback); -void sock_zerocopy_put(struct ubuf_info *uarg) -{ - if (uarg) - uarg->callback(uarg, uarg->zerocopy); -} -EXPORT_SYMBOL_GPL(sock_zerocopy_put); - void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { @@ -1267,7 +1260,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg->len--; if (have_uref) - sock_zerocopy_put(uarg); + skb_zcopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed42d2193c5c..298a1fae841c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1429,7 +1429,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - sock_zerocopy_put(uarg); + skb_zcopy_put(uarg); return copied + copied_syn; do_error: -- cgit v1.2.3 From e76d46cfff8d2335f363f58bd7387de4059d871d Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:33 -0800 Subject: skbuff: replace sock_zerocopy_get with skb_zcopy_get Rename the get routines for consistency. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 12 ++++++------ net/core/skbuff.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a6c86839035b..5b8a53ab51fd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -491,11 +491,6 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -static inline void sock_zerocopy_get(struct ubuf_info *uarg) -{ - refcount_inc(&uarg->refcnt); -} - void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); @@ -1441,6 +1436,11 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline void skb_zcopy_get(struct ubuf_info *uarg) +{ + refcount_inc(&uarg->refcnt); +} + static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { @@ -1448,7 +1448,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, if (unlikely(have_ref && *have_ref)) *have_ref = false; else - sock_zerocopy_get(uarg); + skb_zcopy_get(uarg); skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0e028825367a..00f195908e79 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1163,7 +1163,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) - sock_zerocopy_get(uarg); + skb_zcopy_get(uarg); return uarg; } -- cgit v1.2.3 From 36177832f42d9c7b222ab039678398a9d4070fff Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:34 -0800 Subject: skbuff: Add skb parameter to the ubuf zerocopy callback Add an optional skb parameter to the zerocopy callback parameter, which is passed down from skb_zcopy_clear(). This gives access to the original skb, which is needed for upcoming RX zero-copy error handling. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/xen-netback/common.h | 3 ++- drivers/net/xen-netback/netback.c | 5 +++-- drivers/vhost/net.c | 3 ++- include/linux/skbuff.h | 17 ++++++++--------- net/core/skbuff.c | 3 ++- 7 files changed, 19 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 1f4bdd94407a..3f51f3766d18 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -727,7 +727,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; } else if (msg_control) { struct ubuf_info *uarg = msg_control; - uarg->callback(uarg, false); + uarg->callback(NULL, uarg, false); } if (tap) { diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 978ac0981d16..862c9063aa09 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1819,7 +1819,7 @@ drop: skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; } else if (msg_control) { struct ubuf_info *uarg = msg_control; - uarg->callback(uarg, false); + uarg->callback(NULL, uarg, false); } skb_reset_network_header(skb); diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 8ee24e351bdc..4a16d6e33c09 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -399,7 +399,8 @@ void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb); void xenvif_carrier_on(struct xenvif *vif); /* Callback from stack when TX packet can be released */ -void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success); +void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf, + bool zerocopy_success); /* Unmap a pending page and release it back to the guest */ void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx); diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index bc3421d14576..19a27dce79d2 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1091,7 +1091,7 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s uarg = skb_shinfo(skb)->destructor_arg; /* increase inflight counter to offset decrement in callback */ atomic_inc(&queue->inflight_packets); - uarg->callback(uarg, true); + uarg->callback(NULL, uarg, true); skb_shinfo(skb)->destructor_arg = NULL; /* Fill the skb with the new (local) frags. */ @@ -1228,7 +1228,8 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) return work_done; } -void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success) +void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf, + bool zerocopy_success) { unsigned long flags; pending_ring_idx_t index; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index c8784dfafdd7..f834a097895e 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -381,7 +381,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, } } -static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) +static void vhost_zerocopy_callback(struct sk_buff *skb, + struct ubuf_info *ubuf, bool success) { struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b8a53ab51fd..b23c3b4b3209 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -461,7 +461,8 @@ enum { * The desc field is used to track userspace buffer index. */ struct ubuf_info { - void (*callback)(struct ubuf_info *, bool zerocopy_success); + void (*callback)(struct sk_buff *, struct ubuf_info *, + bool zerocopy_success); union { struct { unsigned long desc; @@ -493,7 +494,8 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); -void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); +void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, @@ -1473,20 +1475,17 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) static inline void skb_zcopy_put(struct ubuf_info *uarg) { if (uarg) - uarg->callback(uarg, true); + uarg->callback(NULL, uarg, true); } /* Release a reference on a zerocopy structure */ -static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) +static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { - if (skb_zcopy_is_nouarg(skb)) { - /* no notification callback */ - } else { - uarg->callback(uarg, zerocopy); - } + if (!skb_zcopy_is_nouarg(skb)) + uarg->callback(skb, uarg, zerocopy_success); skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00f195908e79..89130b21d9f0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1242,7 +1242,8 @@ release: sock_put(sk); } -void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { uarg->zerocopy = uarg->zerocopy & success; -- cgit v1.2.3 From 236a6b1cd585a408139550201343f3f16f9324b9 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:35 -0800 Subject: skbuff: Call sock_zerocopy_put_abort from skb_zcopy_put_abort The sock_zerocopy_put_abort function contains logic which is specific to the current zerocopy implementation. Add a wrapper which checks the callback and dispatches apppropriately. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 10 ++++++++++ net/core/skbuff.c | 12 +++++------- net/ipv4/ip_output.c | 3 +-- net/ipv4/tcp.c | 2 +- net/ipv6/ip6_output.c | 3 +-- 5 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b23c3b4b3209..9f7393167f0a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1478,6 +1478,16 @@ static inline void skb_zcopy_put(struct ubuf_info *uarg) uarg->callback(NULL, uarg, true); } +static inline void skb_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) +{ + if (uarg) { + if (uarg->callback == sock_zerocopy_callback) + sock_zerocopy_put_abort(uarg, have_uref); + else if (have_uref) + skb_zcopy_put(uarg); + } +} + /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 89130b21d9f0..5b9cd528d6a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1254,15 +1254,13 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { - if (uarg) { - struct sock *sk = skb_from_uarg(uarg)->sk; + struct sock *sk = skb_from_uarg(uarg)->sk; - atomic_dec(&sk->sk_zckey); - uarg->len--; + atomic_dec(&sk->sk_zckey); + uarg->len--; - if (have_uref) - skb_zcopy_put(uarg); - } + if (have_uref) + sock_zerocopy_callback(NULL, uarg, true); } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 89fff5f59eea..bae9b29e17a3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1230,8 +1230,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - if (uarg) - sock_zerocopy_put_abort(uarg, extra_uref); + skb_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 298a1fae841c..fb58215972ba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1440,7 +1440,7 @@ do_fault: if (copied + copied_syn) goto out; out_err: - sock_zerocopy_put_abort(uarg, true); + skb_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 749ad72386b2..c8c87891533a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1715,8 +1715,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - if (uarg) - sock_zerocopy_put_abort(uarg, extra_uref); + skb_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3 From 8c793822c5803e01d03f71c431f59316f0b278b7 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:37 -0800 Subject: skbuff: rename sock_zerocopy_* to msg_zerocopy_* At Willem's suggestion, rename the sock_zerocopy_* functions so that they match the MSG_ZEROCOPY flag, which makes it clear they are specific to this zerocopy implementation. Signed-off-by: Jonathan Lemon Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 +++++++++--------- net/core/skbuff.c | 30 +++++++++++++++--------------- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv6/ip6_output.c | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9f7393167f0a..f6a3cee5c967 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -488,14 +488,14 @@ struct ubuf_info { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); -struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); -struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); +struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg); -void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); -void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success); +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, @@ -1481,8 +1481,8 @@ static inline void skb_zcopy_put(struct ubuf_info *uarg) static inline void skb_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { - if (uarg->callback == sock_zerocopy_callback) - sock_zerocopy_put_abort(uarg, have_uref); + if (uarg->callback == msg_zerocopy_callback) + msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) skb_zcopy_put(uarg); } @@ -2776,7 +2776,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) if (likely(!skb_zcopy(skb))) return 0; if (!skb_zcopy_is_nouarg(skb) && - skb_uarg(skb)->callback == sock_zerocopy_callback) + skb_uarg(skb)->callback == msg_zerocopy_callback) return 0; return skb_copy_ubufs(skb, gfp_mask); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6d031ed99182..bcd56763952e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1094,7 +1094,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; struct sk_buff *skb; @@ -1114,7 +1114,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->callback = sock_zerocopy_callback; + uarg->callback = msg_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; @@ -1124,15 +1124,15 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) return uarg; } -EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); +EXPORT_SYMBOL_GPL(msg_zerocopy_alloc); static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } -struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) { if (uarg) { const u32 byte_limit = 1 << 19; /* limit to a few TSO */ @@ -1171,9 +1171,9 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return sock_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size); } -EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); +EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) { @@ -1195,7 +1195,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -static void __sock_zerocopy_callback(struct ubuf_info *uarg) +static void __msg_zerocopy_callback(struct ubuf_info *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1243,17 +1243,17 @@ release: sock_put(sk); } -void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { uarg->zerocopy = uarg->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) - __sock_zerocopy_callback(uarg); + __msg_zerocopy_callback(uarg); } -EXPORT_SYMBOL_GPL(sock_zerocopy_callback); +EXPORT_SYMBOL_GPL(msg_zerocopy_callback); -void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1261,9 +1261,9 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg->len--; if (have_uref) - sock_zerocopy_callback(NULL, uarg, true); + msg_zerocopy_callback(NULL, uarg, true); } -EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); +EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bae9b29e17a3..ffee03729285 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1018,7 +1018,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fb58215972ba..2882d520f5b1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1217,7 +1217,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c8c87891533a..f59cfa39686a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1471,7 +1471,7 @@ emsgsize: csummode = CHECKSUM_PARTIAL; if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ -- cgit v1.2.3 From 06b4feb37e64e543714c971a4162a75e2e4024d4 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:38 -0800 Subject: net: group skb_shinfo zerocopy related bits together. In preparation for expanded zerocopy (TX and RX), move the zerocopy related bits out of tx_flags into their own flag word. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 3 +-- drivers/net/tun.c | 3 +-- drivers/net/xen-netback/interface.c | 4 ++-- include/linux/skbuff.h | 38 ++++++++++++++++++++----------------- net/core/skbuff.c | 9 ++++----- net/ipv4/tcp.c | 2 +- net/kcm/kcmsock.c | 4 ++-- 7 files changed, 32 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 3f51f3766d18..f7a19d9b7c27 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -723,8 +723,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_shinfo(skb)->destructor_arg = msg_control; - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->callback(NULL, uarg, false); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 862c9063aa09..d8f1c3d34612 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1815,8 +1815,7 @@ drop: /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_shinfo(skb)->destructor_arg = msg_control; - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->callback(NULL, uarg, false); diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index acb786d8b1d8..08b0e3d0b7eb 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -47,7 +47,7 @@ /* Number of bytes allowed on the internal guest Rx queue. */ #define XENVIF_RX_QUEUE_BYTES (XEN_NETIF_RX_RING_SIZE/2 * PAGE_SIZE) -/* This function is used to set SKBTX_DEV_ZEROCOPY as well as +/* This function is used to set SKBFL_ZEROCOPY_ENABLE as well as * increasing the inflight counter. We need to increase the inflight * counter because core driver calls into xenvif_zerocopy_callback * which calls xenvif_skb_zerocopy_complete. @@ -55,7 +55,7 @@ void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue, struct sk_buff *skb) { - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_ENABLE; atomic_inc(&queue->inflight_packets); } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f6a3cee5c967..7b5adc511ac6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -430,28 +430,32 @@ enum { /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, - /* device driver supports TX zero-copy buffers */ - SKBTX_DEV_ZEROCOPY = 1 << 3, - /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, - /* This indicates at least one fragment might be overwritten - * (as in vmsplice(), sendfile() ...) - * If we need to compute a TX checksum, we'll need to copy - * all frags to avoid possible bad checksum - */ - SKBTX_SHARED_FRAG = 1 << 5, - /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; -#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG) #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) +/* Definitions for flags in struct skb_shared_info */ +enum { + /* use zcopy routines */ + SKBFL_ZEROCOPY_ENABLE = BIT(0), + + /* This indicates at least one fragment might be overwritten + * (as in vmsplice(), sendfile() ...) + * If we need to compute a TX checksum, we'll need to copy + * all frags to avoid possible bad checksum + */ + SKBFL_SHARED_FRAG = BIT(1), +}; + +#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) + /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. @@ -506,7 +510,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - __u8 __unused; + __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; @@ -1433,7 +1437,7 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { - bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY; + bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } @@ -1452,14 +1456,14 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, else skb_zcopy_get(uarg); skb_shinfo(skb)->destructor_arg = uarg; - skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); - skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) @@ -1497,7 +1501,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; } } @@ -3323,7 +3327,7 @@ static inline int skb_linearize(struct sk_buff *skb) static inline bool skb_has_shared_frag(const struct sk_buff *skb) { return skb_is_nonlinear(skb) && - skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bcd56763952e..5b9e52cbd087 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1327,7 +1327,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, * @skb: the skb to modify * @gfp_mask: allocation priority * - * This must be called on SKBTX_DEV_ZEROCOPY skb. + * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * @@ -3264,8 +3264,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); - skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); @@ -3954,8 +3953,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + SKBFL_SHARED_FRAG; if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2882d520f5b1..1954190b33c7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1010,7 +1010,7 @@ new_segment: } if (!(flags & MSG_NO_SHARED_FRAGS)) - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; skb->len += copy; skb->data_len += copy; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index a9eb616f5521..d0b56ffbb057 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -786,7 +786,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page, if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; goto coalesced; } @@ -834,7 +834,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page, get_page(page); skb_fill_page_desc(skb, i, page, offset, size); - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; coalesced: skb->len += size; -- cgit v1.2.3 From 04c2d33eabdc76df730e1e356f6b2c656381d7d5 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:39 -0800 Subject: skbuff: add flags to ubuf_info for ubuf setup Currently, when an ubuf is attached to a new skb, the shared flags word is initialized to a fixed value. Instead of doing this, set the default flags in the ubuf, and have new skbs inherit from this default. This is needed when setting up different zerocopy types. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 ++- net/core/skbuff.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7b5adc511ac6..a7bc71d2debb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -480,6 +480,7 @@ struct ubuf_info { }; }; refcount_t refcnt; + u8 flags; struct mmpin { struct user_struct *user; @@ -1456,7 +1457,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, else skb_zcopy_get(uarg); skb_shinfo(skb)->destructor_arg = uarg; - skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags |= uarg->flags; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5b9e52cbd087..a7bcbbff99e3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1119,6 +1119,7 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; + uarg->flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&uarg->refcnt, 1); sock_hold(sk); -- cgit v1.2.3 From 9ee5e5ade033875191a2d2e470033e9cdde44a6a Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:40 -0800 Subject: tap/tun: add skb_zcopy_init() helper for initialization. Replace direct assignments with skb_zcopy_init() for zerocopy cases where a new skb is initialized, without changing the reference counts. Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 3 +-- drivers/net/tun.c | 3 +-- drivers/vhost/net.c | 1 + include/linux/skbuff.h | 9 +++++++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f7a19d9b7c27..3c652c8ac5ba 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -722,8 +722,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, tap = rcu_dereference(q->tap); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { - skb_shinfo(skb)->destructor_arg = msg_control; - skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; + skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->callback(NULL, uarg, false); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d8f1c3d34612..02a93cfdb6b1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1814,8 +1814,7 @@ drop: /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { - skb_shinfo(skb)->destructor_arg = msg_control; - skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; + skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->callback(NULL, uarg, false); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f834a097895e..3b744031ec8f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -903,6 +903,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; + ubuf->flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a7bc71d2debb..68ad45a98810 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1448,6 +1448,12 @@ static inline void skb_zcopy_get(struct ubuf_info *uarg) refcount_inc(&uarg->refcnt); } +static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) +{ + skb_shinfo(skb)->destructor_arg = uarg; + skb_shinfo(skb)->flags |= uarg->flags; +} + static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { @@ -1456,8 +1462,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, *have_ref = false; else skb_zcopy_get(uarg); - skb_shinfo(skb)->destructor_arg = uarg; - skb_shinfo(skb)->flags |= uarg->flags; + skb_zcopy_init(skb, uarg); } } -- cgit v1.2.3 From 8e0449172497a915e79da66b222255dc9b4a5f31 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:41 -0800 Subject: skbuff: Rename skb_zcopy_{get|put} to net_zcopy_{get|put} Unlike the rest of the skb_zcopy_ functions, these routines operate on a 'struct ubuf', not a skb. Remove the 'skb_' prefix from the naming to make things clearer. Suggested-by: Willem de Bruijn Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 10 +++++----- net/core/skbuff.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 68ad45a98810..7a057b1f1eb8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1443,7 +1443,7 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } -static inline void skb_zcopy_get(struct ubuf_info *uarg) +static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } @@ -1461,7 +1461,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, if (unlikely(have_ref && *have_ref)) *have_ref = false; else - skb_zcopy_get(uarg); + net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } @@ -1482,19 +1482,19 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } -static inline void skb_zcopy_put(struct ubuf_info *uarg) +static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->callback(NULL, uarg, true); } -static inline void skb_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) +static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->callback == msg_zerocopy_callback) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) - skb_zcopy_put(uarg); + net_zcopy_put(uarg); } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a7bcbbff99e3..7626a33cce59 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1165,7 +1165,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) - skb_zcopy_get(uarg); + net_zcopy_get(uarg); return uarg; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ffee03729285..102b1998ba3c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1230,7 +1230,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - skb_zcopy_put_abort(uarg, extra_uref); + net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1954190b33c7..2267d21c73a6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1429,7 +1429,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - skb_zcopy_put(uarg); + net_zcopy_put(uarg); return copied + copied_syn; do_error: @@ -1440,7 +1440,7 @@ do_fault: if (copied + copied_syn) goto out; out_err: - skb_zcopy_put_abort(uarg, true); + net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f59cfa39686a..072ce9678616 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1715,7 +1715,7 @@ alloc_new_skb: error_efault: err = -EFAULT; error: - skb_zcopy_put_abort(uarg, extra_uref); + net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3 From 43b5169d8355ccf26d726fbc75f083b2429113e4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 22 Dec 2020 22:09:28 +0100 Subject: net, xdp: Introduce xdp_init_buff utility routine Introduce xdp_init_buff utility routine to initialize xdp_buff fields const over NAPI iterations (e.g. frame_sz or rxq pointer). Rely on xdp_init_buff in all XDP capable drivers. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Duyck Acked-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Shay Agroskin Acked-by: Martin Habets Acked-by: Camelia Groza Acked-by: Marcin Wojtas Link: https://lore.kernel.org/bpf/7f8329b6da1434dc2b05a77f2e800b29628a8913.1608670965.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 3 +-- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 4 ++-- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 4 ++-- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 4 ++-- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 8 ++++---- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 6 +++--- drivers/net/ethernet/intel/ice/ice_txrx.c | 6 +++--- drivers/net/ethernet/intel/igb/igb_main.c | 6 +++--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 +++---- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 7 +++---- drivers/net/ethernet/marvell/mvneta.c | 3 +-- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 8 +++++--- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 3 +-- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/ethernet/qlogic/qede/qede_fp.c | 3 +-- drivers/net/ethernet/sfc/rx.c | 3 +-- drivers/net/ethernet/socionext/netsec.c | 3 +-- drivers/net/ethernet/ti/cpsw.c | 4 ++-- drivers/net/ethernet/ti/cpsw_new.c | 4 ++-- drivers/net/hyperv/netvsc_bpf.c | 3 +-- drivers/net/tun.c | 7 +++---- drivers/net/veth.c | 8 ++++---- drivers/net/virtio_net.c | 6 ++---- drivers/net/xen-netfront.c | 4 ++-- include/net/xdp.h | 7 +++++++ net/bpf/test_run.c | 4 ++-- net/core/dev.c | 8 ++++---- 28 files changed, 68 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 06596fa1f9fe..43331f6967c9 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1634,8 +1634,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = ENA_PAGE_SIZE; + xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); do { xdp_verdict = XDP_PASS; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index fcc262064766..ab805d6750e5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -133,12 +133,12 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, dma_sync_single_for_cpu(&pdev->dev, mapping + offset, *len, bp->rx_dir); txr = rxr->bnapi->tx_ring; + /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ + xdp_init_buff(&xdp, PAGE_SIZE, &rxr->xdp_rxq); xdp.data_hard_start = *data_ptr - offset; xdp.data = *data_ptr; xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; - xdp.rxq = &rxr->xdp_rxq; - xdp.frame_sz = PAGE_SIZE; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index f3b7b443f964..9fc672f075f2 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -547,12 +547,12 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, cpu_addr = (u64)phys_to_virt(cpu_addr); page = virt_to_page((void *)cpu_addr); + xdp_init_buff(&xdp, RCV_FRAG_LEN + XDP_PACKET_HEADROOM, + &rq->xdp_rxq); xdp.data_hard_start = page_address(page); xdp.data = (void *)cpu_addr; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = RCV_FRAG_LEN + XDP_PACKET_HEADROOM; orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 4360ce4d3fb6..26e20b96fd96 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2532,12 +2532,12 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, return XDP_PASS; } + xdp_init_buff(&xdp, DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE, + &dpaa_fq->xdp_rxq); xdp.data = vaddr + fd_off; xdp.data_meta = xdp.data; xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; xdp.data_end = xdp.data + qm_fd_get_length(fd); - xdp.frame_sz = DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE; - xdp.rxq = &dpaa_fq->xdp_rxq; /* We reserve a fixed headroom of 256 bytes under the erratum and we * offer it all to XDP programs to use. If no room is left for the diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index fb0bcd18ec0c..3a7892a66fe5 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -358,14 +358,14 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, if (!xdp_prog) goto out; + xdp_init_buff(&xdp, + DPAA2_ETH_RX_BUF_RAW_SIZE - + (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM), + &ch->xdp_rxq); xdp.data = vaddr + dpaa2_fd_get_offset(fd); xdp.data_end = xdp.data + dpaa2_fd_get_len(fd); xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; xdp_set_data_meta_invalid(&xdp); - xdp.rxq = &ch->xdp_rxq; - - xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - - (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 4aca637d4a23..a87fb8264d0c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2344,7 +2344,7 @@ static void i40e_inc_ntc(struct i40e_ring *rx_ring) **/ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; struct sk_buff *skb = rx_ring->skb; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); unsigned int xdp_xmit = 0; @@ -2352,9 +2352,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) struct xdp_buff xdp; #if (PAGE_SIZE < 8192) - xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); + frame_sz = i40e_rx_frame_truesize(rx_ring, 0); #endif - xdp.rxq = &rx_ring->xdp_rxq; + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *rx_buffer; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index a2d0aad8cfdd..500e93bf6238 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1089,18 +1089,18 @@ ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, */ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_pkts = 0; + unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); unsigned int xdp_res, xdp_xmit = 0; struct bpf_prog *xdp_prog = NULL; struct xdp_buff xdp; bool failure; - xdp.rxq = &rx_ring->xdp_rxq; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); + frame_sz = ice_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 03f78fdb0dcd..cf9e8b7d2c70 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8681,13 +8681,13 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) u16 cleaned_count = igb_desc_unused(rx_ring); unsigned int xdp_xmit = 0; struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; + u32 frame_sz = 0; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = igb_rx_frame_truesize(rx_ring, 0); + frame_sz = igb_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_packets < budget)) { union e1000_adv_rx_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6cbbe09ce8a0..7a7c86835a87 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2291,7 +2291,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, struct ixgbe_ring *rx_ring, const int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; struct ixgbe_adapter *adapter = q_vector->adapter; #ifdef IXGBE_FCOE int ddp_bytes; @@ -2301,12 +2301,11 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, unsigned int xdp_xmit = 0; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; - /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); + frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 4061cd7db5dd..624efcd71569 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1121,19 +1121,18 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct ixgbevf_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; + unsigned int total_rx_bytes = 0, total_rx_packets = 0, frame_sz = 0; struct ixgbevf_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbevf_desc_unused(rx_ring); struct sk_buff *skb = rx_ring->skb; bool xdp_xmit = false; struct xdp_buff xdp; - xdp.rxq = &rx_ring->xdp_rxq; - /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) - xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); + frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); #endif + xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq); while (likely(total_rx_packets < budget)) { struct ixgbevf_rx_buffer *rx_buffer; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index bc4d8d144401..038c6b436cba 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2363,9 +2363,8 @@ static int mvneta_rx_swbm(struct napi_struct *napi, u32 desc_status, frame_sz; struct xdp_buff xdp_buf; + xdp_init_buff(&xdp_buf, PAGE_SIZE, &rxq->xdp_rxq); xdp_buf.data_hard_start = NULL; - xdp_buf.frame_sz = PAGE_SIZE; - xdp_buf.rxq = &rxq->xdp_rxq; sinfo.nr_frags = 0; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 4b1808acef58..5872cb011ae1 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3563,16 +3563,18 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, frag_size = bm_pool->frag_size; if (xdp_prog) { + struct xdp_rxq_info *xdp_rxq; + xdp.data_hard_start = data; xdp.data = data + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM; xdp.data_end = xdp.data + rx_bytes; - xdp.frame_sz = PAGE_SIZE; if (bm_pool->pkt_size == MVPP2_BM_SHORT_PKT_SIZE) - xdp.rxq = &rxq->xdp_rxq_short; + xdp_rxq = &rxq->xdp_rxq_short; else - xdp.rxq = &rxq->xdp_rxq_long; + xdp_rxq = &rxq->xdp_rxq_long; + xdp_init_buff(&xdp, PAGE_SIZE, xdp_rxq); xdp_set_data_meta_invalid(&xdp); ret = mvpp2_run_xdp(port, rxq, xdp_prog, &xdp, pp, &ps); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index c1c9118a66c9..93da9c2d50c0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -682,8 +682,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud /* Protect accesses to: ring->xdp_prog, priv->mac_hash list */ rcu_read_lock(); xdp_prog = rcu_dereference(ring->xdp_prog); - xdp.rxq = &ring->xdp_rxq; - xdp.frame_sz = priv->frag_info[0].frag_stride; + xdp_init_buff(&xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq); doorbell_pending = false; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 7f5851c61218..bc7c81f2b036 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1126,12 +1126,11 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { + xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); xdp->data_hard_start = va; xdp->data = va + headroom; xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; - xdp->rxq = &rq->xdp_rxq; - xdp->frame_sz = rq->buff.frame0_sz; } static struct sk_buff * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 7ba8f4c7f26d..513bc60bcd09 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1822,8 +1822,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz; - xdp.frame_sz = PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM; - xdp.rxq = &rx_ring->xdp_rxq; + xdp_init_buff(&xdp, PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM, + &rx_ring->xdp_rxq); tx_ring = r_vec->xdp_ring; while (pkts_polled < budget) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index ca0ee29a57b5..8d0c6d62022a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1090,12 +1090,11 @@ static bool qede_rx_xdp(struct qede_dev *edev, struct xdp_buff xdp; enum xdp_action act; + xdp_init_buff(&xdp, rxq->rx_buf_seg_size, &rxq->xdp_rxq); xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; - xdp.rxq = &rxq->xdp_rxq; - xdp.frame_sz = rxq->rx_buf_seg_size; /* PAGE_SIZE when XDP enabled */ /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index aaa112877561..eaa6650955d1 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -293,14 +293,13 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, memcpy(rx_prefix, *ehp - efx->rx_prefix_size, efx->rx_prefix_size); + xdp_init_buff(&xdp, efx->rx_page_buf_step, &rx_queue->xdp_rxq_info); xdp.data = *ehp; xdp.data_hard_start = xdp.data - EFX_XDP_HEADROOM; /* No support yet for XDP metadata */ xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + rx_buf->len; - xdp.rxq = &rx_queue->xdp_rxq_info; - xdp.frame_sz = efx->rx_page_buf_step; xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 19d20a6d0d44..945ca9517bf9 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -956,8 +956,7 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) u32 xdp_act = 0; int done = 0; - xdp.rxq = &dring->xdp_rxq; - xdp.frame_sz = PAGE_SIZE; + xdp_init_buff(&xdp, PAGE_SIZE, &dring->xdp_rxq); rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index b0f00b4edd94..78a923391828 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -392,6 +392,8 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + if (status & CPDMA_RX_VLAN_ENCAP) { xdp.data = pa + CPSW_HEADROOM + CPSW_RX_VLAN_ENCAP_HDR_SIZE; @@ -405,8 +407,6 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = pa; - xdp.rxq = &priv->xdp_rxq[ch]; - xdp.frame_sz = PAGE_SIZE; port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 2f5e0ad23ad7..1b3385ec9645 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -335,6 +335,8 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + if (status & CPDMA_RX_VLAN_ENCAP) { xdp.data = pa + CPSW_HEADROOM + CPSW_RX_VLAN_ENCAP_HDR_SIZE; @@ -348,8 +350,6 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = pa; - xdp.rxq = &priv->xdp_rxq[ch]; - xdp.frame_sz = PAGE_SIZE; ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 440486d9c999..14a7ee4c6899 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -44,12 +44,11 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, goto out; } + xdp_init_buff(xdp, PAGE_SIZE, &nvchan->xdp_rxq); xdp->data_hard_start = page_address(page); xdp->data = xdp->data_hard_start + NETVSC_XDP_HDRM; xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; - xdp->rxq = &nvchan->xdp_rxq; - xdp->frame_sz = PAGE_SIZE; memcpy(xdp->data, data, len); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 02a93cfdb6b1..3b7728433a86 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1599,12 +1599,11 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct xdp_buff xdp; u32 act; + xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp.data_hard_start = buf; xdp.data = buf + pad; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; - xdp.rxq = &tfile->xdp_rxq; - xdp.frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { @@ -2342,9 +2341,9 @@ static int tun_xdp_one(struct tun_struct *tun, skb_xdp = true; goto build; } + + xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); xdp_set_data_meta_invalid(xdp); - xdp->rxq = &tfile->xdp_rxq; - xdp->frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, xdp); err = tun_xdp_act(tun, xdp_prog, xdp, act); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 02bfcdf50a7a..25f3601fb6dd 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -654,7 +654,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - u32 pktlen, headroom, act, metalen; + u32 pktlen, headroom, act, metalen, frame_sz; void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; int mac_len, delta, off; @@ -714,11 +714,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, xdp.data = skb_mac_header(skb); xdp.data_end = xdp.data + pktlen; xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; /* SKB "head" area always have tailroom for skb_shared_info */ - xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; - xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); orig_data = xdp.data; orig_data_end = xdp.data_end; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 508408fbe78f..36b0f81bcd7a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -689,12 +689,11 @@ static struct sk_buff *receive_small(struct net_device *dev, page = xdp_page; } + xdp_init_buff(&xdp, buflen, &rq->xdp_rxq); xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; xdp.data_end = xdp.data + len; xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = buflen; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -859,12 +858,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, * the descriptor on if we get an XDP_TX return code. */ data = page_address(xdp_page) + offset; + xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq); xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; xdp.data_end = xdp.data + (len - vi->hdr_len); xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = frame_sz - vi->hdr_len; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index b01848ef4649..329397c60d84 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -864,12 +864,12 @@ static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, u32 act; int err; + xdp_init_buff(xdp, XEN_PAGE_SIZE - XDP_PACKET_HEADROOM, + &queue->xdp_rxq); xdp->data_hard_start = page_address(pdata); xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; - xdp->rxq = &queue->xdp_rxq; - xdp->frame_sz = XEN_PAGE_SIZE - XDP_PACKET_HEADROOM; act = bpf_prog_run_xdp(prog, xdp); switch (act) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 600acb307db6..8a589f7e08e0 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -76,6 +76,13 @@ struct xdp_buff { u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->frame_sz = frame_sz; + xdp->rxq = rxq; +} + /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..a8fa5a9e4137 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -640,10 +640,10 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; - xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - xdp.rxq = &rxqueue->xdp_rxq; + xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, + &rxqueue->xdp_rxq); bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) diff --git a/net/core/dev.c b/net/core/dev.c index 7afbb642e203..e6d758a3c2a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4606,11 +4606,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; int hlen, off; - u32 mac_len; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4649,8 +4649,8 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; - xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4659,7 +4659,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_eth_type = eth->h_proto; rxqueue = netif_get_rxqueue(skb); - xdp->rxq = &rxqueue->xdp_rxq; + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); -- cgit v1.2.3 From be9df4aff65f18caa79b35f88f42c3d5a43af14f Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 22 Dec 2020 22:09:29 +0100 Subject: net, xdp: Introduce xdp_prepare_buff utility routine Introduce xdp_prepare_buff utility routine to initialize per-descriptor xdp_buff fields (e.g. xdp_buff pointers). Rely on xdp_prepare_buff() in all XDP capable drivers. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Duyck Acked-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Shay Agroskin Acked-by: Martin Habets Acked-by: Camelia Groza Acked-by: Marcin Wojtas Link: https://lore.kernel.org/bpf/45f46f12295972a97da8ca01990b3e71501e9d89.1608670965.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 7 +++---- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 5 +---- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 8 ++++---- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 6 ++---- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 14 +++++--------- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 12 ++++++------ drivers/net/ethernet/intel/ice/ice_txrx.c | 9 +++++---- drivers/net/ethernet/intel/igb/igb_main.c | 12 ++++++------ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 ++++++------ drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 12 ++++++------ drivers/net/ethernet/marvell/mvneta.c | 7 ++----- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 8 +++----- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 ++---- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 5 +---- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++++---- drivers/net/ethernet/qlogic/qede/qede_fp.c | 6 ++---- drivers/net/ethernet/sfc/rx.c | 7 ++----- drivers/net/ethernet/socionext/netsec.c | 6 ++---- drivers/net/ethernet/ti/cpsw.c | 16 +++++----------- drivers/net/ethernet/ti/cpsw_new.c | 16 +++++----------- drivers/net/hyperv/netvsc_bpf.c | 5 +---- drivers/net/tun.c | 5 +---- drivers/net/veth.c | 8 ++------ drivers/net/virtio_net.c | 12 ++++-------- drivers/net/xen-netfront.c | 6 ++---- include/net/xdp.h | 12 ++++++++++++ net/bpf/test_run.c | 7 ++----- net/core/dev.c | 20 +++++++++----------- 28 files changed, 105 insertions(+), 152 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 43331f6967c9..1db6cfd2b55c 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1585,10 +1585,9 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) int ret; rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]; - xdp->data = page_address(rx_info->page) + rx_info->page_offset; - xdp_set_data_meta_invalid(xdp); - xdp->data_hard_start = page_address(rx_info->page); - xdp->data_end = xdp->data + rx_ring->ena_bufs[0].len; + xdp_prepare_buff(xdp, page_address(rx_info->page), + rx_info->page_offset, + rx_ring->ena_bufs[0].len, false); /* If for some reason we received a bigger packet than * we expect, then we simply drop it */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index ab805d6750e5..641303894341 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -135,10 +135,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, txr = rxr->bnapi->tx_ring; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ xdp_init_buff(&xdp, PAGE_SIZE, &rxr->xdp_rxq); - xdp.data_hard_start = *data_ptr - offset; - xdp.data = *data_ptr; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = *data_ptr + *len; + xdp_prepare_buff(&xdp, *data_ptr - offset, offset, *len, false); orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 9fc672f075f2..c33b4e837515 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -530,6 +530,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, struct cqe_rx_t *cqe_rx, struct snd_queue *sq, struct rcv_queue *rq, struct sk_buff **skb) { + unsigned char *hard_start, *data; struct xdp_buff xdp; struct page *page; u32 action; @@ -549,10 +550,9 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_init_buff(&xdp, RCV_FRAG_LEN + XDP_PACKET_HEADROOM, &rq->xdp_rxq); - xdp.data_hard_start = page_address(page); - xdp.data = (void *)cpu_addr; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + len; + hard_start = page_address(page); + data = (unsigned char *)cpu_addr; + xdp_prepare_buff(&xdp, hard_start, data - hard_start, len, false); orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 26e20b96fd96..d8e568f6caf3 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2534,10 +2534,8 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, xdp_init_buff(&xdp, DPAA_BP_RAW_SIZE - DPAA_TX_PRIV_DATA_SIZE, &dpaa_fq->xdp_rxq); - xdp.data = vaddr + fd_off; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + qm_fd_get_length(fd); + xdp_prepare_buff(&xdp, vaddr + fd_off - XDP_PACKET_HEADROOM, + XDP_PACKET_HEADROOM, qm_fd_get_length(fd), true); /* We reserve a fixed headroom of 256 bytes under the erratum and we * offer it all to XDP programs to use. If no room is left for the diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 3a7892a66fe5..55b73c1136ca 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -350,7 +350,7 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, struct bpf_prog *xdp_prog; struct xdp_buff xdp; u32 xdp_act = XDP_PASS; - int err; + int err, offset; rcu_read_lock(); @@ -358,14 +358,10 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, if (!xdp_prog) goto out; - xdp_init_buff(&xdp, - DPAA2_ETH_RX_BUF_RAW_SIZE - - (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM), - &ch->xdp_rxq); - xdp.data = vaddr + dpaa2_fd_get_offset(fd); - xdp.data_end = xdp.data + dpaa2_fd_get_len(fd); - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp_set_data_meta_invalid(&xdp); + offset = dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM; + xdp_init_buff(&xdp, DPAA2_ETH_RX_BUF_RAW_SIZE - offset, &ch->xdp_rxq); + xdp_prepare_buff(&xdp, vaddr + offset, XDP_PACKET_HEADROOM, + dpaa2_fd_get_len(fd), false); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index a87fb8264d0c..2574e78f7597 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2406,12 +2406,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - i40e_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = i40e_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 500e93bf6238..422f53997c02 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1104,8 +1104,10 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { + unsigned int offset = ice_rx_offset(rx_ring); union ice_32b_rx_flex_desc *rx_desc; struct ice_rx_buf *rx_buf; + unsigned char *hard_start; struct sk_buff *skb; unsigned int size; u16 stat_err_bits; @@ -1151,10 +1153,9 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) goto construct_skb; } - xdp.data = page_address(rx_buf->page) + rx_buf->page_offset; - xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + size; + hard_start = page_address(rx_buf->page) + rx_buf->page_offset - + offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index cf9e8b7d2c70..6b5adbd9660b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8715,12 +8715,12 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - igb_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = igb_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7a7c86835a87..56dca73d158e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2335,12 +2335,12 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - ixgbe_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = ixgbe_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 624efcd71569..a534a3fb392e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1160,12 +1160,12 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* retrieve a buffer from the ring */ if (!skb) { - xdp.data = page_address(rx_buffer->page) + - rx_buffer->page_offset; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - - ixgbevf_rx_offset(rx_ring); - xdp.data_end = xdp.data + size; + unsigned int offset = ixgbevf_rx_offset(rx_ring); + unsigned char *hard_start; + + hard_start = page_address(rx_buffer->page) + + rx_buffer->page_offset - offset; + xdp_prepare_buff(&xdp, hard_start, offset, size, true); #if (PAGE_SIZE > 4096) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 038c6b436cba..6290bfb6494e 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2263,11 +2263,8 @@ mvneta_swbm_rx_frame(struct mvneta_port *pp, /* Prefetch header */ prefetch(data); - - xdp->data_hard_start = data; - xdp->data = data + pp->rx_offset_correction + MVNETA_MH_SIZE; - xdp->data_end = xdp->data + data_len; - xdp_set_data_meta_invalid(xdp); + xdp_prepare_buff(xdp, data, pp->rx_offset_correction + MVNETA_MH_SIZE, + data_len, false); sinfo = xdp_get_shared_info_from_buff(xdp); sinfo->nr_frags = 0; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 5872cb011ae1..5272f26a3e3e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3565,17 +3565,15 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, if (xdp_prog) { struct xdp_rxq_info *xdp_rxq; - xdp.data_hard_start = data; - xdp.data = data + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM; - xdp.data_end = xdp.data + rx_bytes; - if (bm_pool->pkt_size == MVPP2_BM_SHORT_PKT_SIZE) xdp_rxq = &rxq->xdp_rxq_short; else xdp_rxq = &rxq->xdp_rxq_long; xdp_init_buff(&xdp, PAGE_SIZE, xdp_rxq); - xdp_set_data_meta_invalid(&xdp); + xdp_prepare_buff(&xdp, data, + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM, + rx_bytes, false); ret = mvpp2_run_xdp(port, rxq, xdp_prog, &xdp, pp, &ps); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 93da9c2d50c0..e35e4d7ef4d1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -776,10 +776,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud priv->frag_info[0].frag_size, DMA_FROM_DEVICE); - xdp.data_hard_start = va - frags[0].page_offset; - xdp.data = va; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + length; + xdp_prepare_buff(&xdp, va - frags[0].page_offset, + frags[0].page_offset, length, false); orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index bc7c81f2b036..a63ce7c8b98f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1127,10 +1127,7 @@ static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, u32 len, struct xdp_buff *xdp) { xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); - xdp->data_hard_start = va; - xdp->data = va + headroom; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; + xdp_prepare_buff(xdp, va, headroom, len, false); } static struct sk_buff * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 513bc60bcd09..eeb30680b4dc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1914,10 +1914,10 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) unsigned int dma_off; int act; - xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM; - xdp.data = orig_data; - xdp.data_meta = orig_data; - xdp.data_end = orig_data + pkt_len; + xdp_prepare_buff(&xdp, + rxbuf->frag + NFP_NET_RX_BUF_HEADROOM, + pkt_off - NFP_NET_RX_BUF_HEADROOM, + pkt_len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 8d0c6d62022a..70c8d3cd85c0 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1091,10 +1091,8 @@ static bool qede_rx_xdp(struct qede_dev *edev, enum xdp_action act; xdp_init_buff(&xdp, rxq->rx_buf_seg_size, &rxq->xdp_rxq); - xdp.data_hard_start = page_address(bd->data); - xdp.data = xdp.data_hard_start + *data_offset; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + *len; + xdp_prepare_buff(&xdp, page_address(bd->data), *data_offset, + *len, false); /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index eaa6650955d1..89c5c75f479f 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -294,12 +294,9 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, efx->rx_prefix_size); xdp_init_buff(&xdp, efx->rx_page_buf_step, &rx_queue->xdp_rxq_info); - xdp.data = *ehp; - xdp.data_hard_start = xdp.data - EFX_XDP_HEADROOM; - /* No support yet for XDP metadata */ - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + rx_buf->len; + xdp_prepare_buff(&xdp, *ehp - EFX_XDP_HEADROOM, EFX_XDP_HEADROOM, + rx_buf->len, false); xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 945ca9517bf9..3c53051bdacf 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1015,10 +1015,8 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) dma_dir); prefetch(desc->addr); - xdp.data_hard_start = desc->addr; - xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + pkt_len; + xdp_prepare_buff(&xdp, desc->addr, NETSEC_RXBUF_HEADROOM, + pkt_len, false); if (xdp_prog) { xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 78a923391828..5239318e9686 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -392,21 +392,15 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { - xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + int headroom = CPSW_HEADROOM, size = len; + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); if (status & CPDMA_RX_VLAN_ENCAP) { - xdp.data = pa + CPSW_HEADROOM + - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - xdp.data_end = xdp.data + len - - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - } else { - xdp.data = pa + CPSW_HEADROOM; - xdp.data_end = xdp.data + len; + headroom += CPSW_RX_VLAN_ENCAP_HDR_SIZE; + size -= CPSW_RX_VLAN_ENCAP_HDR_SIZE; } - xdp_set_data_meta_invalid(&xdp); - - xdp.data_hard_start = pa; + xdp_prepare_buff(&xdp, pa, headroom, size, false); port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 1b3385ec9645..94747f82c60b 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -335,21 +335,15 @@ static void cpsw_rx_handler(void *token, int len, int status) } if (priv->xdp_prog) { - xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); + int headroom = CPSW_HEADROOM, size = len; + xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]); if (status & CPDMA_RX_VLAN_ENCAP) { - xdp.data = pa + CPSW_HEADROOM + - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - xdp.data_end = xdp.data + len - - CPSW_RX_VLAN_ENCAP_HDR_SIZE; - } else { - xdp.data = pa + CPSW_HEADROOM; - xdp.data_end = xdp.data + len; + headroom += CPSW_RX_VLAN_ENCAP_HDR_SIZE; + size -= CPSW_RX_VLAN_ENCAP_HDR_SIZE; } - xdp_set_data_meta_invalid(&xdp); - - xdp.data_hard_start = pa; + xdp_prepare_buff(&xdp, pa, headroom, size, false); ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 14a7ee4c6899..d60dcf6c9829 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -45,10 +45,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, } xdp_init_buff(xdp, PAGE_SIZE, &nvchan->xdp_rxq); - xdp->data_hard_start = page_address(page); - xdp->data = xdp->data_hard_start + NETVSC_XDP_HDRM; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; + xdp_prepare_buff(xdp, page_address(page), NETVSC_XDP_HDRM, len, false); memcpy(xdp->data, data, len); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3b7728433a86..702215596889 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1600,10 +1600,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); - xdp.data_hard_start = buf; - xdp.data = buf + pad; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + len; + xdp_prepare_buff(&xdp, buf, pad, len, false); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 25f3601fb6dd..99caae7d1641 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -710,15 +710,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, skb = nskb; } - xdp.data_hard_start = skb->head; - xdp.data = skb_mac_header(skb); - xdp.data_end = xdp.data + pktlen; - xdp.data_meta = xdp.data; - /* SKB "head" area always have tailroom for skb_shared_info */ - frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + frame_sz = skb_end_pointer(skb) - skb->head; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq); + xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true); orig_data = xdp.data; orig_data_end = xdp.data_end; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 36b0f81bcd7a..ba8e63792549 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -690,10 +690,8 @@ static struct sk_buff *receive_small(struct net_device *dev, } xdp_init_buff(&xdp, buflen, &rq->xdp_rxq); - xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; - xdp.data = xdp.data_hard_start + xdp_headroom; - xdp.data_end = xdp.data + len; - xdp.data_meta = xdp.data; + xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len, + xdp_headroom, len, true); orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -859,10 +857,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, */ data = page_address(xdp_page) + offset; xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq); - xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; - xdp.data = data + vi->hdr_len; - xdp.data_end = xdp.data + (len - vi->hdr_len); - xdp.data_meta = xdp.data; + xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len, + VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 329397c60d84..c20b78120bb4 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -866,10 +866,8 @@ static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, xdp_init_buff(xdp, XEN_PAGE_SIZE - XDP_PACKET_HEADROOM, &queue->xdp_rxq); - xdp->data_hard_start = page_address(pdata); - xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; - xdp_set_data_meta_invalid(xdp); - xdp->data_end = xdp->data + len; + xdp_prepare_buff(xdp, page_address(pdata), XDP_PACKET_HEADROOM, + len, false); act = bpf_prog_run_xdp(prog, xdp); switch (act) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 8a589f7e08e0..0cf3976ce77c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -83,6 +83,18 @@ xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) xdp->rxq = rxq; } +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a8fa5a9e4137..23dfb2010ba6 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -636,14 +636,11 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); - xdp.data_hard_start = data; - xdp.data = data + headroom; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + size; - rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, &rxqueue->xdp_rxq); + xdp_prepare_buff(&xdp, data, headroom, size, true); + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) diff --git a/net/core/dev.c b/net/core/dev.c index e6d758a3c2a9..55499b017a42 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4603,14 +4603,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; - int hlen, off; + int off; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4642,25 +4642,23 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * header. */ mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp->data = skb->data - mac_len; - xdp->data_meta = xdp->data; - xdp->data_end = xdp->data + hlen; - xdp->data_hard_start = skb->data - skb_headroom(skb); + hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; - rxqueue = netif_get_rxqueue(skb); - xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); - act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ -- cgit v1.2.3 From 994122211665301080cd232c06ae219b681dcb57 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Jan 2021 17:34:01 -0600 Subject: remoteproc: qcom: expose types for COMPILE_TEST Stub functions are defined for SSR notifier registration in case QCOM_RPROC_COMMON is not configured. As a result, code that uses these functions can link successfully even if the common remoteproc code is not built. Code that registers an SSR notifier function likely needs the types defined in "qcom_rproc.h", but those are only exposed if QCOM_RPROC_COMMON is enabled. Rearrange the conditional definition so the qcom_ssr_notify_data structure and qcom_ssr_notify_type enumerated type are defined whether or not QCOM_RPROC_COMMON is enabled. Reviewed-by: Bjorn Andersson Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- include/linux/remoteproc/qcom_rproc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/remoteproc/qcom_rproc.h b/include/linux/remoteproc/qcom_rproc.h index 647051662174..82b211518136 100644 --- a/include/linux/remoteproc/qcom_rproc.h +++ b/include/linux/remoteproc/qcom_rproc.h @@ -3,8 +3,6 @@ struct notifier_block; -#if IS_ENABLED(CONFIG_QCOM_RPROC_COMMON) - /** * enum qcom_ssr_notify_type - Startup/Shutdown events related to a remoteproc * processor. @@ -26,6 +24,8 @@ struct qcom_ssr_notify_data { bool crashed; }; +#if IS_ENABLED(CONFIG_QCOM_RPROC_COMMON) + void *qcom_register_ssr_notifier(const char *name, struct notifier_block *nb); int qcom_unregister_ssr_notifier(void *notify, struct notifier_block *nb); -- cgit v1.2.3 From ce2ceb9b1cff777c7d8beb368eddc3b8e45381f6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Jan 2021 17:34:02 -0600 Subject: soc: qcom: mdt_loader: define stubs for COMPILE_TEST Define stub functions for the exposed MDT functions in case QCOM_MDT_LOADER is not configured. This allows users of these functions to link correctly for COMPILE_TEST builds without QCOM_SCM enabled. Reviewed-by: Bjorn Andersson Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- include/linux/soc/qcom/mdt_loader.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h index e600baec6825..afd47217996b 100644 --- a/include/linux/soc/qcom/mdt_loader.h +++ b/include/linux/soc/qcom/mdt_loader.h @@ -11,6 +11,8 @@ struct device; struct firmware; +#if IS_ENABLED(CONFIG_QCOM_MDT_LOADER) + ssize_t qcom_mdt_get_size(const struct firmware *fw); int qcom_mdt_load(struct device *dev, const struct firmware *fw, const char *fw_name, int pas_id, void *mem_region, @@ -23,4 +25,37 @@ int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, phys_addr_t *reloc_base); void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len); +#else /* !IS_ENABLED(CONFIG_QCOM_MDT_LOADER) */ + +static inline ssize_t qcom_mdt_get_size(const struct firmware *fw) +{ + return -ENODEV; +} + +static inline int qcom_mdt_load(struct device *dev, const struct firmware *fw, + const char *fw_name, int pas_id, + void *mem_region, phys_addr_t mem_phys, + size_t mem_size, phys_addr_t *reloc_base) +{ + return -ENODEV; +} + +static inline int qcom_mdt_load_no_init(struct device *dev, + const struct firmware *fw, + const char *fw_name, int pas_id, + void *mem_region, phys_addr_t mem_phys, + size_t mem_size, + phys_addr_t *reloc_base) +{ + return -ENODEV; +} + +static inline void *qcom_mdt_read_metadata(const struct firmware *fw, + size_t *data_len) +{ + return ERR_PTR(-ENODEV); +} + +#endif /* !IS_ENABLED(CONFIG_QCOM_MDT_LOADER) */ + #endif -- cgit v1.2.3 From 1d11fa696733ffb9ac24771716b1b1b9953e5a48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jan 2021 03:39:03 -0800 Subject: net-gro: remove GRO_DROP GRO_DROP can only be returned from napi_gro_frags() if the skb has not been allocated by a prior napi_get_frags() Since drivers must use napi_get_frags() and test its result before populating the skb with metadata, we can safely remove GRO_DROP since it offers no practical use. Signed-off-by: Eric Dumazet Cc: Jesse Brandeburg Acked-by: Edward Cree Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - net/core/dev.c | 11 ----------- 2 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ec3ac5d5bbf..5b949076ed23 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -376,7 +376,6 @@ enum gro_result { GRO_MERGED_FREE, GRO_HELD, GRO_NORMAL, - GRO_DROP, GRO_CONSUMED, }; typedef enum gro_result gro_result_t; diff --git a/net/core/dev.c b/net/core/dev.c index 7afbb642e203..e4d77c8abe76 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6070,10 +6070,6 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, gro_normal_one(napi, skb); break; - case GRO_DROP: - kfree_skb(skb); - break; - case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); @@ -6158,10 +6154,6 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, gro_normal_one(napi, skb); break; - case GRO_DROP: - napi_reuse_skb(napi, skb); - break; - case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); @@ -6223,9 +6215,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); - if (!skb) - return GRO_DROP; - trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); -- cgit v1.2.3 From 0f9f696a502e1b01fbb137a08f56f157da9d95eb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 8 Jan 2021 16:47:59 -0800 Subject: mptcp: add set_flags command in PM netlink This patch added a new command MPTCP_PM_CMD_SET_FLAGS in PM netlink: In mptcp_nl_cmd_set_flags, parse the input address, get the backup value according to whether the address's FLAG_BACKUP flag is set from the user-space. Then check whether this address had been added in the local address list. If it had been, then call mptcp_nl_addr_backup to deal with this address. In mptcp_nl_addr_backup, traverse all the existing msk sockets to find the relevant sockets, and call mptcp_pm_nl_mp_prio_send_ack to send out a MP_PRIO ACK packet. Finally in mptcp_nl_cmd_set_flags, set or clear the address's FLAG_BACKUP flag. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 9762660df741..3674a451a18c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -82,6 +82,7 @@ enum { MPTCP_PM_CMD_FLUSH_ADDRS, MPTCP_PM_CMD_SET_LIMITS, MPTCP_PM_CMD_GET_LIMITS, + MPTCP_PM_CMD_SET_FLAGS, __MPTCP_PM_CMD_AFTER_LAST }; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index bf0d13c85a68..8f80099f1657 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1164,6 +1164,66 @@ fail: return -EMSGSIZE; } +static int mptcp_nl_addr_backup(struct net *net, + struct mptcp_addr_info *addr, + u8 bkup) +{ + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + int ret = -EINVAL; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (list_empty(&msk->conn_list)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return ret; +} + +static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct net *net = sock_net(skb->sk); + u8 bkup = 0; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, true, &addr); + if (ret < 0) + return ret; + + if (addr.addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + bkup = 1; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (addresses_equal(&entry->addr, &addr.addr, true)) { + ret = mptcp_nl_addr_backup(net, &entry->addr, bkup); + if (ret) + return ret; + + if (bkup) + entry->addr.flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->addr.flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + } + } + + return 0; +} + static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_ADD_ADDR, @@ -1194,6 +1254,11 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .cmd = MPTCP_PM_CMD_GET_LIMITS, .doit = mptcp_nl_cmd_get_limits, }, + { + .cmd = MPTCP_PM_CMD_SET_FLAGS, + .doit = mptcp_nl_cmd_set_flags, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { -- cgit v1.2.3 From b7a9e0da2d1c954b7c38217a29e002528b90d174 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 9 Jan 2021 02:01:46 +0200 Subject: net: switchdev: remove vid_begin -> vid_end range from VLAN objects The call path of a switchdev VLAN addition to the bridge looks something like this today: nbp_vlan_init | __br_vlan_set_default_pvid | | | | | br_afspec | | | | | | | v | | | br_process_vlan_info | | | | | | | v | | | br_vlan_info | | | / \ / | | / \ / | | / \ / | | / \ / v v v v v nbp_vlan_add br_vlan_add ------+ | ^ ^ | | | / | | | | / / / | \ br_vlan_get_master/ / v \ ^ / / br_vlan_add_existing \ | / / | \ | / / / \ | / / / \ | / / / \ | / / / v | | v / __vlan_add / / | / / | / v | / __vlan_vid_add | / \ | / v v v br_switchdev_port_vlan_add The ranges UAPI was introduced to the bridge in commit bdced7ef7838 ("bridge: support for multiple vlans and vlan ranges in setlink and dellink requests") (Jan 10 2015). But the VLAN ranges (parsed in br_afspec) have always been passed one by one, through struct bridge_vlan_info tmp_vinfo, to br_vlan_info. So the range never went too far in depth. Then Scott Feldman introduced the switchdev_port_bridge_setlink function in commit 47f8328bb1a4 ("switchdev: add new switchdev bridge setlink"). That marked the introduction of the SWITCHDEV_OBJ_PORT_VLAN, which made full use of the range. But switchdev_port_bridge_setlink was called like this: br_setlink -> br_afspec -> switchdev_port_bridge_setlink Basically, the switchdev and the bridge code were not tightly integrated. Then commit 41c498b9359e ("bridge: restore br_setlink back to original") came, and switchdev drivers were required to implement .ndo_bridge_setlink = switchdev_port_bridge_setlink for a while. In the meantime, commits such as 0944d6b5a2fa ("bridge: try switchdev op first in __vlan_vid_add/del") finally made switchdev penetrate the br_vlan_info() barrier and start to develop the call path we have today. But remember, br_vlan_info() still receives VLANs one by one. Then Arkadi Sharshevsky refactored the switchdev API in 2017 in commit 29ab586c3d83 ("net: switchdev: Remove bridge bypass support from switchdev") so that drivers would not implement .ndo_bridge_setlink any longer. The switchdev_port_bridge_setlink also got deleted. This refactoring removed the parallel bridge_setlink implementation from switchdev, and left the only switchdev VLAN objects to be the ones offloaded from __vlan_vid_add (basically RX filtering) and __vlan_add (the latter coming from commit 9c86ce2c1ae3 ("net: bridge: Notify about bridge VLANs")). That is to say, today the switchdev VLAN object ranges are not used in the kernel. Refactoring the above call path is a bit complicated, when the bridge VLAN call path is already a bit complicated. Let's go off and finish the job of commit 29ab586c3d83 by deleting the bogus iteration through the VLAN ranges from the drivers. Some aspects of this feature never made too much sense in the first place. For example, what is a range of VLANs all having the BRIDGE_VLAN_INFO_PVID flag supposed to mean, when a port can obviously have a single pvid? This particular configuration _is_ denied as of commit 6623c60dc28e ("bridge: vlan: enforce no pvid flag in vlan ranges"), but from an API perspective, the driver still has to play pretend, and only offload the vlan->vid_end as pvid. And the addition of a switchdev VLAN object can modify the flags of another, completely unrelated, switchdev VLAN object! (a VLAN that is PVID will invalidate the PVID flag from whatever other VLAN had previously been offloaded with switchdev and had that flag. Yet switchdev never notifies about that change, drivers are supposed to guess). Nonetheless, having a VLAN range in the API makes error handling look scarier than it really is - unwinding on errors and all of that. When in reality, no one really calls this API with more than one VLAN. It is all unnecessary complexity. And despite appearing pretentious (two-phase transactional model and all), the switchdev API is really sloppy because the VLAN addition and removal operations are not paired with one another (you can add a VLAN 100 times and delete it just once). The bridge notifies through switchdev of a VLAN addition not only when the flags of an existing VLAN change, but also when nothing changes. There are switchdev drivers out there who don't like adding a VLAN that has already been added, and those checks don't really belong at driver level. But the fact that the API contains ranges is yet another factor that prevents this from being addressed in the future. Of the existing switchdev pieces of hardware, it appears that only Mellanox Spectrum supports offloading more than one VLAN at a time, through mlxsw_sp_port_vlan_set. I have kept that code internal to the driver, because there is some more bookkeeping that makes use of it, but I deleted it from the switchdev API. But since the switchdev support for ranges has already been de facto deleted by a Mellanox employee and nobody noticed for 4 years, I'm going to assume it's not a biggie. Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel # switchdev and mlxsw Reviewed-by: Florian Fainelli Reviewed-by: Kurt Kanzenbach # hellcreek Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 54 +++++----- drivers/net/dsa/bcm_sf2_cfp.c | 3 +- drivers/net/dsa/dsa_loop.c | 46 ++++----- drivers/net/dsa/hirschmann/hellcreek.c | 22 ++-- drivers/net/dsa/lantiq_gswip.c | 61 +++++------ drivers/net/dsa/microchip/ksz8795.c | 62 ++++++----- drivers/net/dsa/microchip/ksz9477.c | 66 ++++++------ drivers/net/dsa/mt7530.c | 32 +++--- drivers/net/dsa/mv88e6xxx/chip.c | 89 ++++++++-------- drivers/net/dsa/ocelot/felix.c | 42 ++------ drivers/net/dsa/qca8k.c | 17 +-- drivers/net/dsa/rtl8366.c | 115 +++++++++------------ drivers/net/dsa/sja1105/sja1105_main.c | 33 +++--- .../ethernet/marvell/prestera/prestera_switchdev.c | 18 +--- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 63 ++++------- drivers/net/ethernet/mscc/ocelot_net.c | 41 ++------ drivers/net/ethernet/rocker/rocker_ofdpa.c | 20 +--- drivers/net/ethernet/ti/cpsw_switchdev.c | 33 +----- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 47 +++------ include/net/switchdev.h | 3 +- net/bridge/br_switchdev.c | 6 +- net/dsa/slave.c | 23 ++--- 22 files changed, 324 insertions(+), 572 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 85dddd87bcfc..bab174c052b3 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1393,7 +1393,7 @@ int b53_vlan_prepare(struct dsa_switch *ds, int port, { struct b53_device *dev = ds->priv; - if ((is5325(dev) || is5365(dev)) && vlan->vid_begin == 0) + if ((is5325(dev) || is5365(dev)) && vlan->vid == 0) return -EOPNOTSUPP; /* Port 7 on 7278 connects to the ASP's UniMAC which is not capable of @@ -1404,7 +1404,7 @@ int b53_vlan_prepare(struct dsa_switch *ds, int port, !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return -EINVAL; - if (vlan->vid_end > dev->num_vlans) + if (vlan->vid > dev->num_vlans) return -ERANGE; b53_enable_vlan(dev, true, ds->vlan_filtering); @@ -1420,30 +1420,27 @@ void b53_vlan_add(struct dsa_switch *ds, int port, bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct b53_vlan *vl; - u16 vid; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - vl = &dev->vlans[vid]; + vl = &dev->vlans[vlan->vid]; - b53_get_vlan_entry(dev, vid, vl); + b53_get_vlan_entry(dev, vlan->vid, vl); - if (vid == 0 && vid == b53_default_pvid(dev)) - untagged = true; + if (vlan->vid == 0 && vlan->vid == b53_default_pvid(dev)) + untagged = true; - vl->members |= BIT(port); - if (untagged && !dsa_is_cpu_port(ds, port)) - vl->untag |= BIT(port); - else - vl->untag &= ~BIT(port); + vl->members |= BIT(port); + if (untagged && !dsa_is_cpu_port(ds, port)) + vl->untag |= BIT(port); + else + vl->untag &= ~BIT(port); - b53_set_vlan_entry(dev, vid, vl); - b53_fast_age_vlan(dev, vid); - } + b53_set_vlan_entry(dev, vlan->vid, vl); + b53_fast_age_vlan(dev, vlan->vid); if (pvid && !dsa_is_cpu_port(ds, port)) { b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), - vlan->vid_end); - b53_fast_age_vlan(dev, vid); + vlan->vid); + b53_fast_age_vlan(dev, vlan->vid); } } EXPORT_SYMBOL(b53_vlan_add); @@ -1454,27 +1451,24 @@ int b53_vlan_del(struct dsa_switch *ds, int port, struct b53_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct b53_vlan *vl; - u16 vid; u16 pvid; b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - vl = &dev->vlans[vid]; + vl = &dev->vlans[vlan->vid]; - b53_get_vlan_entry(dev, vid, vl); + b53_get_vlan_entry(dev, vlan->vid, vl); - vl->members &= ~BIT(port); + vl->members &= ~BIT(port); - if (pvid == vid) - pvid = b53_default_pvid(dev); + if (pvid == vlan->vid) + pvid = b53_default_pvid(dev); - if (untagged && !dsa_is_cpu_port(ds, port)) - vl->untag &= ~(BIT(port)); + if (untagged && !dsa_is_cpu_port(ds, port)) + vl->untag &= ~(BIT(port)); - b53_set_vlan_entry(dev, vid, vl); - b53_fast_age_vlan(dev, vid); - } + b53_set_vlan_entry(dev, vlan->vid, vl); + b53_fast_age_vlan(dev, vlan->vid); b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), pvid); b53_fast_age_vlan(dev, pvid); diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index d82cee5d9202..59d799ac1b60 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -885,8 +885,7 @@ static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port, return -EINVAL; vid = be16_to_cpu(fs->h_ext.vlan_tci) & VLAN_VID_MASK; - vlan.vid_begin = vid; - vlan.vid_end = vid; + vlan.vid = vid; if (cpu_to_be32(fs->h_ext.data[1]) & 1) vlan.flags = BRIDGE_VLAN_INFO_UNTAGGED; else diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index e38906ae8f23..3be9f665d174 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -206,13 +206,12 @@ dsa_loop_port_vlan_prepare(struct dsa_switch *ds, int port, struct dsa_loop_priv *ps = ds->priv; struct mii_bus *bus = ps->bus; - dev_dbg(ds->dev, "%s: port: %d, vlan: %d-%d", - __func__, port, vlan->vid_begin, vlan->vid_end); + dev_dbg(ds->dev, "%s: port: %d, vlan: %d", __func__, port, vlan->vid); /* Just do a sleeping operation to make lockdep checks effective */ mdiobus_read(bus, ps->port_base + port, MII_BMSR); - if (vlan->vid_end > ARRAY_SIZE(ps->vlans)) + if (vlan->vid > ARRAY_SIZE(ps->vlans)) return -ERANGE; return 0; @@ -226,26 +225,23 @@ static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, struct dsa_loop_priv *ps = ds->priv; struct mii_bus *bus = ps->bus; struct dsa_loop_vlan *vl; - u16 vid; /* Just do a sleeping operation to make lockdep checks effective */ mdiobus_read(bus, ps->port_base + port, MII_BMSR); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - vl = &ps->vlans[vid]; + vl = &ps->vlans[vlan->vid]; - vl->members |= BIT(port); - if (untagged) - vl->untagged |= BIT(port); - else - vl->untagged &= ~BIT(port); + vl->members |= BIT(port); + if (untagged) + vl->untagged |= BIT(port); + else + vl->untagged &= ~BIT(port); - dev_dbg(ds->dev, "%s: port: %d vlan: %d, %stagged, pvid: %d\n", - __func__, port, vid, untagged ? "un" : "", pvid); - } + dev_dbg(ds->dev, "%s: port: %d vlan: %d, %stagged, pvid: %d\n", + __func__, port, vlan->vid, untagged ? "un" : "", pvid); if (pvid) - ps->ports[port].pvid = vid; + ps->ports[port].pvid = vlan->vid; } static int dsa_loop_port_vlan_del(struct dsa_switch *ds, int port, @@ -253,26 +249,24 @@ static int dsa_loop_port_vlan_del(struct dsa_switch *ds, int port, { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct dsa_loop_priv *ps = ds->priv; + u16 pvid = ps->ports[port].pvid; struct mii_bus *bus = ps->bus; struct dsa_loop_vlan *vl; - u16 vid, pvid = ps->ports[port].pvid; /* Just do a sleeping operation to make lockdep checks effective */ mdiobus_read(bus, ps->port_base + port, MII_BMSR); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - vl = &ps->vlans[vid]; + vl = &ps->vlans[vlan->vid]; - vl->members &= ~BIT(port); - if (untagged) - vl->untagged &= ~BIT(port); + vl->members &= ~BIT(port); + if (untagged) + vl->untagged &= ~BIT(port); - if (pvid == vid) - pvid = 1; + if (pvid == vlan->vid) + pvid = 1; - dev_dbg(ds->dev, "%s: port: %d vlan: %d, %stagged, pvid: %d\n", - __func__, port, vid, untagged ? "un" : "", pvid); - } + dev_dbg(ds->dev, "%s: port: %d vlan: %d, %stagged, pvid: %d\n", + __func__, port, vlan->vid, untagged ? "un" : "", pvid); ps->ports[port].pvid = pvid; return 0; diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 6420b76ea37c..a59cc40170fc 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -348,14 +348,12 @@ static int hellcreek_vlan_prepare(struct dsa_switch *ds, int port, */ for (i = 0; i < hellcreek->pdata->num_ports; ++i) { const u16 restricted_vid = hellcreek_private_vid(i); - u16 vid; if (!dsa_is_user_port(ds, i)) continue; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) - if (vid == restricted_vid) - return -EBUSY; + if (vlan->vid == restricted_vid) + return -EBUSY; } return 0; @@ -446,28 +444,22 @@ static void hellcreek_vlan_add(struct dsa_switch *ds, int port, bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct hellcreek *hellcreek = ds->priv; - u16 vid; - dev_dbg(hellcreek->dev, "Add VLANs (%d -- %d) on port %d, %s, %s\n", - vlan->vid_begin, vlan->vid_end, port, - untagged ? "untagged" : "tagged", + dev_dbg(hellcreek->dev, "Add VLAN %d on port %d, %s, %s\n", + vlan->vid, port, untagged ? "untagged" : "tagged", pvid ? "PVID" : "no PVID"); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) - hellcreek_apply_vlan(hellcreek, port, vid, pvid, untagged); + hellcreek_apply_vlan(hellcreek, port, vlan->vid, pvid, untagged); } static int hellcreek_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct hellcreek *hellcreek = ds->priv; - u16 vid; - dev_dbg(hellcreek->dev, "Remove VLANs (%d -- %d) on port %d\n", - vlan->vid_begin, vlan->vid_end, port); + dev_dbg(hellcreek->dev, "Remove VLAN %d on port %d\n", vlan->vid, port); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) - hellcreek_unapply_vlan(hellcreek, port, vid); + hellcreek_unapply_vlan(hellcreek, port, vlan->vid); return 0; } diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 662e68a0e7e6..72cadd16056f 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1146,43 +1146,38 @@ static int gswip_port_vlan_prepare(struct dsa_switch *ds, int port, struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; unsigned int max_ports = priv->hw_info->max_ports; - u16 vid; - int i; int pos = max_ports; + int i, idx = -1; /* We only support VLAN filtering on bridges */ if (!dsa_is_cpu_port(ds, port) && !bridge) return -EOPNOTSUPP; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - int idx = -1; + /* Check if there is already a page for this VLAN */ + for (i = max_ports; i < ARRAY_SIZE(priv->vlans); i++) { + if (priv->vlans[i].bridge == bridge && + priv->vlans[i].vid == vlan->vid) { + idx = i; + break; + } + } - /* Check if there is already a page for this VLAN */ - for (i = max_ports; i < ARRAY_SIZE(priv->vlans); i++) { - if (priv->vlans[i].bridge == bridge && - priv->vlans[i].vid == vid) { - idx = i; + /* If this VLAN is not programmed yet, we have to reserve + * one entry in the VLAN table. Make sure we start at the + * next position round. + */ + if (idx == -1) { + /* Look for a free slot */ + for (; pos < ARRAY_SIZE(priv->vlans); pos++) { + if (!priv->vlans[pos].bridge) { + idx = pos; + pos++; break; } } - /* If this VLAN is not programmed yet, we have to reserve - * one entry in the VLAN table. Make sure we start at the - * next position round. - */ - if (idx == -1) { - /* Look for a free slot */ - for (; pos < ARRAY_SIZE(priv->vlans); pos++) { - if (!priv->vlans[pos].bridge) { - idx = pos; - pos++; - break; - } - } - - if (idx == -1) - return -ENOSPC; - } + if (idx == -1) + return -ENOSPC; } return 0; @@ -1195,7 +1190,6 @@ static void gswip_port_vlan_add(struct dsa_switch *ds, int port, struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; - u16 vid; /* We have to receive all packets on the CPU port and should not * do any VLAN filtering here. This is also called with bridge @@ -1205,8 +1199,7 @@ static void gswip_port_vlan_add(struct dsa_switch *ds, int port, if (dsa_is_cpu_port(ds, port)) return; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) - gswip_vlan_add_aware(priv, bridge, port, vid, untagged, pvid); + gswip_vlan_add_aware(priv, bridge, port, vlan->vid, untagged, pvid); } static int gswip_port_vlan_del(struct dsa_switch *ds, int port, @@ -1215,8 +1208,6 @@ static int gswip_port_vlan_del(struct dsa_switch *ds, int port, struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; - u16 vid; - int err; /* We have to receive all packets on the CPU port and should not * do any VLAN filtering here. This is also called with bridge @@ -1226,13 +1217,7 @@ static int gswip_port_vlan_del(struct dsa_switch *ds, int port, if (dsa_is_cpu_port(ds, port)) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - err = gswip_vlan_remove(priv, bridge, port, vid, pvid, true); - if (err) - return err; - } - - return 0; + return gswip_vlan_remove(priv, bridge, port, vlan->vid, pvid, true); } static void gswip_port_fast_age(struct dsa_switch *ds, int port) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index c973db101b72..a4c814f6a4b5 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -801,32 +801,32 @@ static void ksz8795_port_vlan_add(struct dsa_switch *ds, int port, { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; - u16 data, vid, new_pvid = 0; + u16 data, new_pvid = 0; u8 fid, member, valid; ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - ksz8795_r_vlan_table(dev, vid, &data); - ksz8795_from_vlan(data, &fid, &member, &valid); + ksz8795_r_vlan_table(dev, vlan->vid, &data); + ksz8795_from_vlan(data, &fid, &member, &valid); - /* First time to setup the VLAN entry. */ - if (!valid) { - /* Need to find a way to map VID to FID. */ - fid = 1; - valid = 1; - } - member |= BIT(port); + /* First time to setup the VLAN entry. */ + if (!valid) { + /* Need to find a way to map VID to FID. */ + fid = 1; + valid = 1; + } + member |= BIT(port); - ksz8795_to_vlan(fid, member, valid, &data); - ksz8795_w_vlan_table(dev, vid, data); + ksz8795_to_vlan(fid, member, valid, &data); + ksz8795_w_vlan_table(dev, vlan->vid, data); - /* change PVID */ - if (vlan->flags & BRIDGE_VLAN_INFO_PVID) - new_pvid = vid; - } + /* change PVID */ + if (vlan->flags & BRIDGE_VLAN_INFO_PVID) + new_pvid = vlan->vid; if (new_pvid) { + u16 vid; + ksz_pread16(dev, port, REG_PORT_CTRL_VID, &vid); vid &= 0xfff; vid |= new_pvid; @@ -839,7 +839,7 @@ static int ksz8795_port_vlan_del(struct dsa_switch *ds, int port, { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; - u16 data, vid, pvid, new_pvid = 0; + u16 data, pvid, new_pvid = 0; u8 fid, member, valid; ksz_pread16(dev, port, REG_PORT_CTRL_VID, &pvid); @@ -847,24 +847,22 @@ static int ksz8795_port_vlan_del(struct dsa_switch *ds, int port, ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - ksz8795_r_vlan_table(dev, vid, &data); - ksz8795_from_vlan(data, &fid, &member, &valid); + ksz8795_r_vlan_table(dev, vlan->vid, &data); + ksz8795_from_vlan(data, &fid, &member, &valid); - member &= ~BIT(port); + member &= ~BIT(port); - /* Invalidate the entry if no more member. */ - if (!member) { - fid = 0; - valid = 0; - } + /* Invalidate the entry if no more member. */ + if (!member) { + fid = 0; + valid = 0; + } - if (pvid == vid) - new_pvid = 1; + if (pvid == vlan->vid) + new_pvid = 1; - ksz8795_to_vlan(fid, member, valid, &data); - ksz8795_w_vlan_table(dev, vid, data); - } + ksz8795_to_vlan(fid, member, valid, &data); + ksz8795_w_vlan_table(dev, vlan->vid, data); if (new_pvid != pvid) ksz_pwrite16(dev, port, REG_PORT_CTRL_VID, pvid); diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 42e647b67abd..5a6ac0749ab0 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -519,33 +519,30 @@ static void ksz9477_port_vlan_add(struct dsa_switch *ds, int port, { struct ksz_device *dev = ds->priv; u32 vlan_table[3]; - u16 vid; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - if (ksz9477_get_vlan_table(dev, vid, vlan_table)) { - dev_dbg(dev->dev, "Failed to get vlan table\n"); - return; - } - - vlan_table[0] = VLAN_VALID | (vid & VLAN_FID_M); - if (untagged) - vlan_table[1] |= BIT(port); - else - vlan_table[1] &= ~BIT(port); - vlan_table[1] &= ~(BIT(dev->cpu_port)); + if (ksz9477_get_vlan_table(dev, vlan->vid, vlan_table)) { + dev_dbg(dev->dev, "Failed to get vlan table\n"); + return; + } - vlan_table[2] |= BIT(port) | BIT(dev->cpu_port); + vlan_table[0] = VLAN_VALID | (vlan->vid & VLAN_FID_M); + if (untagged) + vlan_table[1] |= BIT(port); + else + vlan_table[1] &= ~BIT(port); + vlan_table[1] &= ~(BIT(dev->cpu_port)); - if (ksz9477_set_vlan_table(dev, vid, vlan_table)) { - dev_dbg(dev->dev, "Failed to set vlan table\n"); - return; - } + vlan_table[2] |= BIT(port) | BIT(dev->cpu_port); - /* change PVID */ - if (vlan->flags & BRIDGE_VLAN_INFO_PVID) - ksz_pwrite16(dev, port, REG_PORT_DEFAULT_VID, vid); + if (ksz9477_set_vlan_table(dev, vlan->vid, vlan_table)) { + dev_dbg(dev->dev, "Failed to set vlan table\n"); + return; } + + /* change PVID */ + if (vlan->flags & BRIDGE_VLAN_INFO_PVID) + ksz_pwrite16(dev, port, REG_PORT_DEFAULT_VID, vlan->vid); } static int ksz9477_port_vlan_del(struct dsa_switch *ds, int port, @@ -554,30 +551,27 @@ static int ksz9477_port_vlan_del(struct dsa_switch *ds, int port, struct ksz_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; u32 vlan_table[3]; - u16 vid; u16 pvid; ksz_pread16(dev, port, REG_PORT_DEFAULT_VID, &pvid); pvid = pvid & 0xFFF; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - if (ksz9477_get_vlan_table(dev, vid, vlan_table)) { - dev_dbg(dev->dev, "Failed to get vlan table\n"); - return -ETIMEDOUT; - } + if (ksz9477_get_vlan_table(dev, vlan->vid, vlan_table)) { + dev_dbg(dev->dev, "Failed to get vlan table\n"); + return -ETIMEDOUT; + } - vlan_table[2] &= ~BIT(port); + vlan_table[2] &= ~BIT(port); - if (pvid == vid) - pvid = 1; + if (pvid == vlan->vid) + pvid = 1; - if (untagged) - vlan_table[1] &= ~BIT(port); + if (untagged) + vlan_table[1] &= ~BIT(port); - if (ksz9477_set_vlan_table(dev, vid, vlan_table)) { - dev_dbg(dev->dev, "Failed to set vlan table\n"); - return -ETIMEDOUT; - } + if (ksz9477_set_vlan_table(dev, vlan->vid, vlan_table)) { + dev_dbg(dev->dev, "Failed to set vlan table\n"); + return -ETIMEDOUT; } ksz_pwrite16(dev, port, REG_PORT_DEFAULT_VID, pvid); diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index a67cac15a724..31d2d23bc815 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1501,20 +1501,16 @@ mt7530_port_vlan_add(struct dsa_switch *ds, int port, bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct mt7530_hw_vlan_entry new_entry; struct mt7530_priv *priv = ds->priv; - u16 vid; mutex_lock(&priv->reg_mutex); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - mt7530_hw_vlan_entry_init(&new_entry, port, untagged); - mt7530_hw_vlan_update(priv, vid, &new_entry, - mt7530_hw_vlan_add); - } + mt7530_hw_vlan_entry_init(&new_entry, port, untagged); + mt7530_hw_vlan_update(priv, vlan->vid, &new_entry, mt7530_hw_vlan_add); if (pvid) { mt7530_rmw(priv, MT7530_PPBV1_P(port), G0_PORT_VID_MASK, - G0_PORT_VID(vlan->vid_end)); - priv->ports[port].pvid = vlan->vid_end; + G0_PORT_VID(vlan->vid)); + priv->ports[port].pvid = vlan->vid; } mutex_unlock(&priv->reg_mutex); @@ -1526,22 +1522,20 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, { struct mt7530_hw_vlan_entry target_entry; struct mt7530_priv *priv = ds->priv; - u16 vid, pvid; + u16 pvid; mutex_lock(&priv->reg_mutex); pvid = priv->ports[port].pvid; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - mt7530_hw_vlan_entry_init(&target_entry, port, 0); - mt7530_hw_vlan_update(priv, vid, &target_entry, - mt7530_hw_vlan_del); + mt7530_hw_vlan_entry_init(&target_entry, port, 0); + mt7530_hw_vlan_update(priv, vlan->vid, &target_entry, + mt7530_hw_vlan_del); - /* PVID is being restored to the default whenever the PVID port - * is being removed from the VLAN. - */ - if (pvid == vid) - pvid = G0_PORT_VID_DEF; - } + /* PVID is being restored to the default whenever the PVID port + * is being removed from the VLAN. + */ + if (pvid == vlan->vid) + pvid = G0_PORT_VID_DEF; mt7530_rmw(priv, MT7530_PPBV1_P(port), G0_PORT_VID_MASK, pvid); priv->ports[port].pvid = pvid; diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index eafe6bedc692..4834be9e4e86 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1529,7 +1529,7 @@ static int mv88e6xxx_atu_new(struct mv88e6xxx_chip *chip, u16 *fid) } static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, - u16 vid_begin, u16 vid_end) + u16 vid) { struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_vtu_entry vlan; @@ -1539,47 +1539,45 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) return 0; - if (!vid_begin) + if (!vid) return -EOPNOTSUPP; - vlan.vid = vid_begin - 1; + vlan.vid = vid - 1; vlan.valid = false; - do { - err = mv88e6xxx_vtu_getnext(chip, &vlan); - if (err) - return err; + err = mv88e6xxx_vtu_getnext(chip, &vlan); + if (err) + return err; - if (!vlan.valid) - break; + if (!vlan.valid) + return 0; - if (vlan.vid > vid_end) - break; + if (vlan.vid != vid) + return 0; - for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) { - if (dsa_is_dsa_port(ds, i) || dsa_is_cpu_port(ds, i)) - continue; + for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) { + if (dsa_is_dsa_port(ds, i) || dsa_is_cpu_port(ds, i)) + continue; - if (!dsa_to_port(ds, i)->slave) - continue; + if (!dsa_to_port(ds, i)->slave) + continue; - if (vlan.member[i] == - MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_NON_MEMBER) - continue; + if (vlan.member[i] == + MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_NON_MEMBER) + continue; - if (dsa_to_port(ds, i)->bridge_dev == - dsa_to_port(ds, port)->bridge_dev) - break; /* same bridge, check next VLAN */ + if (dsa_to_port(ds, i)->bridge_dev == + dsa_to_port(ds, port)->bridge_dev) + break; /* same bridge, check next VLAN */ - if (!dsa_to_port(ds, i)->bridge_dev) - continue; + if (!dsa_to_port(ds, i)->bridge_dev) + continue; - dev_err(ds->dev, "p%d: hw VLAN %d already used by port %d in %s\n", - port, vlan.vid, i, - netdev_name(dsa_to_port(ds, i)->bridge_dev)); - return -EOPNOTSUPP; - } - } while (vlan.vid < vid_end); + dev_err(ds->dev, "p%d: hw VLAN %d already used by port %d in %s\n", + port, vlan.vid, i, + netdev_name(dsa_to_port(ds, i)->bridge_dev)); + return -EOPNOTSUPP; + } return 0; } @@ -1617,8 +1615,7 @@ mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port, * members, do not support it (yet) and fallback to software VLAN. */ mv88e6xxx_reg_lock(chip); - err = mv88e6xxx_port_check_hw_vlan(ds, port, vlan->vid_begin, - vlan->vid_end); + err = mv88e6xxx_port_check_hw_vlan(ds, port, vlan->vid); mv88e6xxx_reg_unlock(chip); /* We don't need any dynamic resource from the kernel (yet), @@ -1978,7 +1975,6 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; bool warn; u8 member; - u16 vid; if (!mv88e6xxx_max_vid(chip)) return; @@ -1997,14 +1993,13 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, mv88e6xxx_reg_lock(chip); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) - if (mv88e6xxx_port_vlan_join(chip, port, vid, member, warn)) - dev_err(ds->dev, "p%d: failed to add VLAN %d%c\n", port, - vid, untagged ? 'u' : 't'); + if (mv88e6xxx_port_vlan_join(chip, port, vlan->vid, member, warn)) + dev_err(ds->dev, "p%d: failed to add VLAN %d%c\n", port, + vlan->vid, untagged ? 'u' : 't'); - if (pvid && mv88e6xxx_port_set_pvid(chip, port, vlan->vid_end)) + if (pvid && mv88e6xxx_port_set_pvid(chip, port, vlan->vid)) dev_err(ds->dev, "p%d: failed to set PVID %d\n", port, - vlan->vid_end); + vlan->vid); mv88e6xxx_reg_unlock(chip); } @@ -2055,8 +2050,8 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct mv88e6xxx_chip *chip = ds->priv; - u16 pvid, vid; int err = 0; + u16 pvid; if (!mv88e6xxx_max_vid(chip)) return -EOPNOTSUPP; @@ -2067,16 +2062,14 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, if (err) goto unlock; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - err = mv88e6xxx_port_vlan_leave(chip, port, vid); + err = mv88e6xxx_port_vlan_leave(chip, port, vlan->vid); + if (err) + goto unlock; + + if (vlan->vid == pvid) { + err = mv88e6xxx_port_set_pvid(chip, port, 0); if (err) goto unlock; - - if (vid == pvid) { - err = mv88e6xxx_port_set_pvid(chip, port, 0); - if (err) - goto unlock; - } } unlock: diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 90c3c76f21b2..216fcfbc5daf 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -116,8 +116,7 @@ static int felix_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct ocelot *ocelot = ds->priv; - u16 vid, flags = vlan->flags; - int err; + u16 flags = vlan->flags; /* Ocelot switches copy frames as-is to the CPU, so the flags: * egress-untagged or not, pvid or not, make no difference. This @@ -130,15 +129,9 @@ static int felix_vlan_prepare(struct dsa_switch *ds, int port, if (port == ocelot->npi) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - err = ocelot_vlan_prepare(ocelot, port, vid, - flags & BRIDGE_VLAN_INFO_PVID, - flags & BRIDGE_VLAN_INFO_UNTAGGED); - if (err) - return err; - } - - return 0; + return ocelot_vlan_prepare(ocelot, port, vlan->vid, + flags & BRIDGE_VLAN_INFO_PVID, + flags & BRIDGE_VLAN_INFO_UNTAGGED); } static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, @@ -154,37 +147,18 @@ static void felix_vlan_add(struct dsa_switch *ds, int port, { struct ocelot *ocelot = ds->priv; u16 flags = vlan->flags; - u16 vid; - int err; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - err = ocelot_vlan_add(ocelot, port, vid, - flags & BRIDGE_VLAN_INFO_PVID, - flags & BRIDGE_VLAN_INFO_UNTAGGED); - if (err) { - dev_err(ds->dev, "Failed to add VLAN %d to port %d: %d\n", - vid, port, err); - return; - } - } + ocelot_vlan_add(ocelot, port, vlan->vid, + flags & BRIDGE_VLAN_INFO_PVID, + flags & BRIDGE_VLAN_INFO_UNTAGGED); } static int felix_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct ocelot *ocelot = ds->priv; - u16 vid; - int err; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - err = ocelot_vlan_del(ocelot, port, vid); - if (err) { - dev_err(ds->dev, "Failed to remove VLAN %d from port %d: %d\n", - vid, port, err); - return err; - } - } - return 0; + return ocelot_vlan_del(ocelot, port, vlan->vid); } static int felix_port_enable(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 5bdac669a339..df99c696b688 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1330,11 +1330,8 @@ qca8k_port_vlan_add(struct dsa_switch *ds, int port, bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct qca8k_priv *priv = ds->priv; int ret = 0; - u16 vid; - - for (vid = vlan->vid_begin; vid <= vlan->vid_end && !ret; ++vid) - ret = qca8k_vlan_add(priv, port, vid, untagged); + ret = qca8k_vlan_add(priv, port, vlan->vid, untagged); if (ret) dev_err(priv->dev, "Failed to add VLAN to port %d (%d)", port, ret); @@ -1342,11 +1339,10 @@ qca8k_port_vlan_add(struct dsa_switch *ds, int port, int shift = 16 * (port % 2); qca8k_rmw(priv, QCA8K_EGRESS_VLAN(port), - 0xfff << shift, - vlan->vid_end << shift); + 0xfff << shift, vlan->vid << shift); qca8k_write(priv, QCA8K_REG_PORT_VLAN_CTRL0(port), - QCA8K_PORT_VLAN_CVID(vlan->vid_end) | - QCA8K_PORT_VLAN_SVID(vlan->vid_end)); + QCA8K_PORT_VLAN_CVID(vlan->vid) | + QCA8K_PORT_VLAN_SVID(vlan->vid)); } } @@ -1356,11 +1352,8 @@ qca8k_port_vlan_del(struct dsa_switch *ds, int port, { struct qca8k_priv *priv = ds->priv; int ret = 0; - u16 vid; - - for (vid = vlan->vid_begin; vid <= vlan->vid_end && !ret; ++vid) - ret = qca8k_vlan_del(priv, port, vid); + ret = qca8k_vlan_del(priv, port, vlan->vid); if (ret) dev_err(priv->dev, "Failed to delete VLAN from port %d (%d)", port, ret); diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index 83d481ef9273..1a8f93112bc6 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -383,14 +383,11 @@ int rtl8366_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct realtek_smi *smi = ds->priv; - u16 vid; - for (vid = vlan->vid_begin; vid < vlan->vid_end; vid++) - if (!smi->ops->is_vlan_valid(smi, vid)) - return -EINVAL; + if (!smi->ops->is_vlan_valid(smi, vlan->vid)) + return -EINVAL; - dev_info(smi->dev, "prepare VLANs %04x..%04x\n", - vlan->vid_begin, vlan->vid_end); + dev_info(smi->dev, "prepare VLAN %04x\n", vlan->vid); /* Enable VLAN in the hardware * FIXME: what's with this 4k business? @@ -408,47 +405,38 @@ void rtl8366_vlan_add(struct dsa_switch *ds, int port, struct realtek_smi *smi = ds->priv; u32 member = 0; u32 untag = 0; - u16 vid; int ret; - for (vid = vlan->vid_begin; vid < vlan->vid_end; vid++) - if (!smi->ops->is_vlan_valid(smi, vid)) - return; + if (!smi->ops->is_vlan_valid(smi, vlan->vid)) + return; dev_info(smi->dev, "add VLAN %d on port %d, %s, %s\n", - vlan->vid_begin, - port, - untagged ? "untagged" : "tagged", + vlan->vid, port, untagged ? "untagged" : "tagged", pvid ? " PVID" : "no PVID"); if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) dev_err(smi->dev, "port is DSA or CPU port\n"); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - member |= BIT(port); + member |= BIT(port); - if (untagged) - untag |= BIT(port); + if (untagged) + untag |= BIT(port); - ret = rtl8366_set_vlan(smi, vid, member, untag, 0); - if (ret) - dev_err(smi->dev, - "failed to set up VLAN %04x", - vid); + ret = rtl8366_set_vlan(smi, vlan->vid, member, untag, 0); + if (ret) + dev_err(smi->dev, "failed to set up VLAN %04x", vlan->vid); - if (!pvid) - continue; + if (!pvid) + return; - ret = rtl8366_set_pvid(smi, port, vid); - if (ret) - dev_err(smi->dev, - "failed to set PVID on port %d to VLAN %04x", - port, vid); + ret = rtl8366_set_pvid(smi, port, vlan->vid); + if (ret) + dev_err(smi->dev, "failed to set PVID on port %d to VLAN %04x", + port, vlan->vid); - if (!ret) - dev_dbg(smi->dev, "VLAN add: added VLAN %d with PVID on port %d\n", - vid, port); - } + if (!ret) + dev_dbg(smi->dev, "VLAN add: added VLAN %d with PVID on port %d\n", + vlan->vid, port); } EXPORT_SYMBOL_GPL(rtl8366_vlan_add); @@ -456,46 +444,39 @@ int rtl8366_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct realtek_smi *smi = ds->priv; - u16 vid; - int ret; + int ret, i; - dev_info(smi->dev, "del VLAN on port %d\n", port); + dev_info(smi->dev, "del VLAN %04x on port %d\n", vlan->vid, port); - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - int i; - - dev_info(smi->dev, "del VLAN %04x\n", vid); + for (i = 0; i < smi->num_vlan_mc; i++) { + struct rtl8366_vlan_mc vlanmc; - for (i = 0; i < smi->num_vlan_mc; i++) { - struct rtl8366_vlan_mc vlanmc; + ret = smi->ops->get_vlan_mc(smi, i, &vlanmc); + if (ret) + return ret; - ret = smi->ops->get_vlan_mc(smi, i, &vlanmc); - if (ret) + if (vlan->vid == vlanmc.vid) { + /* Remove this port from the VLAN */ + vlanmc.member &= ~BIT(port); + vlanmc.untag &= ~BIT(port); + /* + * If no ports are members of this VLAN + * anymore then clear the whole member + * config so it can be reused. + */ + if (!vlanmc.member && vlanmc.untag) { + vlanmc.vid = 0; + vlanmc.priority = 0; + vlanmc.fid = 0; + } + ret = smi->ops->set_vlan_mc(smi, i, &vlanmc); + if (ret) { + dev_err(smi->dev, + "failed to remove VLAN %04x\n", + vlan->vid); return ret; - - if (vid == vlanmc.vid) { - /* Remove this port from the VLAN */ - vlanmc.member &= ~BIT(port); - vlanmc.untag &= ~BIT(port); - /* - * If no ports are members of this VLAN - * anymore then clear the whole member - * config so it can be reused. - */ - if (!vlanmc.member && vlanmc.untag) { - vlanmc.vid = 0; - vlanmc.priority = 0; - vlanmc.fid = 0; - } - ret = smi->ops->set_vlan_mc(smi, i, &vlanmc); - if (ret) { - dev_err(smi->dev, - "failed to remove VLAN %04x\n", - vid); - return ret; - } - break; } + break; } } diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 59e00d55780b..807e65ac2518 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2611,7 +2611,6 @@ static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; - u16 vid; if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) return 0; @@ -2620,11 +2619,9 @@ static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, * bridge plus tagging), be sure to at least deny alterations to the * configuration done by dsa_8021q. */ - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - if (vid_is_dsa_8021q(vid)) { - dev_err(ds->dev, "Range 1024-3071 reserved for dsa_8021q operation\n"); - return -EBUSY; - } + if (vid_is_dsa_8021q(vlan->vid)) { + dev_err(ds->dev, "Range 1024-3071 reserved for dsa_8021q operation\n"); + return -EBUSY; } return 0; @@ -2799,17 +2796,14 @@ static void sja1105_vlan_add(struct dsa_switch *ds, int port, { struct sja1105_private *priv = ds->priv; bool vlan_table_changed = false; - u16 vid; int rc; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_add_one(ds, port, vid, vlan->flags, - &priv->bridge_vlans); - if (rc < 0) - return; - if (rc > 0) - vlan_table_changed = true; - } + rc = sja1105_vlan_add_one(ds, port, vlan->vid, vlan->flags, + &priv->bridge_vlans); + if (rc < 0) + return; + if (rc > 0) + vlan_table_changed = true; if (!vlan_table_changed) return; @@ -2824,14 +2818,11 @@ static int sja1105_vlan_del(struct dsa_switch *ds, int port, { struct sja1105_private *priv = ds->priv; bool vlan_table_changed = false; - u16 vid; int rc; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_del_one(ds, port, vid, &priv->bridge_vlans); - if (rc > 0) - vlan_table_changed = true; - } + rc = sja1105_vlan_del_one(ds, port, vlan->vid, &priv->bridge_vlans); + if (rc > 0) + vlan_table_changed = true; if (!vlan_table_changed) return 0; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 7d83e1f91ef1..c87667c1cca0 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -1045,17 +1045,9 @@ static int prestera_port_vlans_add(struct prestera_port *port, if (!bridge->vlan_enabled) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - int err; - - err = prestera_bridge_port_vlan_add(port, br_port, - vid, flag_untagged, - flag_pvid, extack); - if (err) - return err; - } - - return 0; + return prestera_bridge_port_vlan_add(port, br_port, + vid, flag_untagged, + flag_pvid, extack); } static int prestera_port_obj_add(struct net_device *dev, @@ -1081,7 +1073,6 @@ static int prestera_port_vlans_del(struct prestera_port *port, struct net_device *dev = vlan->obj.orig_dev; struct prestera_bridge_port *br_port; struct prestera_switch *sw = port->sw; - u16 vid; if (netif_is_bridge_master(dev)) return -EOPNOTSUPP; @@ -1093,8 +1084,7 @@ static int prestera_port_vlans_del(struct prestera_port *port, if (!br_port->bridge->vlan_enabled) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) - prestera_bridge_port_vlan_del(port, br_port, vid); + prestera_bridge_port_vlan_del(port, br_port, vlan->vid); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index cea42f6ed89b..7039cff69680 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1211,23 +1211,20 @@ mlxsw_sp_br_ban_rif_pvid_change(struct mlxsw_sp *mlxsw_sp, const struct switchdev_obj_port_vlan *vlan) { u16 pvid; - u16 vid; pvid = mlxsw_sp_rif_vid(mlxsw_sp, br_dev); if (!pvid) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { - if (vlan->flags & BRIDGE_VLAN_INFO_PVID) { - if (vid != pvid) { - netdev_err(br_dev, "Can't change PVID, it's used by router interface\n"); - return -EBUSY; - } - } else { - if (vid == pvid) { - netdev_err(br_dev, "Can't remove PVID, it's used by router interface\n"); - return -EBUSY; - } + if (vlan->flags & BRIDGE_VLAN_INFO_PVID) { + if (vlan->vid != pvid) { + netdev_err(br_dev, "Can't change PVID, it's used by router interface\n"); + return -EBUSY; + } + } else { + if (vlan->vid == pvid) { + netdev_err(br_dev, "Can't remove PVID, it's used by router interface\n"); + return -EBUSY; } } @@ -1244,7 +1241,6 @@ static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct net_device *orig_dev = vlan->obj.orig_dev; struct mlxsw_sp_bridge_port *bridge_port; - u16 vid; if (netif_is_bridge_master(orig_dev)) { int err = 0; @@ -1269,17 +1265,9 @@ static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port, if (!bridge_port->bridge_device->vlan_enabled) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - int err; - - err = mlxsw_sp_bridge_port_vlan_add(mlxsw_sp_port, bridge_port, - vid, flag_untagged, - flag_pvid, extack); - if (err) - return err; - } - - return 0; + return mlxsw_sp_bridge_port_vlan_add(mlxsw_sp_port, bridge_port, + vlan->vid, flag_untagged, + flag_pvid, extack); } static enum mlxsw_reg_sfdf_flush_type mlxsw_sp_fdb_flush_type(bool lagged) @@ -1873,7 +1861,6 @@ static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct net_device *orig_dev = vlan->obj.orig_dev; struct mlxsw_sp_bridge_port *bridge_port; - u16 vid; if (netif_is_bridge_master(orig_dev)) return -EOPNOTSUPP; @@ -1885,8 +1872,7 @@ static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port, if (!bridge_port->bridge_device->vlan_enabled) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) - mlxsw_sp_bridge_port_vlan_del(mlxsw_sp_port, bridge_port, vid); + mlxsw_sp_bridge_port_vlan_del(mlxsw_sp_port, bridge_port, vlan->vid); return 0; } @@ -3411,7 +3397,6 @@ mlxsw_sp_switchdev_vxlan_vlans_add(struct net_device *vxlan_dev, struct netlink_ext_ack *extack; struct mlxsw_sp *mlxsw_sp; struct net_device *br_dev; - u16 vid; extack = switchdev_notifier_info_to_extack(&port_obj_info->info); br_dev = netdev_master_upper_dev_get(vxlan_dev); @@ -3434,18 +3419,10 @@ mlxsw_sp_switchdev_vxlan_vlans_add(struct net_device *vxlan_dev, if (!bridge_device->vlan_enabled) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - int err; - - err = mlxsw_sp_switchdev_vxlan_vlan_add(mlxsw_sp, bridge_device, - vxlan_dev, vid, - flag_untagged, - flag_pvid, extack); - if (err) - return err; - } - - return 0; + return mlxsw_sp_switchdev_vxlan_vlan_add(mlxsw_sp, bridge_device, + vxlan_dev, vlan->vid, + flag_untagged, + flag_pvid, extack); } static void @@ -3458,7 +3435,6 @@ mlxsw_sp_switchdev_vxlan_vlans_del(struct net_device *vxlan_dev, struct mlxsw_sp_bridge_device *bridge_device; struct mlxsw_sp *mlxsw_sp; struct net_device *br_dev; - u16 vid; br_dev = netdev_master_upper_dev_get(vxlan_dev); if (!br_dev) @@ -3477,9 +3453,8 @@ mlxsw_sp_switchdev_vxlan_vlans_del(struct net_device *vxlan_dev, if (!bridge_device->vlan_enabled) return; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) - mlxsw_sp_switchdev_vxlan_vlan_del(mlxsw_sp, bridge_device, - vxlan_dev, vid); + mlxsw_sp_switchdev_vxlan_vlan_del(mlxsw_sp, bridge_device, vxlan_dev, + vlan->vid); } static int diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 2bd2840d88bd..3b8718b143bb 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -893,39 +893,16 @@ static int ocelot_port_obj_add_vlan(struct net_device *dev, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) { + bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; int ret; - u16 vid; - - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; - bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; - - if (switchdev_trans_ph_prepare(trans)) - ret = ocelot_vlan_vid_prepare(dev, vid, pvid, - untagged); - else - ret = ocelot_vlan_vid_add(dev, vid, pvid, untagged); - if (ret) - return ret; - } - - return 0; -} - -static int ocelot_port_vlan_del_vlan(struct net_device *dev, - const struct switchdev_obj_port_vlan *vlan) -{ - int ret; - u16 vid; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - ret = ocelot_vlan_vid_del(dev, vid); - - if (ret) - return ret; - } + if (switchdev_trans_ph_prepare(trans)) + ret = ocelot_vlan_vid_prepare(dev, vlan->vid, pvid, untagged); + else + ret = ocelot_vlan_vid_add(dev, vlan->vid, pvid, untagged); - return 0; + return ret; } static int ocelot_port_obj_add_mdb(struct net_device *dev, @@ -985,8 +962,8 @@ static int ocelot_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: - ret = ocelot_port_vlan_del_vlan(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj)); + ret = ocelot_vlan_vid_del(dev, + SWITCHDEV_OBJ_PORT_VLAN(obj)->vid); break; case SWITCHDEV_OBJ_ID_PORT_MDB: ret = ocelot_port_obj_del_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 7072b249c8bd..d9d5188b7627 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2540,32 +2540,16 @@ static int ofdpa_port_obj_vlan_add(struct rocker_port *rocker_port, const struct switchdev_obj_port_vlan *vlan) { struct ofdpa_port *ofdpa_port = rocker_port->wpriv; - u16 vid; - int err; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - err = ofdpa_port_vlan_add(ofdpa_port, vid, vlan->flags); - if (err) - return err; - } - - return 0; + return ofdpa_port_vlan_add(ofdpa_port, vlan->vid, vlan->flags); } static int ofdpa_port_obj_vlan_del(struct rocker_port *rocker_port, const struct switchdev_obj_port_vlan *vlan) { struct ofdpa_port *ofdpa_port = rocker_port->wpriv; - u16 vid; - int err; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - err = ofdpa_port_vlan_del(ofdpa_port, vid, vlan->flags); - if (err) - return err; - } - - return 0; + return ofdpa_port_vlan_del(ofdpa_port, vlan->vid, vlan->flags); } static int ofdpa_port_obj_fdb_add(struct rocker_port *rocker_port, diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index 29747da5c514..8a36228acc5d 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -260,10 +260,9 @@ static int cpsw_port_vlans_add(struct cpsw_priv *priv, struct net_device *orig_dev = vlan->obj.orig_dev; bool cpu_port = netif_is_bridge_master(orig_dev); bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; - u16 vid; dev_dbg(priv->dev, "VID add: %s: vid:%u flags:%X\n", - priv->ndev->name, vlan->vid_begin, vlan->flags); + priv->ndev->name, vlan->vid, vlan->flags); if (cpu_port && !(vlan->flags & BRIDGE_VLAN_INFO_BRENTRY)) return 0; @@ -271,33 +270,7 @@ static int cpsw_port_vlans_add(struct cpsw_priv *priv, if (switchdev_trans_ph_prepare(trans)) return 0; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - int err; - - err = cpsw_port_vlan_add(priv, untag, pvid, vid, orig_dev); - if (err) - return err; - } - - return 0; -} - -static int cpsw_port_vlans_del(struct cpsw_priv *priv, - const struct switchdev_obj_port_vlan *vlan) - -{ - struct net_device *orig_dev = vlan->obj.orig_dev; - u16 vid; - - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - int err; - - err = cpsw_port_vlan_del(priv, vid, orig_dev); - if (err) - return err; - } - - return 0; + return cpsw_port_vlan_add(priv, untag, pvid, vlan->vid, orig_dev); } static int cpsw_port_mdb_add(struct cpsw_priv *priv, @@ -392,7 +365,7 @@ static int cpsw_port_obj_del(struct net_device *ndev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = cpsw_port_vlans_del(priv, vlan); + err = cpsw_port_vlan_del(priv, vlan->vid, vlan->obj.orig_dev); break; case SWITCHDEV_OBJ_ID_PORT_MDB: case SWITCHDEV_OBJ_ID_HOST_MDB: diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index d524e92051a3..62edb8d01f4e 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -981,19 +981,14 @@ static int dpaa2_switch_port_vlans_add(struct net_device *netdev, struct ethsw_port_priv *port_priv = netdev_priv(netdev); struct ethsw_core *ethsw = port_priv->ethsw_data; struct dpsw_attr *attr = ðsw->sw_attr; - int vid, err = 0, new_vlans = 0; + int err = 0; if (switchdev_trans_ph_prepare(trans)) { - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - if (!port_priv->ethsw_data->vlans[vid]) - new_vlans++; - - /* Make sure that the VLAN is not already configured - * on the switch port - */ - if (port_priv->vlans[vid] & ETHSW_VLAN_MEMBER) - return -EEXIST; - } + /* Make sure that the VLAN is not already configured + * on the switch port + */ + if (port_priv->vlans[vlan->vid] & ETHSW_VLAN_MEMBER) + return -EEXIST; /* Check if there is space for a new VLAN */ err = dpsw_get_attributes(ethsw->mc_io, 0, ethsw->dpsw_handle, @@ -1002,27 +997,22 @@ static int dpaa2_switch_port_vlans_add(struct net_device *netdev, netdev_err(netdev, "dpsw_get_attributes err %d\n", err); return err; } - if (attr->max_vlans - attr->num_vlans < new_vlans) + if (attr->max_vlans - attr->num_vlans < 1) return -ENOSPC; return 0; } - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - if (!port_priv->ethsw_data->vlans[vid]) { - /* this is a new VLAN */ - err = dpaa2_switch_add_vlan(port_priv->ethsw_data, vid); - if (err) - return err; - - port_priv->ethsw_data->vlans[vid] |= ETHSW_VLAN_GLOBAL; - } - err = dpaa2_switch_port_add_vlan(port_priv, vid, vlan->flags); + if (!port_priv->ethsw_data->vlans[vlan->vid]) { + /* this is a new VLAN */ + err = dpaa2_switch_add_vlan(port_priv->ethsw_data, vlan->vid); if (err) - break; + return err; + + port_priv->ethsw_data->vlans[vlan->vid] |= ETHSW_VLAN_GLOBAL; } - return err; + return dpaa2_switch_port_add_vlan(port_priv, vlan->vid, vlan->flags); } static int dpaa2_switch_port_lookup_address(struct net_device *netdev, int is_uc, @@ -1155,18 +1145,11 @@ static int dpaa2_switch_port_vlans_del(struct net_device *netdev, const struct switchdev_obj_port_vlan *vlan) { struct ethsw_port_priv *port_priv = netdev_priv(netdev); - int vid, err = 0; if (netif_is_bridge_master(vlan->obj.orig_dev)) return -EOPNOTSUPP; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - err = dpaa2_switch_port_del_vlan(port_priv, vid); - if (err) - break; - } - - return err; + return dpaa2_switch_port_del_vlan(port_priv, vlan->vid); } static int dpaa2_switch_port_mdb_del(struct net_device *netdev, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 99cd538d6519..bac7d3ba574f 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -97,8 +97,7 @@ struct switchdev_obj { struct switchdev_obj_port_vlan { struct switchdev_obj obj; u16 flags; - u16 vid_begin; - u16 vid_end; + u16 vid; }; #define SWITCHDEV_OBJ_PORT_VLAN(OBJ) \ diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 015209bf44aa..a9c23ef83443 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -153,8 +153,7 @@ int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, - .vid_begin = vid, - .vid_end = vid, + .vid = vid, }; return switchdev_port_obj_add(dev, &v.obj, extack); @@ -165,8 +164,7 @@ int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, + .vid = vid, }; return switchdev_port_obj_del(dev, &v.obj); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f8b6a69b6873..653d64f4a637 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -318,7 +318,7 @@ dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave, continue; vid = vlan_dev_vlan_id(upper_dev); - if (vid >= vlan->vid_begin && vid <= vlan->vid_end) + if (vid == vlan->vid) return -EBUSY; } @@ -332,7 +332,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan; - int vid, err; + int err; if (obj->orig_dev != dev) return -EOPNOTSUPP; @@ -367,13 +367,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, if (err) return err; - for (vid = vlan.vid_begin; vid <= vlan.vid_end; vid++) { - err = vlan_vid_add(master, htons(ETH_P_8021Q), vid); - if (err) - return err; - } - - return 0; + return vlan_vid_add(master, htons(ETH_P_8021Q), vlan.vid); } static int dsa_slave_port_obj_add(struct net_device *dev, @@ -419,7 +413,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan *vlan; - int vid, err; + int err; if (obj->orig_dev != dev) return -EOPNOTSUPP; @@ -436,8 +430,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, if (err) return err; - for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) - vlan_vid_del(master, htons(ETH_P_8021Q), vid); + vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid); return 0; } @@ -1289,8 +1282,7 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, + .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; @@ -1328,8 +1320,7 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); struct switchdev_obj_port_vlan vlan = { - .vid_begin = vid, - .vid_end = vid, + .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; -- cgit v1.2.3 From ffb68fc58e9640762be891f9aebe4f5aac615ab3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 9 Jan 2021 02:01:48 +0200 Subject: net: switchdev: remove the transaction structure from port object notifiers Since the introduction of the switchdev API, port objects were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port object notifier structures, and converts drivers to not look at this member. Where driver conversion is trivial (like in the case of the Marvell Prestera driver, NXP DPAA2 switch, TI CPSW, and Rocker drivers), it is done in this patch. Where driver conversion needs more attention (DSA, Mellanox Spectrum), the conversion is left for subsequent patches and here we only fake the prepare/commit phases at a lower level, just not in the switchdev notifier itself. Where the code has a natural structure that is best left alone as a preparation and a commit phase (as in the case of the Ocelot switch), that structure is left in place, just made to not depend upon the switchdev transactional model. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Acked-by: Linus Walleij Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/prestera/prestera_switchdev.c | 7 +-- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 52 +++++++++++-------- drivers/net/ethernet/mscc/ocelot_net.c | 25 +++------ drivers/net/ethernet/rocker/rocker_main.c | 15 ++---- drivers/net/ethernet/ti/cpsw_switchdev.c | 17 ++----- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 59 ++++++++++------------ include/net/switchdev.h | 3 -- net/dsa/dsa_priv.h | 8 +-- net/dsa/port.c | 8 +-- net/dsa/slave.c | 34 ++++--------- net/dsa/switch.c | 36 ++----------- net/switchdev/switchdev.c | 42 ++------------- 12 files changed, 98 insertions(+), 208 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index c87667c1cca0..3235458a5501 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -1020,7 +1020,6 @@ prestera_bridge_port_vlan_del(struct prestera_port *port, static int prestera_port_vlans_add(struct prestera_port *port, const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { bool flag_untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; @@ -1034,9 +1033,6 @@ static int prestera_port_vlans_add(struct prestera_port *port, if (netif_is_bridge_master(dev)) return 0; - if (switchdev_trans_ph_commit(trans)) - return 0; - br_port = prestera_bridge_port_by_dev(sw->swdev, dev); if (WARN_ON(!br_port)) return -EINVAL; @@ -1052,7 +1048,6 @@ static int prestera_port_vlans_add(struct prestera_port *port, static int prestera_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { struct prestera_port *port = netdev_priv(dev); @@ -1061,7 +1056,7 @@ static int prestera_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); - return prestera_port_vlans_add(port, vlan, trans, extack); + return prestera_port_vlans_add(port, vlan, extack); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 7039cff69680..eff5920aed06 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1704,8 +1704,7 @@ static int mlxsw_sp_port_remove_from_mid(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) + const struct switchdev_obj_port_mdb *mdb) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct net_device *orig_dev = mdb->obj.orig_dev; @@ -1717,9 +1716,6 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, u16 fid_index; int err = 0; - if (switchdev_trans_ph_commit(trans)) - return 0; - bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); if (!bridge_port) return 0; @@ -1801,32 +1797,38 @@ mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); const struct switchdev_obj_port_vlan *vlan; + struct switchdev_trans trans; int err = 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); - err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, trans, + + trans.ph_prepare = true; + err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, &trans, extack); - if (switchdev_trans_ph_prepare(trans)) { - /* The event is emitted before the changes are actually - * applied to the bridge. Therefore schedule the respin - * call for later, so that the respin logic sees the - * updated bridge state. - */ - mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp); - } + /* The event is emitted before the changes are actually + * applied to the bridge. Therefore schedule the respin + * call for later, so that the respin logic sees the + * updated bridge state. + */ + mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp); + + if (err) + break; + + trans.ph_prepare = false; + err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, &trans, + extack); break; case SWITCHDEV_OBJ_ID_PORT_MDB: err = mlxsw_sp_port_mdb_add(mlxsw_sp_port, - SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + SWITCHDEV_OBJ_PORT_MDB(obj)); break; default: err = -EOPNOTSUPP; @@ -3386,13 +3388,13 @@ out: static int mlxsw_sp_switchdev_vxlan_vlans_add(struct net_device *vxlan_dev, struct switchdev_notifier_port_obj_info * - port_obj_info) + port_obj_info, + struct switchdev_trans *trans) { struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(port_obj_info->obj); bool flag_untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool flag_pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; - struct switchdev_trans *trans = port_obj_info->trans; struct mlxsw_sp_bridge_device *bridge_device; struct netlink_ext_ack *extack; struct mlxsw_sp *mlxsw_sp; @@ -3462,12 +3464,22 @@ mlxsw_sp_switchdev_handle_vxlan_obj_add(struct net_device *vxlan_dev, struct switchdev_notifier_port_obj_info * port_obj_info) { + struct switchdev_trans trans; int err = 0; switch (port_obj_info->obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: + trans.ph_prepare = true; + err = mlxsw_sp_switchdev_vxlan_vlans_add(vxlan_dev, + port_obj_info, + &trans); + if (err) + break; + + trans.ph_prepare = false; err = mlxsw_sp_switchdev_vxlan_vlans_add(vxlan_dev, - port_obj_info); + port_obj_info, + &trans); break; default: break; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 3b8718b143bb..16d958a6e206 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -890,33 +890,27 @@ static int ocelot_port_attr_set(struct net_device *dev, } static int ocelot_port_obj_add_vlan(struct net_device *dev, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) + const struct switchdev_obj_port_vlan *vlan) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; int ret; - if (switchdev_trans_ph_prepare(trans)) - ret = ocelot_vlan_vid_prepare(dev, vlan->vid, pvid, untagged); - else - ret = ocelot_vlan_vid_add(dev, vlan->vid, pvid, untagged); + ret = ocelot_vlan_vid_prepare(dev, vlan->vid, pvid, untagged); + if (ret) + return ret; - return ret; + return ocelot_vlan_vid_add(dev, vlan->vid, pvid, untagged); } static int ocelot_port_obj_add_mdb(struct net_device *dev, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) + const struct switchdev_obj_port_mdb *mdb) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; struct ocelot *ocelot = ocelot_port->ocelot; int port = priv->chip_port; - if (switchdev_trans_ph_prepare(trans)) - return 0; - return ocelot_port_mdb_add(ocelot, port, mdb); } @@ -933,7 +927,6 @@ static int ocelot_port_obj_del_mdb(struct net_device *dev, static int ocelot_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { int ret = 0; @@ -941,12 +934,10 @@ static int ocelot_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: ret = ocelot_port_obj_add_vlan(dev, - SWITCHDEV_OBJ_PORT_VLAN(obj), - trans); + SWITCHDEV_OBJ_PORT_VLAN(obj)); break; case SWITCHDEV_OBJ_ID_PORT_MDB: - ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); break; default: return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index dd0bc7f0aaee..1018d3759316 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -1638,17 +1638,13 @@ rocker_world_port_attr_bridge_ageing_time_set(struct rocker_port *rocker_port, static int rocker_world_port_obj_vlan_add(struct rocker_port *rocker_port, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) + const struct switchdev_obj_port_vlan *vlan) { struct rocker_world_ops *wops = rocker_port->rocker->wops; if (!wops->port_obj_vlan_add) return -EOPNOTSUPP; - if (switchdev_trans_ph_prepare(trans)) - return 0; - return wops->port_obj_vlan_add(rocker_port, vlan); } @@ -2102,8 +2098,7 @@ static int rocker_port_attr_set(struct net_device *dev, } static int rocker_port_obj_add(struct net_device *dev, - const struct switchdev_obj *obj, - struct switchdev_trans *trans) + const struct switchdev_obj *obj) { struct rocker_port *rocker_port = netdev_priv(dev); int err = 0; @@ -2111,8 +2106,7 @@ static int rocker_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: err = rocker_world_port_obj_vlan_add(rocker_port, - SWITCHDEV_OBJ_PORT_VLAN(obj), - trans); + SWITCHDEV_OBJ_PORT_VLAN(obj)); break; default: err = -EOPNOTSUPP; @@ -2847,8 +2841,7 @@ rocker_switchdev_port_obj_event(unsigned long event, struct net_device *netdev, switch (event) { case SWITCHDEV_PORT_OBJ_ADD: - err = rocker_port_obj_add(netdev, port_obj_info->obj, - port_obj_info->trans); + err = rocker_port_obj_add(netdev, port_obj_info->obj); break; case SWITCHDEV_PORT_OBJ_DEL: err = rocker_port_obj_del(netdev, port_obj_info->obj); diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index 8a36228acc5d..3232f483c068 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -253,8 +253,7 @@ static int cpsw_port_vlan_del(struct cpsw_priv *priv, u16 vid, } static int cpsw_port_vlans_add(struct cpsw_priv *priv, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) + const struct switchdev_obj_port_vlan *vlan) { bool untag = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct net_device *orig_dev = vlan->obj.orig_dev; @@ -267,15 +266,11 @@ static int cpsw_port_vlans_add(struct cpsw_priv *priv, if (cpu_port && !(vlan->flags & BRIDGE_VLAN_INFO_BRENTRY)) return 0; - if (switchdev_trans_ph_prepare(trans)) - return 0; - return cpsw_port_vlan_add(priv, untag, pvid, vlan->vid, orig_dev); } static int cpsw_port_mdb_add(struct cpsw_priv *priv, - struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) + struct switchdev_obj_port_mdb *mdb) { struct net_device *orig_dev = mdb->obj.orig_dev; @@ -284,9 +279,6 @@ static int cpsw_port_mdb_add(struct cpsw_priv *priv, int port_mask; int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (cpu_port) port_mask = BIT(HOST_PORT_NUM); else @@ -325,7 +317,6 @@ static int cpsw_port_mdb_del(struct cpsw_priv *priv, static int cpsw_port_obj_add(struct net_device *ndev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); @@ -338,11 +329,11 @@ static int cpsw_port_obj_add(struct net_device *ndev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = cpsw_port_vlans_add(priv, vlan, trans); + err = cpsw_port_vlans_add(priv, vlan); break; case SWITCHDEV_OBJ_ID_PORT_MDB: case SWITCHDEV_OBJ_ID_HOST_MDB: - err = cpsw_port_mdb_add(priv, mdb, trans); + err = cpsw_port_mdb_add(priv, mdb); break; default: err = -EOPNOTSUPP; diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index 62edb8d01f4e..197dea9c3b42 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -975,33 +975,38 @@ static int dpaa2_switch_port_attr_set(struct net_device *netdev, } static int dpaa2_switch_port_vlans_add(struct net_device *netdev, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) + const struct switchdev_obj_port_vlan *vlan) { struct ethsw_port_priv *port_priv = netdev_priv(netdev); struct ethsw_core *ethsw = port_priv->ethsw_data; struct dpsw_attr *attr = ðsw->sw_attr; int err = 0; - if (switchdev_trans_ph_prepare(trans)) { - /* Make sure that the VLAN is not already configured - * on the switch port - */ - if (port_priv->vlans[vlan->vid] & ETHSW_VLAN_MEMBER) - return -EEXIST; + /* Make sure that the VLAN is not already configured + * on the switch port + */ + if (port_priv->vlans[vlan->vid] & ETHSW_VLAN_MEMBER) + return -EEXIST; - /* Check if there is space for a new VLAN */ - err = dpsw_get_attributes(ethsw->mc_io, 0, ethsw->dpsw_handle, - ðsw->sw_attr); - if (err) { - netdev_err(netdev, "dpsw_get_attributes err %d\n", err); - return err; - } - if (attr->max_vlans - attr->num_vlans < 1) - return -ENOSPC; + /* Check if there is space for a new VLAN */ + err = dpsw_get_attributes(ethsw->mc_io, 0, ethsw->dpsw_handle, + ðsw->sw_attr); + if (err) { + netdev_err(netdev, "dpsw_get_attributes err %d\n", err); + return err; + } + if (attr->max_vlans - attr->num_vlans < 1) + return -ENOSPC; - return 0; + /* Check if there is space for a new VLAN */ + err = dpsw_get_attributes(ethsw->mc_io, 0, ethsw->dpsw_handle, + ðsw->sw_attr); + if (err) { + netdev_err(netdev, "dpsw_get_attributes err %d\n", err); + return err; } + if (attr->max_vlans - attr->num_vlans < 1) + return -ENOSPC; if (!port_priv->ethsw_data->vlans[vlan->vid]) { /* this is a new VLAN */ @@ -1033,15 +1038,11 @@ static int dpaa2_switch_port_lookup_address(struct net_device *netdev, int is_uc } static int dpaa2_switch_port_mdb_add(struct net_device *netdev, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) + const struct switchdev_obj_port_mdb *mdb) { struct ethsw_port_priv *port_priv = netdev_priv(netdev); int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - /* Check if address is already set on this port */ if (dpaa2_switch_port_lookup_address(netdev, 0, mdb->addr)) return -EEXIST; @@ -1060,21 +1061,18 @@ static int dpaa2_switch_port_mdb_add(struct net_device *netdev, } static int dpaa2_switch_port_obj_add(struct net_device *netdev, - const struct switchdev_obj *obj, - struct switchdev_trans *trans) + const struct switchdev_obj *obj) { int err; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dpaa2_switch_port_vlans_add(netdev, - SWITCHDEV_OBJ_PORT_VLAN(obj), - trans); + SWITCHDEV_OBJ_PORT_VLAN(obj)); break; case SWITCHDEV_OBJ_ID_PORT_MDB: err = dpaa2_switch_port_mdb_add(netdev, - SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + SWITCHDEV_OBJ_PORT_MDB(obj)); break; default: err = -EOPNOTSUPP; @@ -1394,8 +1392,7 @@ static int dpaa2_switch_port_obj_event(unsigned long event, switch (event) { case SWITCHDEV_PORT_OBJ_ADD: - err = dpaa2_switch_port_obj_add(netdev, port_obj_info->obj, - port_obj_info->trans); + err = dpaa2_switch_port_obj_add(netdev, port_obj_info->obj); break; case SWITCHDEV_PORT_OBJ_DEL: err = dpaa2_switch_port_obj_del(netdev, port_obj_info->obj); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index bac7d3ba574f..cbe6e35d51f5 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -233,7 +233,6 @@ struct switchdev_notifier_fdb_info { struct switchdev_notifier_port_obj_info { struct switchdev_notifier_info info; /* must be first */ const struct switchdev_obj *obj; - struct switchdev_trans *trans; bool handled; }; @@ -288,7 +287,6 @@ int switchdev_handle_port_obj_add(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)); int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, @@ -372,7 +370,6 @@ switchdev_handle_port_obj_add(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)) { return 0; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3822520eeeae..9eaa85cc87ea 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -52,7 +52,6 @@ struct dsa_notifier_fdb_info { /* DSA_NOTIFIER_MDB_* */ struct dsa_notifier_mdb_info { const struct switchdev_obj_port_mdb *mdb; - struct switchdev_trans *trans; int sw_index; int port; }; @@ -60,7 +59,6 @@ struct dsa_notifier_mdb_info { /* DSA_NOTIFIER_VLAN_* */ struct dsa_notifier_vlan_info { const struct switchdev_obj_port_vlan *vlan; - struct switchdev_trans *trans; int sw_index; int port; }; @@ -151,8 +149,7 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans); + const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, @@ -162,8 +159,7 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, struct switchdev_trans *trans); int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans); + const struct switchdev_obj_port_vlan *vlan); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); int dsa_port_link_register_of(struct dsa_port *dp); diff --git a/net/dsa/port.c b/net/dsa/port.c index 73569c9af3cc..6668fe188f47 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -425,13 +425,11 @@ int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) } int dsa_port_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans) + const struct switchdev_obj_port_mdb *mdb) { struct dsa_notifier_mdb_info info = { .sw_index = dp->ds->index, .port = dp->index, - .trans = trans, .mdb = mdb, }; @@ -451,13 +449,11 @@ int dsa_port_mdb_del(const struct dsa_port *dp, } int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans) + const struct switchdev_obj_port_vlan *vlan) { struct dsa_notifier_vlan_info info = { .sw_index = dp->ds->index, .port = dp->index, - .trans = trans, .vlan = vlan, }; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 653d64f4a637..0eefb50aae8c 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -326,8 +326,7 @@ dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave, } static int dsa_slave_vlan_add(struct net_device *dev, - const struct switchdev_obj *obj, - struct switchdev_trans *trans) + const struct switchdev_obj *obj) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); @@ -345,7 +344,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ - if (trans->ph_prepare && br_vlan_enabled(dp->bridge_dev)) { + if (br_vlan_enabled(dp->bridge_dev)) { rcu_read_lock(); err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan); rcu_read_unlock(); @@ -353,7 +352,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, return err; } - err = dsa_port_vlan_add(dp, &vlan, trans); + err = dsa_port_vlan_add(dp, &vlan); if (err) return err; @@ -363,7 +362,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; - err = dsa_port_vlan_add(dp->cpu_dp, &vlan, trans); + err = dsa_port_vlan_add(dp->cpu_dp, &vlan); if (err) return err; @@ -372,7 +371,6 @@ static int dsa_slave_vlan_add(struct net_device *dev, static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -387,17 +385,16 @@ static int dsa_slave_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: if (obj->orig_dev != dev) return -EOPNOTSUPP; - err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); + err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: /* DSA can directly translate this to a normal MDB add, * but on the CPU port. */ - err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj), - trans); + err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_slave_vlan_add(dev, obj, trans); + err = dsa_slave_vlan_add(dev, obj); break; default: err = -EOPNOTSUPP; @@ -1286,28 +1283,15 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; - struct switchdev_trans trans; int ret; /* User port... */ - trans.ph_prepare = true; - ret = dsa_port_vlan_add(dp, &vlan, &trans); - if (ret) - return ret; - - trans.ph_prepare = false; - ret = dsa_port_vlan_add(dp, &vlan, &trans); + ret = dsa_port_vlan_add(dp, &vlan); if (ret) return ret; /* And CPU port... */ - trans.ph_prepare = true; - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans); - if (ret) - return ret; - - trans.ph_prepare = false; - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans); + ret = dsa_port_vlan_add(dp->cpu_dp, &vlan); if (ret) return ret; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 3fb362b6874e..5b0bf29e1375 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -190,8 +190,8 @@ static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, return false; } -static int dsa_switch_mdb_prepare(struct dsa_switch *ds, - struct dsa_notifier_mdb_info *info) +static int dsa_switch_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) { int port, err; @@ -206,20 +206,6 @@ static int dsa_switch_mdb_prepare(struct dsa_switch *ds, } } - return 0; -} - -static int dsa_switch_mdb_add(struct dsa_switch *ds, - struct dsa_notifier_mdb_info *info) -{ - int port; - - if (switchdev_trans_ph_prepare(info->trans)) - return dsa_switch_mdb_prepare(ds, info); - - if (!ds->ops->port_mdb_add) - return 0; - for (port = 0; port < ds->num_ports; port++) if (dsa_switch_mdb_match(ds, port, info)) ds->ops->port_mdb_add(ds, port, info->mdb); @@ -251,8 +237,8 @@ static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, return false; } -static int dsa_switch_vlan_prepare(struct dsa_switch *ds, - struct dsa_notifier_vlan_info *info) +static int dsa_switch_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_vlan_info *info) { int port, err; @@ -267,20 +253,6 @@ static int dsa_switch_vlan_prepare(struct dsa_switch *ds, } } - return 0; -} - -static int dsa_switch_vlan_add(struct dsa_switch *ds, - struct dsa_notifier_vlan_info *info) -{ - int port; - - if (switchdev_trans_ph_prepare(info->trans)) - return dsa_switch_vlan_prepare(ds, info); - - if (!ds->ops->port_vlan_add) - return 0; - for (port = 0; port < ds->num_ports; port++) if (dsa_switch_vlan_match(ds, port, info)) ds->ops->port_vlan_add(ds, port, info->vlan); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 23d868545362..a575bb33ee6c 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -221,7 +221,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack) { int rc; @@ -229,7 +228,6 @@ static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct switchdev_notifier_port_obj_info obj_info = { .obj = obj, - .trans = trans, .handled = false, }; @@ -248,35 +246,10 @@ static int switchdev_port_obj_add_now(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { - struct switchdev_trans trans; - int err; - ASSERT_RTNL(); - /* Phase I: prepare for obj add. Driver/device should fail - * here if there are going to be issues in the commit phase, - * such as lack of resources or support. The driver/device - * should reserve resources needed for the commit phase here, - * but should not commit the obj. - */ - - trans.ph_prepare = true; - err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, - dev, obj, &trans, extack); - if (err) - return err; - - /* Phase II: commit obj add. This cannot fail as a fault - * of driver/device. If it does, it's a bug in the driver/device - * because the driver said everythings was OK in phase I. - */ - - trans.ph_prepare = false; - err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, - dev, obj, &trans, extack); - WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); - - return err; + return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, + dev, obj, extack); } static void switchdev_port_obj_add_deferred(struct net_device *dev, @@ -307,10 +280,6 @@ static int switchdev_port_obj_add_defer(struct net_device *dev, * @obj: object to add * @extack: netlink extended ack * - * Use a 2-phase prepare-commit transaction model to ensure - * system is not left in a partially updated state due to - * failure from driver/device. - * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ @@ -329,7 +298,7 @@ static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, - dev, obj, NULL, NULL); + dev, obj, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, @@ -449,7 +418,6 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)) { struct netlink_ext_ack *extack; @@ -462,8 +430,7 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, if (check_cb(dev)) { /* This flag is only checked if the return value is success. */ port_obj_info->handled = true; - return add_cb(dev, port_obj_info->obj, port_obj_info->trans, - extack); + return add_cb(dev, port_obj_info->obj, extack); } /* Switch ports might be stacked under e.g. a LAG. Ignore the @@ -491,7 +458,6 @@ int switchdev_handle_port_obj_add(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)) { int err; -- cgit v1.2.3 From bae33f2b5afea932176c1b9096851c81dc0983de Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 9 Jan 2021 02:01:50 +0200 Subject: net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. Signed-off-by: Vladimir Oltean Acked-by: Linus Walleij Acked-by: Jiri Pirko Reviewed-by: Kurt Kanzenbach # hellcreek Reviewed-by: Linus Walleij # RTL8366RB Reviewed-by: Ido Schimmel Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 6 +- drivers/net/dsa/b53/b53_priv.h | 3 +- drivers/net/dsa/dsa_loop.c | 3 +- drivers/net/dsa/hirschmann/hellcreek.c | 6 +- drivers/net/dsa/lantiq_gswip.c | 26 ++---- drivers/net/dsa/microchip/ksz8795.c | 6 +- drivers/net/dsa/microchip/ksz9477.c | 6 +- drivers/net/dsa/mt7530.c | 6 +- drivers/net/dsa/mv88e6xxx/chip.c | 7 +- drivers/net/dsa/ocelot/felix.c | 5 +- drivers/net/dsa/qca8k.c | 6 +- drivers/net/dsa/realtek-smi-core.h | 3 +- drivers/net/dsa/rtl8366.c | 11 +-- drivers/net/dsa/sja1105/sja1105.h | 3 +- drivers/net/dsa/sja1105/sja1105_devlink.c | 9 +- drivers/net/dsa/sja1105/sja1105_main.c | 17 ++-- .../ethernet/marvell/prestera/prestera_switchdev.c | 37 ++------- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 63 +++----------- drivers/net/ethernet/mscc/ocelot.c | 32 +++---- drivers/net/ethernet/mscc/ocelot_net.c | 13 +-- drivers/net/ethernet/rocker/rocker.h | 6 +- drivers/net/ethernet/rocker/rocker_main.c | 46 +++------- drivers/net/ethernet/rocker/rocker_ofdpa.c | 23 ++--- drivers/net/ethernet/ti/cpsw_switchdev.c | 20 ++--- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 21 ++--- include/net/dsa.h | 3 +- include/net/switchdev.h | 7 +- include/soc/mscc/ocelot.h | 3 +- net/dsa/dsa_priv.h | 18 ++-- net/dsa/port.c | 97 ++++++++++------------ net/dsa/slave.c | 17 ++-- net/dsa/switch.c | 11 +-- net/switchdev/switchdev.c | 46 ++-------- 33 files changed, 162 insertions(+), 424 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index bab174c052b3..f07f547db53e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1374,14 +1374,10 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_phylink_mac_link_up); -int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, - struct switchdev_trans *trans) +int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) { struct b53_device *dev = ds->priv; - if (switchdev_trans_ph_prepare(trans)) - return 0; - b53_enable_vlan(dev, dev->vlan_enabled, vlan_filtering); return 0; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 6d0c724763c7..9ab0fb409f78 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -348,8 +348,7 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, struct phy_device *phydev, int speed, int duplex, bool tx_pause, bool rx_pause); -int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, - struct switchdev_trans *trans); +int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); int b53_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void b53_vlan_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 3be9f665d174..8e3d623f4dbd 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -190,8 +190,7 @@ static void dsa_loop_port_stp_state_set(struct dsa_switch *ds, int port, } static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans) + bool vlan_filtering) { dev_dbg(ds->dev, "%s: port: %d, vlan_filtering: %d\n", __func__, port, vlan_filtering); diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index a59cc40170fc..4e1dbf91720c 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -858,14 +858,10 @@ static int hellcreek_fdb_dump(struct dsa_switch *ds, int port, } static int hellcreek_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans) + bool vlan_filtering) { struct hellcreek *hellcreek = ds->priv; - if (switchdev_trans_ph_prepare(trans)) - return 0; - dev_dbg(hellcreek->dev, "%s VLAN filtering on port %d\n", vlan_filtering ? "Enable" : "Disable", port); diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 72cadd16056f..1dddca92d17e 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -727,23 +727,14 @@ static int gswip_pce_load_microcode(struct gswip_priv *priv) } static int gswip_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans) + bool vlan_filtering) { + struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; struct gswip_priv *priv = ds->priv; /* Do not allow changing the VLAN filtering options while in bridge */ - if (switchdev_trans_ph_prepare(trans)) { - struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; - - if (!bridge) - return 0; - - if (!!(priv->port_vlan_filter & BIT(port)) != vlan_filtering) - return -EIO; - - return 0; - } + if (bridge && !!(priv->port_vlan_filter & BIT(port)) != vlan_filtering) + return -EIO; if (vlan_filtering) { /* Use port based VLAN tag */ @@ -781,15 +772,8 @@ static int gswip_setup(struct dsa_switch *ds) /* disable port fetch/store dma on all ports */ for (i = 0; i < priv->hw_info->max_ports; i++) { - struct switchdev_trans trans; - - /* Skip the prepare phase, this shouldn't return an error - * during setup. - */ - trans.ph_prepare = false; - gswip_port_disable(ds, i); - gswip_port_vlan_filtering(ds, i, false, &trans); + gswip_port_vlan_filtering(ds, i, false); } /* enable Switch */ diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index a4c814f6a4b5..6a9ec8a0f92f 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -783,14 +783,10 @@ static void ksz8795_flush_dyn_mac_table(struct ksz_device *dev, int port) } static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port, - bool flag, - struct switchdev_trans *trans) + bool flag) { struct ksz_device *dev = ds->priv; - if (switchdev_trans_ph_prepare(trans)) - return 0; - ksz_cfg(dev, S_MIRROR_CTRL, SW_VLAN_ENABLE, flag); return 0; diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 5a6ac0749ab0..446c088ce5c5 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -493,14 +493,10 @@ static void ksz9477_flush_dyn_mac_table(struct ksz_device *dev, int port) } static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port, - bool flag, - struct switchdev_trans *trans) + bool flag) { struct ksz_device *dev = ds->priv; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (flag) { ksz_port_cfg(dev, port, REG_PORT_LUE_CTRL, PORT_VLAN_LOOKUP_VID_0, true); diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 31d2d23bc815..fcaddc9c9370 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1376,12 +1376,8 @@ mt7530_vlan_cmd(struct mt7530_priv *priv, enum mt7530_vlan_cmd cmd, u16 vid) static int mt7530_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans) + bool vlan_filtering) { - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (vlan_filtering) { /* The port is being kept as VLAN-unaware port when bridge is * set up with vlan_filtering not being set, Otherwise, the diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index fb25cb87156a..bbf1a71ce55c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1583,16 +1583,15 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, } static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans) + bool vlan_filtering) { struct mv88e6xxx_chip *chip = ds->priv; u16 mode = vlan_filtering ? MV88E6XXX_PORT_CTL2_8021Q_MODE_SECURE : MV88E6XXX_PORT_CTL2_8021Q_MODE_DISABLED; int err; - if (switchdev_trans_ph_prepare(trans)) - return mv88e6xxx_max_vid(chip) ? 0 : -EOPNOTSUPP; + if (!mv88e6xxx_max_vid(chip)) + return -EOPNOTSUPP; mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_set_8021q_mode(chip, port, mode); diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 216fcfbc5daf..8c4cd9168dba 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -134,12 +134,11 @@ static int felix_vlan_prepare(struct dsa_switch *ds, int port, flags & BRIDGE_VLAN_INFO_UNTAGGED); } -static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, - struct switchdev_trans *trans) +static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) { struct ocelot *ocelot = ds->priv; - return ocelot_port_vlan_filtering(ocelot, port, enabled, trans); + return ocelot_port_vlan_filtering(ocelot, port, enabled); } static void felix_vlan_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index df99c696b688..1de6473b221b 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1294,14 +1294,10 @@ qca8k_port_fdb_dump(struct dsa_switch *ds, int port, } static int -qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, - struct switchdev_trans *trans) +qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) { struct qca8k_priv *priv = ds->priv; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (vlan_filtering) { qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(port), QCA8K_PORT_LOOKUP_VLAN_MODE, diff --git a/drivers/net/dsa/realtek-smi-core.h b/drivers/net/dsa/realtek-smi-core.h index 6b6a3dec0984..bc7bd47fb037 100644 --- a/drivers/net/dsa/realtek-smi-core.h +++ b/drivers/net/dsa/realtek-smi-core.h @@ -131,8 +131,7 @@ int rtl8366_enable_vlan(struct realtek_smi *smi, bool enable); int rtl8366_reset_vlan(struct realtek_smi *smi); int rtl8366_init_vlan(struct realtek_smi *smi); int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans); + bool vlan_filtering); int rtl8366_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void rtl8366_vlan_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index 1a8f93112bc6..27f429aa89a6 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -340,20 +340,15 @@ int rtl8366_init_vlan(struct realtek_smi *smi) } EXPORT_SYMBOL_GPL(rtl8366_init_vlan); -int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, - struct switchdev_trans *trans) +int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) { struct realtek_smi *smi = ds->priv; struct rtl8366_vlan_4k vlan4k; int ret; /* Use VLAN nr port + 1 since VLAN0 is not valid */ - if (switchdev_trans_ph_prepare(trans)) { - if (!smi->ops->is_vlan_valid(smi, port + 1)) - return -EINVAL; - - return 0; - } + if (!smi->ops->is_vlan_valid(smi, port + 1)) + return -EINVAL; dev_info(smi->dev, "%s filtering on port %d\n", vlan_filtering ? "enable" : "disable", diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 4ebc4a5a7b35..d582308c2401 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -245,8 +245,7 @@ enum sja1105_reset_reason { int sja1105_static_config_reload(struct sja1105_private *priv, enum sja1105_reset_reason reason); -int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, - struct switchdev_trans *trans); +int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled); void sja1105_frame_memory_partitioning(struct sja1105_private *priv); /* From sja1105_devlink.c */ diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c index 4a2ec395bcb0..b4bf1b10e66c 100644 --- a/drivers/net/dsa/sja1105/sja1105_devlink.c +++ b/drivers/net/dsa/sja1105/sja1105_devlink.c @@ -135,7 +135,6 @@ static int sja1105_best_effort_vlan_filtering_set(struct sja1105_private *priv, rtnl_lock(); for (port = 0; port < ds->num_ports; port++) { - struct switchdev_trans trans; struct dsa_port *dp; if (!dsa_is_user_port(ds, port)) @@ -144,13 +143,7 @@ static int sja1105_best_effort_vlan_filtering_set(struct sja1105_private *priv, dp = dsa_to_port(ds, port); vlan_filtering = dsa_port_is_vlan_filtering(dp); - trans.ph_prepare = true; - rc = sja1105_vlan_filtering(ds, port, vlan_filtering, &trans); - if (rc) - break; - - trans.ph_prepare = false; - rc = sja1105_vlan_filtering(ds, port, vlan_filtering, &trans); + rc = sja1105_vlan_filtering(ds, port, vlan_filtering); if (rc) break; } diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 807e65ac2518..c1b7f2b0e40c 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2631,8 +2631,7 @@ static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, * which can only be partially reconfigured at runtime (and not the TPID). * So a switch reset is required. */ -int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, - struct switchdev_trans *trans) +int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) { struct sja1105_l2_lookup_params_entry *l2_lookup_params; struct sja1105_general_params_entry *general_params; @@ -2644,16 +2643,12 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, u16 tpid, tpid2; int rc; - if (switchdev_trans_ph_prepare(trans)) { - list_for_each_entry(rule, &priv->flow_block.rules, list) { - if (rule->type == SJA1105_RULE_VL) { - dev_err(ds->dev, - "Cannot change VLAN filtering with active VL rules\n"); - return -EBUSY; - } + list_for_each_entry(rule, &priv->flow_block.rules, list) { + if (rule->type == SJA1105_RULE_VL) { + dev_err(ds->dev, + "Cannot change VLAN filtering with active VL rules\n"); + return -EBUSY; } - - return 0; } if (enabled) { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 3235458a5501..e2374a39e4f8 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -580,16 +580,12 @@ int prestera_bridge_port_event(struct net_device *dev, unsigned long event, } static int prestera_port_attr_br_flags_set(struct prestera_port *port, - struct switchdev_trans *trans, struct net_device *dev, unsigned long flags) { struct prestera_bridge_port *br_port; int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - br_port = prestera_bridge_port_by_dev(port->sw->swdev, dev); if (!br_port) return 0; @@ -608,35 +604,26 @@ static int prestera_port_attr_br_flags_set(struct prestera_port *port, } static int prestera_port_attr_br_ageing_set(struct prestera_port *port, - struct switchdev_trans *trans, unsigned long ageing_clock_t) { unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock_t); u32 ageing_time_ms = jiffies_to_msecs(ageing_jiffies); struct prestera_switch *sw = port->sw; - if (switchdev_trans_ph_prepare(trans)) { - if (ageing_time_ms < PRESTERA_MIN_AGEING_TIME_MS || - ageing_time_ms > PRESTERA_MAX_AGEING_TIME_MS) - return -ERANGE; - else - return 0; - } + if (ageing_time_ms < PRESTERA_MIN_AGEING_TIME_MS || + ageing_time_ms > PRESTERA_MAX_AGEING_TIME_MS) + return -ERANGE; return prestera_hw_switch_ageing_set(sw, ageing_time_ms); } static int prestera_port_attr_br_vlan_set(struct prestera_port *port, - struct switchdev_trans *trans, struct net_device *dev, bool vlan_enabled) { struct prestera_switch *sw = port->sw; struct prestera_bridge *bridge; - if (!switchdev_trans_ph_prepare(trans)) - return 0; - bridge = prestera_bridge_by_dev(sw->swdev, dev); if (WARN_ON(!bridge)) return -EINVAL; @@ -666,7 +653,6 @@ static int prestera_port_bridge_vlan_stp_set(struct prestera_port *port, } static int presterar_port_attr_stp_state_set(struct prestera_port *port, - struct switchdev_trans *trans, struct net_device *dev, u8 state) { @@ -675,9 +661,6 @@ static int presterar_port_attr_stp_state_set(struct prestera_port *port, int err; u16 vid; - if (switchdev_trans_ph_prepare(trans)) - return 0; - br_port = prestera_bridge_port_by_dev(port->sw->swdev, dev); if (!br_port) return 0; @@ -712,16 +695,14 @@ err_port_stp_set: } static int prestera_port_obj_attr_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { struct prestera_port *port = netdev_priv(dev); int err = 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - err = presterar_port_attr_stp_state_set(port, trans, - attr->orig_dev, + err = presterar_port_attr_stp_state_set(port, attr->orig_dev, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: @@ -730,17 +711,15 @@ static int prestera_port_obj_attr_set(struct net_device *dev, err = -EINVAL; break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - err = prestera_port_attr_br_flags_set(port, trans, - attr->orig_dev, + err = prestera_port_attr_br_flags_set(port, attr->orig_dev, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: - err = prestera_port_attr_br_ageing_set(port, trans, + err = prestera_port_attr_br_ageing_set(port, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - err = prestera_port_attr_br_vlan_set(port, trans, - attr->orig_dev, + err = prestera_port_attr_br_vlan_set(port, attr->orig_dev, attr->u.vlan_filtering); break; default: diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index eff5920aed06..409c206f813f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -527,7 +527,6 @@ mlxsw_sp_port_bridge_vlan_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_port_attr_stp_state_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, u8 state) { @@ -535,9 +534,6 @@ static int mlxsw_sp_port_attr_stp_state_set(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_vlan *bridge_vlan; int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - /* It's possible we failed to enslave the port, yet this * operation is executed due to it being deferred. */ @@ -659,7 +655,6 @@ err_port_bridge_vlan_learning_set: static int mlxsw_sp_port_attr_br_pre_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, unsigned long brport_flags) { if (brport_flags & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) @@ -669,16 +664,12 @@ static int mlxsw_sp_port_attr_br_pre_flags_set(struct mlxsw_sp_port } static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, unsigned long brport_flags) { struct mlxsw_sp_bridge_port *bridge_port; int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, orig_dev); if (!bridge_port) @@ -724,35 +715,26 @@ static int mlxsw_sp_ageing_set(struct mlxsw_sp *mlxsw_sp, u32 ageing_time) } static int mlxsw_sp_port_attr_br_ageing_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, unsigned long ageing_clock_t) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock_t); u32 ageing_time = jiffies_to_msecs(ageing_jiffies) / 1000; - if (switchdev_trans_ph_prepare(trans)) { - if (ageing_time < MLXSW_SP_MIN_AGEING_TIME || - ageing_time > MLXSW_SP_MAX_AGEING_TIME) - return -ERANGE; - else - return 0; - } + if (ageing_time < MLXSW_SP_MIN_AGEING_TIME || + ageing_time > MLXSW_SP_MAX_AGEING_TIME) + return -ERANGE; return mlxsw_sp_ageing_set(mlxsw_sp, ageing_time); } static int mlxsw_sp_port_attr_br_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, bool vlan_enabled) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_bridge_device *bridge_device; - if (!switchdev_trans_ph_prepare(trans)) - return 0; - bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev); if (WARN_ON(!bridge_device)) return -EINVAL; @@ -765,16 +747,12 @@ static int mlxsw_sp_port_attr_br_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_port_attr_br_vlan_proto_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, u16 vlan_proto) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_bridge_device *bridge_device; - if (!switchdev_trans_ph_prepare(trans)) - return 0; - bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev); if (WARN_ON(!bridge_device)) return -EINVAL; @@ -784,16 +762,12 @@ static int mlxsw_sp_port_attr_br_vlan_proto_set(struct mlxsw_sp_port *mlxsw_sp_p } static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, bool is_port_mrouter) { struct mlxsw_sp_bridge_port *bridge_port; int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, orig_dev); if (!bridge_port) @@ -825,7 +799,6 @@ static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port) } static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, bool mc_disabled) { @@ -834,9 +807,6 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_port *bridge_port; int err; - if (switchdev_trans_ph_prepare(trans)) - return 0; - /* It's possible we failed to enslave the port, yet this * operation is executed due to it being deferred. */ @@ -896,16 +866,12 @@ mlxsw_sp_bridge_mrouter_update_mdb(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, struct net_device *orig_dev, bool is_mrouter) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_bridge_device *bridge_device; - if (switchdev_trans_ph_prepare(trans)) - return 0; - /* It's possible we failed to enslave the port, yet this * operation is executed due to it being deferred. */ @@ -921,54 +887,52 @@ mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); int err; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - err = mlxsw_sp_port_attr_stp_state_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_stp_state_set(mlxsw_sp_port, attr->orig_dev, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: err = mlxsw_sp_port_attr_br_pre_flags_set(mlxsw_sp_port, - trans, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - err = mlxsw_sp_port_attr_br_flags_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_br_flags_set(mlxsw_sp_port, attr->orig_dev, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: - err = mlxsw_sp_port_attr_br_ageing_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_br_ageing_set(mlxsw_sp_port, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - err = mlxsw_sp_port_attr_br_vlan_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_br_vlan_set(mlxsw_sp_port, attr->orig_dev, attr->u.vlan_filtering); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL: - err = mlxsw_sp_port_attr_br_vlan_proto_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_br_vlan_proto_set(mlxsw_sp_port, attr->orig_dev, attr->u.vlan_protocol); break; case SWITCHDEV_ATTR_ID_PORT_MROUTER: - err = mlxsw_sp_port_attr_mrouter_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_mrouter_set(mlxsw_sp_port, attr->orig_dev, attr->u.mrouter); break; case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED: - err = mlxsw_sp_port_mc_disabled_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_mc_disabled_set(mlxsw_sp_port, attr->orig_dev, attr->u.mc_disabled); break; case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: - err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, trans, + err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, attr->orig_dev, attr->u.mrouter); break; @@ -977,8 +941,7 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev, break; } - if (switchdev_trans_ph_commit(trans)) - mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp); + mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp); return err; } diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 0b9992bd6626..af620f7ce469 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -208,25 +208,20 @@ static void ocelot_port_set_pvid(struct ocelot *ocelot, int port, } int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, - bool vlan_aware, struct switchdev_trans *trans) + bool vlan_aware) { + struct ocelot_vcap_block *block = &ocelot->block[VCAP_IS1]; struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_vcap_filter *filter; u32 val; - if (switchdev_trans_ph_prepare(trans)) { - struct ocelot_vcap_block *block = &ocelot->block[VCAP_IS1]; - struct ocelot_vcap_filter *filter; - - list_for_each_entry(filter, &block->rules, list) { - if (filter->ingress_port_mask & BIT(port) && - filter->action.vid_replace_ena) { - dev_err(ocelot->dev, - "Cannot change VLAN state with vlan modify rules active\n"); - return -EBUSY; - } + list_for_each_entry(filter, &block->rules, list) { + if (filter->ingress_port_mask & BIT(port) && + filter->action.vid_replace_ena) { + dev_err(ocelot->dev, + "Cannot change VLAN state with vlan modify rules active\n"); + return -EBUSY; } - - return 0; } ocelot_port->vlan_aware = vlan_aware; @@ -1179,7 +1174,6 @@ int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, struct net_device *bridge) { struct ocelot_vlan pvid = {0}, native_vlan = {0}; - struct switchdev_trans trans; int ret; ocelot->bridge_mask &= ~BIT(port); @@ -1187,13 +1181,7 @@ int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, if (!ocelot->bridge_mask) ocelot->hw_bridge_dev = NULL; - trans.ph_prepare = true; - ret = ocelot_port_vlan_filtering(ocelot, port, false, &trans); - if (ret) - return ret; - - trans.ph_prepare = false; - ret = ocelot_port_vlan_filtering(ocelot, port, false, &trans); + ret = ocelot_port_vlan_filtering(ocelot, port, false); if (ret) return ret; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 16d958a6e206..4fb9095be3ea 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -825,12 +825,8 @@ static const struct ethtool_ops ocelot_ethtool_ops = { }; static void ocelot_port_attr_stp_state_set(struct ocelot *ocelot, int port, - struct switchdev_trans *trans, u8 state) { - if (switchdev_trans_ph_prepare(trans)) - return; - ocelot_bridge_stp_state_set(ocelot, port, state); } @@ -858,8 +854,7 @@ static void ocelot_port_attr_mc_set(struct ocelot *ocelot, int port, bool mc) } static int ocelot_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot *ocelot = priv->port.ocelot; @@ -868,15 +863,13 @@ static int ocelot_port_attr_set(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - ocelot_port_attr_stp_state_set(ocelot, port, trans, - attr->u.stp_state); + ocelot_port_attr_stp_state_set(ocelot, port, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: ocelot_port_attr_ageing_set(ocelot, port, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - ocelot_port_vlan_filtering(ocelot, port, - attr->u.vlan_filtering, trans); + ocelot_port_vlan_filtering(ocelot, port, attr->u.vlan_filtering); break; case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED: ocelot_port_attr_mc_set(ocelot, port, !attr->u.mc_disabled); diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h index 6fad25321dc5..315a6e5c0f59 100644 --- a/drivers/net/ethernet/rocker/rocker.h +++ b/drivers/net/ethernet/rocker/rocker.h @@ -103,15 +103,13 @@ struct rocker_world_ops { int (*port_attr_stp_state_set)(struct rocker_port *rocker_port, u8 state); int (*port_attr_bridge_flags_set)(struct rocker_port *rocker_port, - unsigned long brport_flags, - struct switchdev_trans *trans); + unsigned long brport_flags); int (*port_attr_bridge_flags_support_get)(const struct rocker_port * rocker_port, unsigned long * p_brport_flags); int (*port_attr_bridge_ageing_time_set)(struct rocker_port *rocker_port, - u32 ageing_time, - struct switchdev_trans *trans); + u32 ageing_time); int (*port_obj_vlan_add)(struct rocker_port *rocker_port, const struct switchdev_obj_port_vlan *vlan); int (*port_obj_vlan_del)(struct rocker_port *rocker_port, diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 1018d3759316..740a715c49c6 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -1550,17 +1550,13 @@ static void rocker_world_port_stop(struct rocker_port *rocker_port) } static int rocker_world_port_attr_stp_state_set(struct rocker_port *rocker_port, - u8 state, - struct switchdev_trans *trans) + u8 state) { struct rocker_world_ops *wops = rocker_port->rocker->wops; if (!wops->port_attr_stp_state_set) return -EOPNOTSUPP; - if (switchdev_trans_ph_prepare(trans)) - return 0; - return wops->port_attr_stp_state_set(rocker_port, state); } @@ -1580,8 +1576,7 @@ rocker_world_port_attr_bridge_flags_support_get(const struct rocker_port * static int rocker_world_port_attr_pre_bridge_flags_set(struct rocker_port *rocker_port, - unsigned long brport_flags, - struct switchdev_trans *trans) + unsigned long brport_flags) { struct rocker_world_ops *wops = rocker_port->rocker->wops; unsigned long brport_flags_s; @@ -1603,37 +1598,26 @@ rocker_world_port_attr_pre_bridge_flags_set(struct rocker_port *rocker_port, static int rocker_world_port_attr_bridge_flags_set(struct rocker_port *rocker_port, - unsigned long brport_flags, - struct switchdev_trans *trans) + unsigned long brport_flags) { struct rocker_world_ops *wops = rocker_port->rocker->wops; if (!wops->port_attr_bridge_flags_set) return -EOPNOTSUPP; - if (switchdev_trans_ph_prepare(trans)) - return 0; - - return wops->port_attr_bridge_flags_set(rocker_port, brport_flags, - trans); + return wops->port_attr_bridge_flags_set(rocker_port, brport_flags); } static int rocker_world_port_attr_bridge_ageing_time_set(struct rocker_port *rocker_port, - u32 ageing_time, - struct switchdev_trans *trans) - + u32 ageing_time) { struct rocker_world_ops *wops = rocker_port->rocker->wops; if (!wops->port_attr_bridge_ageing_time_set) return -EOPNOTSUPP; - if (switchdev_trans_ph_prepare(trans)) - return 0; - - return wops->port_attr_bridge_ageing_time_set(rocker_port, ageing_time, - trans); + return wops->port_attr_bridge_ageing_time_set(rocker_port, ageing_time); } static int @@ -2062,8 +2046,7 @@ static const struct net_device_ops rocker_port_netdev_ops = { ********************/ static int rocker_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { struct rocker_port *rocker_port = netdev_priv(dev); int err = 0; @@ -2071,23 +2054,19 @@ static int rocker_port_attr_set(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: err = rocker_world_port_attr_stp_state_set(rocker_port, - attr->u.stp_state, - trans); + attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: err = rocker_world_port_attr_pre_bridge_flags_set(rocker_port, - attr->u.brport_flags, - trans); + attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: err = rocker_world_port_attr_bridge_flags_set(rocker_port, - attr->u.brport_flags, - trans); + attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: err = rocker_world_port_attr_bridge_ageing_time_set(rocker_port, - attr->u.ageing_time, - trans); + attr->u.ageing_time); break; default: err = -EOPNOTSUPP; @@ -2719,8 +2698,7 @@ rocker_switchdev_port_attr_set_event(struct net_device *netdev, { int err; - err = rocker_port_attr_set(netdev, port_attr_info->attr, - port_attr_info->trans); + err = rocker_port_attr_set(netdev, port_attr_info->attr); port_attr_info->handled = true; return notifier_from_errno(err); diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index d9d5188b7627..d067da1ef070 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2488,8 +2488,7 @@ static int ofdpa_port_attr_stp_state_set(struct rocker_port *rocker_port, } static int ofdpa_port_attr_bridge_flags_set(struct rocker_port *rocker_port, - unsigned long brport_flags, - struct switchdev_trans *trans) + unsigned long brport_flags) { struct ofdpa_port *ofdpa_port = rocker_port->wpriv; unsigned long orig_flags; @@ -2497,14 +2496,11 @@ static int ofdpa_port_attr_bridge_flags_set(struct rocker_port *rocker_port, orig_flags = ofdpa_port->brport_flags; ofdpa_port->brport_flags = brport_flags; - if ((orig_flags ^ ofdpa_port->brport_flags) & BR_LEARNING && - !switchdev_trans_ph_prepare(trans)) + + if ((orig_flags ^ ofdpa_port->brport_flags) & BR_LEARNING) err = rocker_port_set_learning(ofdpa_port->rocker_port, !!(ofdpa_port->brport_flags & BR_LEARNING)); - if (switchdev_trans_ph_prepare(trans)) - ofdpa_port->brport_flags = orig_flags; - return err; } @@ -2520,18 +2516,15 @@ ofdpa_port_attr_bridge_flags_support_get(const struct rocker_port * static int ofdpa_port_attr_bridge_ageing_time_set(struct rocker_port *rocker_port, - u32 ageing_time, - struct switchdev_trans *trans) + u32 ageing_time) { struct ofdpa_port *ofdpa_port = rocker_port->wpriv; struct ofdpa *ofdpa = ofdpa_port->ofdpa; - if (!switchdev_trans_ph_prepare(trans)) { - ofdpa_port->ageing_time = clock_t_to_jiffies(ageing_time); - if (ofdpa_port->ageing_time < ofdpa->ageing_time) - ofdpa->ageing_time = ofdpa_port->ageing_time; - mod_timer(&ofdpa_port->ofdpa->fdb_cleanup_timer, jiffies); - } + ofdpa_port->ageing_time = clock_t_to_jiffies(ageing_time); + if (ofdpa_port->ageing_time < ofdpa->ageing_time) + ofdpa->ageing_time = ofdpa_port->ageing_time; + mod_timer(&ofdpa_port->ofdpa->fdb_cleanup_timer, jiffies); return 0; } diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index 3232f483c068..9967cf985728 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -24,16 +24,12 @@ struct cpsw_switchdev_event_work { unsigned long event; }; -static int cpsw_port_stp_state_set(struct cpsw_priv *priv, - struct switchdev_trans *trans, u8 state) +static int cpsw_port_stp_state_set(struct cpsw_priv *priv, u8 state) { struct cpsw_common *cpsw = priv->cpsw; u8 cpsw_state; int ret = 0; - if (switchdev_trans_ph_prepare(trans)) - return 0; - switch (state) { case BR_STATE_FORWARDING: cpsw_state = ALE_PORT_STATE_FORWARD; @@ -60,16 +56,12 @@ static int cpsw_port_stp_state_set(struct cpsw_priv *priv, } static int cpsw_port_attr_br_flags_set(struct cpsw_priv *priv, - struct switchdev_trans *trans, struct net_device *orig_dev, unsigned long brport_flags) { struct cpsw_common *cpsw = priv->cpsw; bool unreg_mcast_add = false; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (brport_flags & BR_MCAST_FLOOD) unreg_mcast_add = true; dev_dbg(priv->dev, "BR_MCAST_FLOOD: %d port %u\n", @@ -82,7 +74,6 @@ static int cpsw_port_attr_br_flags_set(struct cpsw_priv *priv, } static int cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, - struct switchdev_trans *trans, unsigned long flags) { if (flags & ~(BR_LEARNING | BR_MCAST_FLOOD)) @@ -92,8 +83,7 @@ static int cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, } static int cpsw_port_attr_set(struct net_device *ndev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { struct cpsw_priv *priv = netdev_priv(ndev); int ret; @@ -102,15 +92,15 @@ static int cpsw_port_attr_set(struct net_device *ndev, switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: - ret = cpsw_port_attr_br_flags_pre_set(ndev, trans, + ret = cpsw_port_attr_br_flags_pre_set(ndev, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - ret = cpsw_port_stp_state_set(priv, trans, attr->u.stp_state); + ret = cpsw_port_stp_state_set(priv, attr->u.stp_state); dev_dbg(priv->dev, "stp state: %u\n", attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - ret = cpsw_port_attr_br_flags_set(priv, trans, attr->orig_dev, + ret = cpsw_port_attr_br_flags_set(priv, attr->orig_dev, attr->u.brport_flags); break; default: diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index 197dea9c3b42..ca3d07fe7f58 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -901,19 +901,14 @@ static void dpaa2_switch_teardown_irqs(struct fsl_mc_device *sw_dev) } static int dpaa2_switch_port_attr_stp_state_set(struct net_device *netdev, - struct switchdev_trans *trans, u8 state) { struct ethsw_port_priv *port_priv = netdev_priv(netdev); - if (switchdev_trans_ph_prepare(trans)) - return 0; - return dpaa2_switch_port_set_stp_state(port_priv, state); } static int dpaa2_switch_port_attr_br_flags_pre_set(struct net_device *netdev, - struct switchdev_trans *trans, unsigned long flags) { if (flags & ~(BR_LEARNING | BR_FLOOD)) @@ -923,15 +918,11 @@ static int dpaa2_switch_port_attr_br_flags_pre_set(struct net_device *netdev, } static int dpaa2_switch_port_attr_br_flags_set(struct net_device *netdev, - struct switchdev_trans *trans, unsigned long flags) { struct ethsw_port_priv *port_priv = netdev_priv(netdev); int err = 0; - if (switchdev_trans_ph_prepare(trans)) - return 0; - /* Learning is enabled per switch */ err = dpaa2_switch_set_learning(port_priv->ethsw_data, !!(flags & BR_LEARNING)); @@ -945,22 +936,21 @@ exit: } static int dpaa2_switch_port_attr_set(struct net_device *netdev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { int err = 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - err = dpaa2_switch_port_attr_stp_state_set(netdev, trans, + err = dpaa2_switch_port_attr_stp_state_set(netdev, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: - err = dpaa2_switch_port_attr_br_flags_pre_set(netdev, trans, + err = dpaa2_switch_port_attr_br_flags_pre_set(netdev, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - err = dpaa2_switch_port_attr_br_flags_set(netdev, trans, + err = dpaa2_switch_port_attr_br_flags_set(netdev, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: @@ -1197,8 +1187,7 @@ static int dpaa2_switch_port_attr_set_event(struct net_device *netdev, { int err; - err = dpaa2_switch_port_attr_set(netdev, port_attr_info->attr, - port_attr_info->trans); + err = dpaa2_switch_port_attr_set(netdev, port_attr_info->attr); port_attr_info->handled = true; return notifier_from_errno(err); diff --git a/include/net/dsa.h b/include/net/dsa.h index 9a0cab5fb7c4..d0f043091969 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -565,8 +565,7 @@ struct dsa_switch_ops { * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, - bool vlan_filtering, - struct switchdev_trans *trans); + bool vlan_filtering); int (*port_vlan_prepare)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void (*port_vlan_add)(struct dsa_switch *ds, int port, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index cbe6e35d51f5..f873e2c5e125 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -239,7 +239,6 @@ struct switchdev_notifier_port_obj_info { struct switchdev_notifier_port_attr_info { struct switchdev_notifier_info info; /* must be first */ const struct switchdev_attr *attr; - struct switchdev_trans *trans; bool handled; }; @@ -298,8 +297,7 @@ int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans)); + const struct switchdev_attr *attr)); #else static inline void switchdev_deferred_process(void) @@ -390,8 +388,7 @@ switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans)) + const struct switchdev_attr *attr)) { return 0; } diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2f4cd3288bcc..f3db75c0172b 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -739,8 +739,7 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev); -int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled, - struct switchdev_trans *trans); +int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state); int ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9eaa85cc87ea..3e6063bf3f17 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -128,19 +128,16 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, } /* port.c */ -int dsa_port_set_state(struct dsa_port *dp, u8 state, - struct switchdev_trans *trans); +int dsa_port_set_state(struct dsa_port *dp, u8 state); int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, - struct switchdev_trans *trans); +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering); bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); -int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, - struct switchdev_trans *trans); +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, bool propagate_upstream); int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, @@ -152,12 +149,9 @@ int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans); -int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans); -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct switchdev_trans *trans); +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags); +int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags); +int dsa_port_mrouter(struct dsa_port *dp, bool mrouter); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); int dsa_port_vlan_del(struct dsa_port *dp, diff --git a/net/dsa/port.c b/net/dsa/port.c index 6668fe188f47..14bf0053ae01 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -40,17 +40,15 @@ static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) return notifier_to_errno(err); } -int dsa_port_set_state(struct dsa_port *dp, u8 state, - struct switchdev_trans *trans) +int dsa_port_set_state(struct dsa_port *dp, u8 state) { struct dsa_switch *ds = dp->ds; int port = dp->index; - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; + if (!ds->ops->port_stp_state_set) + return -EOPNOTSUPP; - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, port, state); + ds->ops->port_stp_state_set(ds, port, state); if (ds->ops->port_fast_age) { /* Fast age FDB entries or flush appropriate forwarding database @@ -75,7 +73,7 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) { int err; - err = dsa_port_set_state(dp, state, NULL); + err = dsa_port_set_state(dp, state); if (err) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } @@ -145,7 +143,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) int err; /* Set the flooding mode before joining the port in the switch */ - err = dsa_port_bridge_flags(dp, BR_FLOOD | BR_MCAST_FLOOD, NULL); + err = dsa_port_bridge_flags(dp, BR_FLOOD | BR_MCAST_FLOOD); if (err) return err; @@ -158,7 +156,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) /* The bridging is rolled back on error */ if (err) { - dsa_port_bridge_flags(dp, 0, NULL); + dsa_port_bridge_flags(dp, 0); dp->bridge_dev = NULL; } @@ -185,7 +183,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); /* Port is leaving the bridge, disable flooding */ - dsa_port_bridge_flags(dp, 0, NULL); + dsa_port_bridge_flags(dp, 0); /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional @@ -259,43 +257,36 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, return true; } -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, - struct switchdev_trans *trans) +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) { struct dsa_switch *ds = dp->ds; + bool apply; int err; - if (switchdev_trans_ph_prepare(trans)) { - bool apply; - - if (!ds->ops->port_vlan_filtering) - return -EOPNOTSUPP; + if (!ds->ops->port_vlan_filtering) + return -EOPNOTSUPP; - /* We are called from dsa_slave_switchdev_blocking_event(), - * which is not under rcu_read_lock(), unlike - * dsa_slave_switchdev_event(). - */ - rcu_read_lock(); - apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering); - rcu_read_unlock(); - if (!apply) - return -EINVAL; - } + /* We are called from dsa_slave_switchdev_blocking_event(), + * which is not under rcu_read_lock(), unlike + * dsa_slave_switchdev_event(). + */ + rcu_read_lock(); + apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering); + rcu_read_unlock(); + if (!apply) + return -EINVAL; if (dsa_port_is_vlan_filtering(dp) == vlan_filtering) return 0; - err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering, - trans); + err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering); if (err) return err; - if (switchdev_trans_ph_commit(trans)) { - if (ds->vlan_filtering_is_global) - ds->vlan_filtering = vlan_filtering; - else - dp->vlan_filtering = vlan_filtering; - } + if (ds->vlan_filtering_is_global) + ds->vlan_filtering = vlan_filtering; + else + dp->vlan_filtering = vlan_filtering; return 0; } @@ -314,26 +305,29 @@ bool dsa_port_skip_vlan_configuration(struct dsa_port *dp) !br_vlan_enabled(dp->bridge_dev)); } -int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, - struct switchdev_trans *trans) +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock) { unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock); unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); - struct dsa_notifier_ageing_time_info info = { - .ageing_time = ageing_time, - .trans = trans, - }; + struct dsa_notifier_ageing_time_info info; + struct switchdev_trans trans; + int err; - if (switchdev_trans_ph_prepare(trans)) - return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); + info.ageing_time = ageing_time; + info.trans = &trans; + + trans.ph_prepare = true; + err = dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); + if (err) + return err; dp->ageing_time = ageing_time; + trans.ph_prepare = false; return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info); } -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans) +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags) { struct dsa_switch *ds = dp->ds; @@ -344,16 +338,12 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags, return 0; } -int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, - struct switchdev_trans *trans) +int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags) { struct dsa_switch *ds = dp->ds; int port = dp->index; int err = 0; - if (switchdev_trans_ph_prepare(trans)) - return 0; - if (ds->ops->port_egress_floods) err = ds->ops->port_egress_floods(ds, port, flags & BR_FLOOD, flags & BR_MCAST_FLOOD); @@ -361,14 +351,13 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags, return err; } -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct switchdev_trans *trans) +int dsa_port_mrouter(struct dsa_port *dp, bool mrouter) { struct dsa_switch *ds = dp->ds; int port = dp->index; - if (switchdev_trans_ph_prepare(trans)) - return ds->ops->port_egress_floods ? 0 : -EOPNOTSUPP; + if (!ds->ops->port_egress_floods) + return -EOPNOTSUPP; return ds->ops->port_egress_floods(ds, port, true, mrouter); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0eefb50aae8c..7ba505825db2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -268,32 +268,29 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } static int dsa_slave_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { struct dsa_port *dp = dsa_slave_to_port(dev); int ret; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - ret = dsa_port_set_state(dp, attr->u.stp_state, trans); + ret = dsa_port_set_state(dp, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, - trans); + ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: - ret = dsa_port_ageing_time(dp, attr->u.ageing_time, trans); + ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: - ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, - trans); + ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, trans); + ret = dsa_port_bridge_flags(dp, attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: - ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, trans); + ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter); break; default: ret = -EOPNOTSUPP; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5b0bf29e1375..17979956d756 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -139,17 +139,8 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, } } if (unset_vlan_filtering) { - struct switchdev_trans trans; - - trans.ph_prepare = true; - err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false, &trans); - if (err && err != EOPNOTSUPP) - return err; - - trans.ph_prepare = false; err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false, &trans); + false); if (err && err != EOPNOTSUPP) return err; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 3509d362056d..855a10feef3d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -100,15 +100,13 @@ static int switchdev_deferred_enqueue(struct net_device *dev, static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans) + const struct switchdev_attr *attr) { int err; int rc; struct switchdev_notifier_port_attr_info attr_info = { .attr = attr, - .trans = trans, .handled = false, }; @@ -129,34 +127,7 @@ static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, static int switchdev_port_attr_set_now(struct net_device *dev, const struct switchdev_attr *attr) { - struct switchdev_trans trans; - int err; - - /* Phase I: prepare for attr set. Driver/device should fail - * here if there are going to be issues in the commit phase, - * such as lack of resources or support. The driver/device - * should reserve resources needed for the commit phase here, - * but should not commit the attr. - */ - - trans.ph_prepare = true; - err = switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, - &trans); - if (err) - return err; - - /* Phase II: commit attr set. This cannot fail as a fault - * of driver/device. If it does, it's a bug in the driver/device - * because the driver said everythings was OK in phase I. - */ - - trans.ph_prepare = false; - err = switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, - &trans); - WARN(err, "%s: Commit of attribute (id=%d) failed.\n", - dev->name, attr->id); - - return err; + return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr); } static void switchdev_port_attr_set_deferred(struct net_device *dev, @@ -186,10 +157,6 @@ static int switchdev_port_attr_set_defer(struct net_device *dev, * @dev: port device * @attr: attribute to set * - * Use a 2-phase prepare-commit transaction model to ensure - * system is not left in a partially updated state due to - * failure from driver/device. - * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ @@ -519,8 +486,7 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans)) + const struct switchdev_attr *attr)) { struct net_device *lower_dev; struct list_head *iter; @@ -528,8 +494,7 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, if (check_cb(dev)) { port_attr_info->handled = true; - return set_cb(dev, port_attr_info->attr, - port_attr_info->trans); + return set_cb(dev, port_attr_info->attr); } /* Switch ports might be stacked under e.g. a LAG. Ignore the @@ -556,8 +521,7 @@ int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr, - struct switchdev_trans *trans)) + const struct switchdev_attr *attr)) { int err; -- cgit v1.2.3 From a52b2da778fc93e01c4e5a744953f2ba1ec01c00 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 9 Jan 2021 02:01:52 +0200 Subject: net: dsa: remove the transactional logic from MDB entries For many drivers, the .port_mdb_prepare callback was not a good opportunity to avoid any error condition, and they would suppress errors found during the actual commit phase. Where a logical separation between the prepare and the commit phase existed, the function that used to implement the .port_mdb_prepare callback still exists, but now it is called directly from .port_mdb_add, which was modified to return an int code. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Acked-by: Linus Walleij Acked-by: Jiri Pirko Reviewed-by: Kurt Kanzenbach # hellcreek Reviewed-by: Linus Wallei # RTL8366 Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 19 +++---------------- drivers/net/dsa/b53/b53_priv.h | 6 ++---- drivers/net/dsa/bcm_sf2.c | 1 - drivers/net/dsa/lan9303-core.c | 12 ++++++++---- drivers/net/dsa/microchip/ksz8795.c | 1 - drivers/net/dsa/microchip/ksz9477.c | 14 +++++++++----- drivers/net/dsa/microchip/ksz_common.c | 16 +++++----------- drivers/net/dsa/microchip/ksz_common.h | 6 ++---- drivers/net/dsa/mv88e6xxx/chip.c | 24 +++++++----------------- drivers/net/dsa/ocelot/felix.c | 14 +++----------- drivers/net/dsa/sja1105/sja1105_main.c | 14 +++----------- include/net/dsa.h | 4 +--- net/dsa/switch.c | 15 ++++++--------- 13 files changed, 49 insertions(+), 97 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index f07f547db53e..d5f2798968ed 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1741,8 +1741,8 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_fdb_dump); -int b53_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +int b53_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { struct b53_device *priv = ds->priv; @@ -1752,19 +1752,7 @@ int b53_mdb_prepare(struct dsa_switch *ds, int port, if (is5325(priv) || is5365(priv)) return -EOPNOTSUPP; - return 0; -} -EXPORT_SYMBOL(b53_mdb_prepare); - -void b53_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) -{ - struct b53_device *priv = ds->priv; - int ret; - - ret = b53_arl_op(priv, 0, port, mdb->addr, mdb->vid, true); - if (ret) - dev_err(ds->dev, "failed to add MDB entry\n"); + return b53_arl_op(priv, 0, port, mdb->addr, mdb->vid, true); } EXPORT_SYMBOL(b53_mdb_add); @@ -2205,7 +2193,6 @@ static const struct dsa_switch_ops b53_switch_ops = { .port_fdb_del = b53_fdb_del, .port_mirror_add = b53_mirror_add, .port_mirror_del = b53_mirror_del, - .port_mdb_prepare = b53_mdb_prepare, .port_mdb_add = b53_mdb_add, .port_mdb_del = b53_mdb_del, .port_max_mtu = b53_get_max_mtu, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 9ab0fb409f78..2a4a54df3a31 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -361,10 +361,8 @@ int b53_fdb_del(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int b53_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); -int b53_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); -void b53_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); +int b53_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb); int b53_mdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); int b53_mirror_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 65c8a044f222..12ff84109f13 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1126,7 +1126,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_rxnfc = bcm_sf2_set_rxnfc, .port_mirror_add = b53_mirror_add, .port_mirror_del = b53_mirror_del, - .port_mdb_prepare = b53_mdb_prepare, .port_mdb_add = b53_mdb_add, .port_mdb_del = b53_mdb_del, }; diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index aa1142d6a9f5..344374025426 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -1232,14 +1232,19 @@ static int lan9303_port_mdb_prepare(struct dsa_switch *ds, int port, return 0; } -static void lan9303_port_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int lan9303_port_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { struct lan9303 *chip = ds->priv; + int err; + + err = lan9303_port_mdb_prepare(ds, port, mdb); + if (err) + return err; dev_dbg(chip->dev, "%s(%d, %pM, %d)\n", __func__, port, mdb->addr, mdb->vid); - lan9303_alr_add_port(chip, mdb->addr, port, false); + return lan9303_alr_add_port(chip, mdb->addr, port, false); } static int lan9303_port_mdb_del(struct dsa_switch *ds, int port, @@ -1274,7 +1279,6 @@ static const struct dsa_switch_ops lan9303_switch_ops = { .port_fdb_add = lan9303_port_fdb_add, .port_fdb_del = lan9303_port_fdb_del, .port_fdb_dump = lan9303_port_fdb_dump, - .port_mdb_prepare = lan9303_port_mdb_prepare, .port_mdb_add = lan9303_port_mdb_add, .port_mdb_del = lan9303_port_mdb_del, }; diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 6a9ec8a0f92f..89e1c01cf5b8 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1114,7 +1114,6 @@ static const struct dsa_switch_ops ksz8795_switch_ops = { .port_vlan_add = ksz8795_port_vlan_add, .port_vlan_del = ksz8795_port_vlan_del, .port_fdb_dump = ksz_port_fdb_dump, - .port_mdb_prepare = ksz_port_mdb_prepare, .port_mdb_add = ksz_port_mdb_add, .port_mdb_del = ksz_port_mdb_del, .port_mirror_add = ksz8795_port_mirror_add, diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 446c088ce5c5..08bf54eb9f5f 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -774,14 +774,15 @@ exit: return ret; } -static void ksz9477_port_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int ksz9477_port_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { struct ksz_device *dev = ds->priv; u32 static_table[4]; u32 data; int index; u32 mac_hi, mac_lo; + int err = 0; mac_hi = ((mdb->addr[0] << 8) | mdb->addr[1]); mac_lo = ((mdb->addr[2] << 24) | (mdb->addr[3] << 16)); @@ -796,7 +797,8 @@ static void ksz9477_port_mdb_add(struct dsa_switch *ds, int port, ksz_write32(dev, REG_SW_ALU_STAT_CTRL__4, data); /* wait to be finished */ - if (ksz9477_wait_alu_sta_ready(dev)) { + err = ksz9477_wait_alu_sta_ready(dev); + if (err) { dev_dbg(dev->dev, "Failed to read ALU STATIC\n"); goto exit; } @@ -819,8 +821,10 @@ static void ksz9477_port_mdb_add(struct dsa_switch *ds, int port, } /* no available entry */ - if (index == dev->num_statics) + if (index == dev->num_statics) { + err = -ENOSPC; goto exit; + } /* add entry */ static_table[0] = ALU_V_STATIC_VALID; @@ -842,6 +846,7 @@ static void ksz9477_port_mdb_add(struct dsa_switch *ds, int port, exit: mutex_unlock(&dev->alu_mutex); + return err; } static int ksz9477_port_mdb_del(struct dsa_switch *ds, int port, @@ -1395,7 +1400,6 @@ static const struct dsa_switch_ops ksz9477_switch_ops = { .port_fdb_dump = ksz9477_port_fdb_dump, .port_fdb_add = ksz9477_port_fdb_add, .port_fdb_del = ksz9477_port_fdb_del, - .port_mdb_prepare = ksz_port_mdb_prepare, .port_mdb_add = ksz9477_port_mdb_add, .port_mdb_del = ksz9477_port_mdb_del, .port_mirror_add = ksz9477_port_mirror_add, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index cf743133b0b9..f2c9ff3ea4be 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -253,16 +253,8 @@ int ksz_port_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, } EXPORT_SYMBOL_GPL(ksz_port_fdb_dump); -int ksz_port_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) -{ - /* nothing to do */ - return 0; -} -EXPORT_SYMBOL_GPL(ksz_port_mdb_prepare); - -void ksz_port_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +int ksz_port_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { struct ksz_device *dev = ds->priv; struct alu_struct alu; @@ -284,7 +276,7 @@ void ksz_port_mdb_add(struct dsa_switch *ds, int port, /* no available entry */ if (index == dev->num_statics && !empty) - return; + return -ENOSPC; /* add entry */ if (index == dev->num_statics) { @@ -301,6 +293,8 @@ void ksz_port_mdb_add(struct dsa_switch *ds, int port, alu.fid = mdb->vid; } dev->dev_ops->w_sta_mac_table(dev, index, &alu); + + return 0; } EXPORT_SYMBOL_GPL(ksz_port_mdb_add); diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index 720f22275c84..a1f0929d45a0 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -165,10 +165,8 @@ int ksz_port_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int ksz_port_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); -int ksz_port_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); -void ksz_port_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); +int ksz_port_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb); int ksz_port_mdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); int ksz_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index bbf1a71ce55c..e9c517c0f89c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5238,27 +5238,18 @@ static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds, return chip->info->tag_protocol; } -static int mv88e6xxx_port_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) -{ - /* We don't need any dynamic resource from the kernel (yet), - * so skip the prepare phase. - */ - - return 0; -} - -static void mv88e6xxx_port_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int mv88e6xxx_port_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { struct mv88e6xxx_chip *chip = ds->priv; + int err; mv88e6xxx_reg_lock(chip); - if (mv88e6xxx_port_db_load_purge(chip, port, mdb->addr, mdb->vid, - MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC)) - dev_err(ds->dev, "p%d: failed to load multicast MAC address\n", - port); + err = mv88e6xxx_port_db_load_purge(chip, port, mdb->addr, mdb->vid, + MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC); mv88e6xxx_reg_unlock(chip); + + return err; } static int mv88e6xxx_port_mdb_del(struct dsa_switch *ds, int port, @@ -5403,7 +5394,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .port_fdb_add = mv88e6xxx_port_fdb_add, .port_fdb_del = mv88e6xxx_port_fdb_del, .port_fdb_dump = mv88e6xxx_port_fdb_dump, - .port_mdb_prepare = mv88e6xxx_port_mdb_prepare, .port_mdb_add = mv88e6xxx_port_mdb_add, .port_mdb_del = mv88e6xxx_port_mdb_del, .port_mirror_add = mv88e6xxx_port_mirror_add, diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 8c4cd9168dba..2825dd11feee 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -65,19 +65,12 @@ static int felix_fdb_del(struct dsa_switch *ds, int port, return ocelot_fdb_del(ocelot, port, addr, vid); } -/* This callback needs to be present */ -static int felix_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) -{ - return 0; -} - -static void felix_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int felix_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { struct ocelot *ocelot = ds->priv; - ocelot_port_mdb_add(ocelot, port, mdb); + return ocelot_port_mdb_add(ocelot, port, mdb); } static int felix_mdb_del(struct dsa_switch *ds, int port, @@ -772,7 +765,6 @@ const struct dsa_switch_ops felix_switch_ops = { .port_fdb_dump = felix_fdb_dump, .port_fdb_add = felix_fdb_add, .port_fdb_del = felix_fdb_del, - .port_mdb_prepare = felix_mdb_prepare, .port_mdb_add = felix_mdb_add, .port_mdb_del = felix_mdb_del, .port_bridge_join = felix_bridge_join, diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index c1b7f2b0e40c..be200d4289af 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1524,17 +1524,10 @@ static int sja1105_fdb_dump(struct dsa_switch *ds, int port, return 0; } -/* This callback needs to be present */ -static int sja1105_mdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) -{ - return 0; -} - -static void sja1105_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int sja1105_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) { - sja1105_fdb_add(ds, port, mdb->addr, mdb->vid); + return sja1105_fdb_add(ds, port, mdb->addr, mdb->vid); } static int sja1105_mdb_del(struct dsa_switch *ds, int port, @@ -3288,7 +3281,6 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .port_vlan_filtering = sja1105_vlan_filtering, .port_vlan_add = sja1105_vlan_add, .port_vlan_del = sja1105_vlan_del, - .port_mdb_prepare = sja1105_mdb_prepare, .port_mdb_add = sja1105_mdb_add, .port_mdb_del = sja1105_mdb_del, .port_hwtstamp_get = sja1105_hwtstamp_get, diff --git a/include/net/dsa.h b/include/net/dsa.h index d0f043091969..b8c0550dfa74 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -585,10 +585,8 @@ struct dsa_switch_ops { /* * Multicast database */ - int (*port_mdb_prepare)(struct dsa_switch *ds, int port, + int (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); - void (*port_mdb_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); /* diff --git a/net/dsa/switch.c b/net/dsa/switch.c index c6b3ac93bcc7..5f5e19c5e43a 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -181,24 +181,21 @@ static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, static int dsa_switch_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { - int port, err; + int err = 0; + int port; - if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_mdb_match(ds, port, info)) { - err = ds->ops->port_mdb_prepare(ds, port, info->mdb); + err = ds->ops->port_mdb_add(ds, port, info->mdb); if (err) - return err; + break; } } - for (port = 0; port < ds->num_ports; port++) - if (dsa_switch_mdb_match(ds, port, info)) - ds->ops->port_mdb_add(ds, port, info->mdb); - - return 0; + return err; } static int dsa_switch_mdb_del(struct dsa_switch *ds, -- cgit v1.2.3 From 1958d5815c91353960df2198a74f5ca15785ed28 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 9 Jan 2021 02:01:53 +0200 Subject: net: dsa: remove the transactional logic from VLAN objects It should be the driver's business to logically separate its VLAN offloading into a preparation and a commit phase, and some drivers don't need / can't do this. So remove the transactional shim from DSA and let drivers propagate errors directly from the .port_vlan_add callback. It would appear that the code has worse error handling now than it had before. DSA is the only in-kernel user of switchdev that offloads one switchdev object to more than one port: for every VLAN object offloaded to a user port, that VLAN is also offloaded to the CPU port. So the "prepare for user port -> check for errors -> prepare for CPU port -> check for errors -> commit for user port -> commit for CPU port" sequence appears to make more sense than the one we are using now: "offload to user port -> check for errors -> offload to CPU port -> check for errors", but it is really a compromise. In the new way, we can catch errors from the commit phase that we previously had to ignore. But we have our hands tied and cannot do any rollback now: if we add a VLAN on the CPU port and it fails, we can't do the rollback by simply deleting it from the user port, because the switchdev API is not so nice with us: it could have simply been there already, even with the same flags. So we don't even attempt to rollback anything on addition error, just leave whatever VLANs managed to get offloaded right where they are. This should not be a problem at all in practice. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Acked-by: Linus Walleij Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 17 ++++++++----- drivers/net/dsa/b53/b53_priv.h | 6 ++--- drivers/net/dsa/bcm_sf2.c | 1 - drivers/net/dsa/bcm_sf2_cfp.c | 7 ++---- drivers/net/dsa/dsa_loop.c | 28 ++++++---------------- drivers/net/dsa/hirschmann/hellcreek.c | 12 +++++++--- drivers/net/dsa/lantiq_gswip.c | 15 ++++++++---- drivers/net/dsa/microchip/ksz8795.c | 7 +++--- drivers/net/dsa/microchip/ksz9477.c | 18 ++++++++------ drivers/net/dsa/microchip/ksz_common.c | 9 ------- drivers/net/dsa/microchip/ksz_common.h | 2 -- drivers/net/dsa/mt7530.c | 14 +++-------- drivers/net/dsa/mv88e6xxx/chip.c | 34 ++++++++++++++++---------- drivers/net/dsa/ocelot/felix.c | 16 ++++++++----- drivers/net/dsa/qca8k.c | 14 ++++------- drivers/net/dsa/realtek-smi-core.h | 6 ++--- drivers/net/dsa/rtl8366.c | 44 ++++++++++++++-------------------- drivers/net/dsa/rtl8366rb.c | 1 - drivers/net/dsa/sja1105/sja1105_main.c | 43 ++++++++++++--------------------- include/net/dsa.h | 4 +--- net/dsa/switch.c | 8 ++----- 21 files changed, 133 insertions(+), 173 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index d5f2798968ed..5d6d9f91d40a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1384,8 +1384,8 @@ int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) } EXPORT_SYMBOL(b53_vlan_filtering); -int b53_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int b53_vlan_prepare(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct b53_device *dev = ds->priv; @@ -1407,15 +1407,19 @@ int b53_vlan_prepare(struct dsa_switch *ds, int port, return 0; } -EXPORT_SYMBOL(b53_vlan_prepare); -void b53_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +int b53_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct b53_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct b53_vlan *vl; + int err; + + err = b53_vlan_prepare(ds, port, vlan); + if (err) + return err; vl = &dev->vlans[vlan->vid]; @@ -1438,6 +1442,8 @@ void b53_vlan_add(struct dsa_switch *ds, int port, vlan->vid); b53_fast_age_vlan(dev, vlan->vid); } + + return 0; } EXPORT_SYMBOL(b53_vlan_add); @@ -2185,7 +2191,6 @@ static const struct dsa_switch_ops b53_switch_ops = { .port_fast_age = b53_br_fast_age, .port_egress_floods = b53_br_egress_floods, .port_vlan_filtering = b53_vlan_filtering, - .port_vlan_prepare = b53_vlan_prepare, .port_vlan_add = b53_vlan_add, .port_vlan_del = b53_vlan_del, .port_fdb_dump = b53_fdb_dump, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 2a4a54df3a31..0d2cc0453bef 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -349,10 +349,8 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, int speed, int duplex, bool tx_pause, bool rx_pause); int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); -int b53_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); -void b53_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); +int b53_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan); int b53_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int b53_fdb_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 12ff84109f13..d53485c79d77 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1116,7 +1116,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .port_stp_state_set = b53_br_set_stp_state, .port_fast_age = b53_br_fast_age, .port_vlan_filtering = b53_vlan_filtering, - .port_vlan_prepare = b53_vlan_prepare, .port_vlan_add = b53_vlan_add, .port_vlan_del = b53_vlan_del, .port_fdb_dump = b53_fdb_dump, diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 59d799ac1b60..ed45d16250e1 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -891,11 +891,9 @@ static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port, else vlan.flags = 0; - ret = ds->ops->port_vlan_prepare(ds, port_num, &vlan); + ret = ds->ops->port_vlan_add(ds, port_num, &vlan); if (ret) return ret; - - ds->ops->port_vlan_add(ds, port_num, &vlan); } /* @@ -941,8 +939,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, return -EINVAL; if ((fs->flow_type & FLOW_EXT) && - !(ds->ops->port_vlan_prepare || ds->ops->port_vlan_add || - ds->ops->port_vlan_del)) + !(ds->ops->port_vlan_add || ds->ops->port_vlan_del)) return -EOPNOTSUPP; if (fs->location != RX_CLS_LOC_ANY && diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 8e3d623f4dbd..be61ce93a377 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -198,26 +198,8 @@ static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port, return 0; } -static int -dsa_loop_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - struct dsa_loop_priv *ps = ds->priv; - struct mii_bus *bus = ps->bus; - - dev_dbg(ds->dev, "%s: port: %d, vlan: %d", __func__, port, vlan->vid); - - /* Just do a sleeping operation to make lockdep checks effective */ - mdiobus_read(bus, ps->port_base + port, MII_BMSR); - - if (vlan->vid > ARRAY_SIZE(ps->vlans)) - return -ERANGE; - - return 0; -} - -static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; @@ -225,6 +207,9 @@ static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, struct mii_bus *bus = ps->bus; struct dsa_loop_vlan *vl; + if (vlan->vid > ARRAY_SIZE(ps->vlans)) + return -ERANGE; + /* Just do a sleeping operation to make lockdep checks effective */ mdiobus_read(bus, ps->port_base + port, MII_BMSR); @@ -241,6 +226,8 @@ static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, if (pvid) ps->ports[port].pvid = vlan->vid; + + return 0; } static int dsa_loop_port_vlan_del(struct dsa_switch *ds, int port, @@ -300,7 +287,6 @@ static const struct dsa_switch_ops dsa_loop_driver = { .port_bridge_leave = dsa_loop_port_bridge_leave, .port_stp_state_set = dsa_loop_port_stp_state_set, .port_vlan_filtering = dsa_loop_port_vlan_filtering, - .port_vlan_prepare = dsa_loop_port_vlan_prepare, .port_vlan_add = dsa_loop_port_vlan_add, .port_vlan_del = dsa_loop_port_vlan_del, .port_change_mtu = dsa_loop_port_change_mtu, diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 4e1dbf91720c..205249504289 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -438,18 +438,25 @@ static void hellcreek_unapply_vlan(struct hellcreek *hellcreek, int port, mutex_unlock(&hellcreek->reg_lock); } -static void hellcreek_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int hellcreek_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct hellcreek *hellcreek = ds->priv; + int err; + + err = hellcreek_vlan_prepare(ds, port, vlan); + if (err) + return err; dev_dbg(hellcreek->dev, "Add VLAN %d on port %d, %s, %s\n", vlan->vid, port, untagged ? "untagged" : "tagged", pvid ? "PVID" : "no PVID"); hellcreek_apply_vlan(hellcreek, port, vlan->vid, pvid, untagged); + + return 0; } static int hellcreek_vlan_del(struct dsa_switch *ds, int port, @@ -1146,7 +1153,6 @@ static const struct dsa_switch_ops hellcreek_ds_ops = { .port_vlan_add = hellcreek_vlan_add, .port_vlan_del = hellcreek_vlan_del, .port_vlan_filtering = hellcreek_vlan_filtering, - .port_vlan_prepare = hellcreek_vlan_prepare, .setup = hellcreek_setup, }; diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 1dddca92d17e..244729ee3bdb 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1167,13 +1167,18 @@ static int gswip_port_vlan_prepare(struct dsa_switch *ds, int port, return 0; } -static void gswip_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int gswip_port_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + int err; + + err = gswip_port_vlan_prepare(ds, port, vlan); + if (err) + return err; /* We have to receive all packets on the CPU port and should not * do any VLAN filtering here. This is also called with bridge @@ -1181,9 +1186,10 @@ static void gswip_port_vlan_add(struct dsa_switch *ds, int port, * this. */ if (dsa_is_cpu_port(ds, port)) - return; + return 0; - gswip_vlan_add_aware(priv, bridge, port, vlan->vid, untagged, pvid); + return gswip_vlan_add_aware(priv, bridge, port, vlan->vid, + untagged, pvid); } static int gswip_port_vlan_del(struct dsa_switch *ds, int port, @@ -1580,7 +1586,6 @@ static const struct dsa_switch_ops gswip_switch_ops = { .port_bridge_leave = gswip_port_bridge_leave, .port_fast_age = gswip_port_fast_age, .port_vlan_filtering = gswip_port_vlan_filtering, - .port_vlan_prepare = gswip_port_vlan_prepare, .port_vlan_add = gswip_port_vlan_add, .port_vlan_del = gswip_port_vlan_del, .port_stp_state_set = gswip_port_stp_state_set, diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 89e1c01cf5b8..d639f9476bd9 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -792,8 +792,8 @@ static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port, return 0; } -static void ksz8795_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int ksz8795_port_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; @@ -828,6 +828,8 @@ static void ksz8795_port_vlan_add(struct dsa_switch *ds, int port, vid |= new_pvid; ksz_pwrite16(dev, port, REG_PORT_CTRL_VID, vid); } + + return 0; } static int ksz8795_port_vlan_del(struct dsa_switch *ds, int port, @@ -1110,7 +1112,6 @@ static const struct dsa_switch_ops ksz8795_switch_ops = { .port_stp_state_set = ksz8795_port_stp_state_set, .port_fast_age = ksz_port_fast_age, .port_vlan_filtering = ksz8795_port_vlan_filtering, - .port_vlan_prepare = ksz_port_vlan_prepare, .port_vlan_add = ksz8795_port_vlan_add, .port_vlan_del = ksz8795_port_vlan_del, .port_fdb_dump = ksz_port_fdb_dump, diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 08bf54eb9f5f..71cf24f20252 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -510,16 +510,18 @@ static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port, return 0; } -static void ksz9477_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int ksz9477_port_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct ksz_device *dev = ds->priv; u32 vlan_table[3]; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + int err; - if (ksz9477_get_vlan_table(dev, vlan->vid, vlan_table)) { + err = ksz9477_get_vlan_table(dev, vlan->vid, vlan_table); + if (err) { dev_dbg(dev->dev, "Failed to get vlan table\n"); - return; + return err; } vlan_table[0] = VLAN_VALID | (vlan->vid & VLAN_FID_M); @@ -531,14 +533,17 @@ static void ksz9477_port_vlan_add(struct dsa_switch *ds, int port, vlan_table[2] |= BIT(port) | BIT(dev->cpu_port); - if (ksz9477_set_vlan_table(dev, vlan->vid, vlan_table)) { + err = ksz9477_set_vlan_table(dev, vlan->vid, vlan_table); + if (err) { dev_dbg(dev->dev, "Failed to set vlan table\n"); - return; + return err; } /* change PVID */ if (vlan->flags & BRIDGE_VLAN_INFO_PVID) ksz_pwrite16(dev, port, REG_PORT_DEFAULT_VID, vlan->vid); + + return 0; } static int ksz9477_port_vlan_del(struct dsa_switch *ds, int port, @@ -1394,7 +1399,6 @@ static const struct dsa_switch_ops ksz9477_switch_ops = { .port_stp_state_set = ksz9477_port_stp_state_set, .port_fast_age = ksz_port_fast_age, .port_vlan_filtering = ksz9477_port_vlan_filtering, - .port_vlan_prepare = ksz_port_vlan_prepare, .port_vlan_add = ksz9477_port_vlan_add, .port_vlan_del = ksz9477_port_vlan_del, .port_fdb_dump = ksz9477_port_fdb_dump, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index f2c9ff3ea4be..4e0619c66573 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -213,15 +213,6 @@ void ksz_port_fast_age(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(ksz_port_fast_age); -int ksz_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - /* nothing needed */ - - return 0; -} -EXPORT_SYMBOL_GPL(ksz_port_vlan_prepare); - int ksz_port_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data) { diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index a1f0929d45a0..f212775372ce 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -161,8 +161,6 @@ int ksz_port_bridge_join(struct dsa_switch *ds, int port, void ksz_port_bridge_leave(struct dsa_switch *ds, int port, struct net_device *br); void ksz_port_fast_age(struct dsa_switch *ds, int port); -int ksz_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); int ksz_port_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int ksz_port_mdb_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index fcaddc9c9370..199a135125b2 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1393,15 +1393,6 @@ mt7530_port_vlan_filtering(struct dsa_switch *ds, int port, return 0; } -static int -mt7530_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - /* nothing needed */ - - return 0; -} - static void mt7530_hw_vlan_add(struct mt7530_priv *priv, struct mt7530_hw_vlan_entry *entry) @@ -1489,7 +1480,7 @@ mt7530_hw_vlan_update(struct mt7530_priv *priv, u16 vid, mt7530_vlan_cmd(priv, MT7530_VTCR_WR_VID, vid); } -static void +static int mt7530_port_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { @@ -1510,6 +1501,8 @@ mt7530_port_vlan_add(struct dsa_switch *ds, int port, } mutex_unlock(&priv->reg_mutex); + + return 0; } static int @@ -2608,7 +2601,6 @@ static const struct dsa_switch_ops mt7530_switch_ops = { .port_fdb_del = mt7530_port_fdb_del, .port_fdb_dump = mt7530_port_fdb_dump, .port_vlan_filtering = mt7530_port_vlan_filtering, - .port_vlan_prepare = mt7530_port_vlan_prepare, .port_vlan_add = mt7530_port_vlan_add, .port_vlan_del = mt7530_port_vlan_del, .port_mirror_add = mt753x_port_mirror_add, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index e9c517c0f89c..4aa7d0a8f197 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1617,9 +1617,6 @@ mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port, err = mv88e6xxx_port_check_hw_vlan(ds, port, vlan->vid); mv88e6xxx_reg_unlock(chip); - /* We don't need any dynamic resource from the kernel (yet), - * so skip the prepare phase. - */ return err; } @@ -1963,17 +1960,19 @@ static int mv88e6xxx_port_vlan_join(struct mv88e6xxx_chip *chip, int port, return 0; } -static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct mv88e6xxx_chip *chip = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; bool warn; u8 member; + int err; - if (!mv88e6xxx_max_vid(chip)) - return; + err = mv88e6xxx_port_vlan_prepare(ds, port, vlan); + if (err) + return err; if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) member = MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_UNMODIFIED; @@ -1989,15 +1988,25 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, mv88e6xxx_reg_lock(chip); - if (mv88e6xxx_port_vlan_join(chip, port, vlan->vid, member, warn)) + err = mv88e6xxx_port_vlan_join(chip, port, vlan->vid, member, warn); + if (err) { dev_err(ds->dev, "p%d: failed to add VLAN %d%c\n", port, vlan->vid, untagged ? 'u' : 't'); + goto out; + } - if (pvid && mv88e6xxx_port_set_pvid(chip, port, vlan->vid)) - dev_err(ds->dev, "p%d: failed to set PVID %d\n", port, - vlan->vid); - + if (pvid) { + err = mv88e6xxx_port_set_pvid(chip, port, vlan->vid); + if (err) { + dev_err(ds->dev, "p%d: failed to set PVID %d\n", + port, vlan->vid); + goto out; + } + } +out: mv88e6xxx_reg_unlock(chip); + + return err; } static int mv88e6xxx_port_vlan_leave(struct mv88e6xxx_chip *chip, @@ -5388,7 +5397,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .port_stp_state_set = mv88e6xxx_port_stp_state_set, .port_fast_age = mv88e6xxx_port_fast_age, .port_vlan_filtering = mv88e6xxx_port_vlan_filtering, - .port_vlan_prepare = mv88e6xxx_port_vlan_prepare, .port_vlan_add = mv88e6xxx_port_vlan_add, .port_vlan_del = mv88e6xxx_port_vlan_del, .port_fdb_add = mv88e6xxx_port_fdb_add, diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 2825dd11feee..768a74dc462a 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -134,15 +134,20 @@ static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) return ocelot_port_vlan_filtering(ocelot, port, enabled); } -static void felix_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int felix_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct ocelot *ocelot = ds->priv; u16 flags = vlan->flags; + int err; + + err = felix_vlan_prepare(ds, port, vlan); + if (err) + return err; - ocelot_vlan_add(ocelot, port, vlan->vid, - flags & BRIDGE_VLAN_INFO_PVID, - flags & BRIDGE_VLAN_INFO_UNTAGGED); + return ocelot_vlan_add(ocelot, port, vlan->vid, + flags & BRIDGE_VLAN_INFO_PVID, + flags & BRIDGE_VLAN_INFO_UNTAGGED); } static int felix_vlan_del(struct dsa_switch *ds, int port, @@ -770,7 +775,6 @@ const struct dsa_switch_ops felix_switch_ops = { .port_bridge_join = felix_bridge_join, .port_bridge_leave = felix_bridge_leave, .port_stp_state_set = felix_bridge_stp_state_set, - .port_vlan_prepare = felix_vlan_prepare, .port_vlan_filtering = felix_vlan_filtering, .port_vlan_add = felix_vlan_add, .port_vlan_del = felix_vlan_del, diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 1de6473b221b..f54e8b6c8621 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1312,13 +1312,6 @@ qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) } static int -qca8k_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - return 0; -} - -static void qca8k_port_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { @@ -1328,8 +1321,10 @@ qca8k_port_vlan_add(struct dsa_switch *ds, int port, int ret = 0; ret = qca8k_vlan_add(priv, port, vlan->vid, untagged); - if (ret) + if (ret) { dev_err(priv->dev, "Failed to add VLAN to port %d (%d)", port, ret); + return ret; + } if (pvid) { int shift = 16 * (port % 2); @@ -1340,6 +1335,8 @@ qca8k_port_vlan_add(struct dsa_switch *ds, int port, QCA8K_PORT_VLAN_CVID(vlan->vid) | QCA8K_PORT_VLAN_SVID(vlan->vid)); } + + return 0; } static int @@ -1382,7 +1379,6 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .port_fdb_del = qca8k_port_fdb_del, .port_fdb_dump = qca8k_port_fdb_dump, .port_vlan_filtering = qca8k_port_vlan_filtering, - .port_vlan_prepare = qca8k_port_vlan_prepare, .port_vlan_add = qca8k_port_vlan_add, .port_vlan_del = qca8k_port_vlan_del, .phylink_validate = qca8k_phylink_validate, diff --git a/drivers/net/dsa/realtek-smi-core.h b/drivers/net/dsa/realtek-smi-core.h index bc7bd47fb037..26376b052594 100644 --- a/drivers/net/dsa/realtek-smi-core.h +++ b/drivers/net/dsa/realtek-smi-core.h @@ -132,10 +132,8 @@ int rtl8366_reset_vlan(struct realtek_smi *smi); int rtl8366_init_vlan(struct realtek_smi *smi); int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); -int rtl8366_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); -void rtl8366_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); +int rtl8366_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan); int rtl8366_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void rtl8366_get_strings(struct dsa_switch *ds, int port, u32 stringset, diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index 27f429aa89a6..3b24f2e13200 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -374,36 +374,26 @@ int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) } EXPORT_SYMBOL_GPL(rtl8366_vlan_filtering); -int rtl8366_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +int rtl8366_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { + bool untagged = !!(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED); + bool pvid = !!(vlan->flags & BRIDGE_VLAN_INFO_PVID); struct realtek_smi *smi = ds->priv; + u32 member = 0; + u32 untag = 0; + int ret; if (!smi->ops->is_vlan_valid(smi, vlan->vid)) return -EINVAL; - dev_info(smi->dev, "prepare VLAN %04x\n", vlan->vid); - /* Enable VLAN in the hardware * FIXME: what's with this 4k business? * Just rtl8366_enable_vlan() seems inconclusive. */ - return rtl8366_enable_vlan4k(smi, true); -} -EXPORT_SYMBOL_GPL(rtl8366_vlan_prepare); - -void rtl8366_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - bool untagged = !!(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED); - bool pvid = !!(vlan->flags & BRIDGE_VLAN_INFO_PVID); - struct realtek_smi *smi = ds->priv; - u32 member = 0; - u32 untag = 0; - int ret; - - if (!smi->ops->is_vlan_valid(smi, vlan->vid)) - return; + ret = rtl8366_enable_vlan4k(smi, true); + if (ret) + return ret; dev_info(smi->dev, "add VLAN %d on port %d, %s, %s\n", vlan->vid, port, untagged ? "untagged" : "tagged", @@ -418,20 +408,22 @@ void rtl8366_vlan_add(struct dsa_switch *ds, int port, untag |= BIT(port); ret = rtl8366_set_vlan(smi, vlan->vid, member, untag, 0); - if (ret) + if (ret) { dev_err(smi->dev, "failed to set up VLAN %04x", vlan->vid); + return ret; + } if (!pvid) - return; + return 0; ret = rtl8366_set_pvid(smi, port, vlan->vid); - if (ret) + if (ret) { dev_err(smi->dev, "failed to set PVID on port %d to VLAN %04x", port, vlan->vid); + return ret; + } - if (!ret) - dev_dbg(smi->dev, "VLAN add: added VLAN %d with PVID on port %d\n", - vlan->vid, port); + return 0; } EXPORT_SYMBOL_GPL(rtl8366_vlan_add); diff --git a/drivers/net/dsa/rtl8366rb.c b/drivers/net/dsa/rtl8366rb.c index cfe56960f44b..896978568716 100644 --- a/drivers/net/dsa/rtl8366rb.c +++ b/drivers/net/dsa/rtl8366rb.c @@ -1504,7 +1504,6 @@ static const struct dsa_switch_ops rtl8366rb_switch_ops = { .get_ethtool_stats = rtl8366_get_ethtool_stats, .get_sset_count = rtl8366_get_sset_count, .port_vlan_filtering = rtl8366_vlan_filtering, - .port_vlan_prepare = rtl8366_vlan_prepare, .port_vlan_add = rtl8366_vlan_add, .port_vlan_del = rtl8366_vlan_del, .port_enable = rtl8366rb_port_enable, diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index be200d4289af..050b1260f358 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2600,26 +2600,6 @@ out: return rc; } -static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) -{ - struct sja1105_private *priv = ds->priv; - - if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) - return 0; - - /* If the user wants best-effort VLAN filtering (aka vlan_filtering - * bridge plus tagging), be sure to at least deny alterations to the - * configuration done by dsa_8021q. - */ - if (vid_is_dsa_8021q(vlan->vid)) { - dev_err(ds->dev, "Range 1024-3071 reserved for dsa_8021q operation\n"); - return -EBUSY; - } - - return 0; -} - /* The TPID setting belongs to the General Parameters table, * which can only be partially reconfigured at runtime (and not the TPID). * So a switch reset is required. @@ -2779,26 +2759,34 @@ static int sja1105_vlan_del_one(struct dsa_switch *ds, int port, u16 vid, return 0; } -static void sja1105_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) +static int sja1105_vlan_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; bool vlan_table_changed = false; int rc; + /* If the user wants best-effort VLAN filtering (aka vlan_filtering + * bridge plus tagging), be sure to at least deny alterations to the + * configuration done by dsa_8021q. + */ + if (priv->vlan_state != SJA1105_VLAN_FILTERING_FULL && + vid_is_dsa_8021q(vlan->vid)) { + dev_err(ds->dev, "Range 1024-3071 reserved for dsa_8021q operation\n"); + return -EBUSY; + } + rc = sja1105_vlan_add_one(ds, port, vlan->vid, vlan->flags, &priv->bridge_vlans); if (rc < 0) - return; + return rc; if (rc > 0) vlan_table_changed = true; if (!vlan_table_changed) - return; + return 0; - rc = sja1105_build_vlan_table(priv, true); - if (rc) - dev_err(ds->dev, "Failed to build VLAN table: %d\n", rc); + return sja1105_build_vlan_table(priv, true); } static int sja1105_vlan_del(struct dsa_switch *ds, int port, @@ -3277,7 +3265,6 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .port_bridge_join = sja1105_bridge_join, .port_bridge_leave = sja1105_bridge_leave, .port_stp_state_set = sja1105_bridge_stp_state_set, - .port_vlan_prepare = sja1105_vlan_prepare, .port_vlan_filtering = sja1105_vlan_filtering, .port_vlan_add = sja1105_vlan_add, .port_vlan_del = sja1105_vlan_del, diff --git a/include/net/dsa.h b/include/net/dsa.h index b8c0550dfa74..c9a3dd7588df 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -566,10 +566,8 @@ struct dsa_switch_ops { */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering); - int (*port_vlan_prepare)(struct dsa_switch *ds, int port, + int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); - void (*port_vlan_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); /* diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5f5e19c5e43a..f92eaacb17cf 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -227,21 +227,17 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, { int port, err; - if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) + if (!ds->ops->port_vlan_add) return -EOPNOTSUPP; for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_vlan_match(ds, port, info)) { - err = ds->ops->port_vlan_prepare(ds, port, info->vlan); + err = ds->ops->port_vlan_add(ds, port, info->vlan); if (err) return err; } } - for (port = 0; port < ds->num_ports; port++) - if (dsa_switch_vlan_match(ds, port, info)) - ds->ops->port_vlan_add(ds, port, info->vlan); - return 0; } -- cgit v1.2.3 From 8f73cc50ba2dd5a5749d3670e453c3864258b892 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 9 Jan 2021 02:01:56 +0200 Subject: net: switchdev: delete the transaction object Now that all users of struct switchdev_trans have been modified to do without it, we can remove this structure and the two helpers to determine the phase. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Ido Schimmel Acked-by: Linus Walleij Acked-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index f873e2c5e125..88fcac140966 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -16,20 +16,6 @@ #define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1) #define SWITCHDEV_F_DEFER BIT(2) -struct switchdev_trans { - bool ph_prepare; -}; - -static inline bool switchdev_trans_ph_prepare(struct switchdev_trans *trans) -{ - return trans && trans->ph_prepare; -} - -static inline bool switchdev_trans_ph_commit(struct switchdev_trans *trans) -{ - return trans && !trans->ph_prepare; -} - enum switchdev_attr_id { SWITCHDEV_ATTR_ID_UNDEFINED, SWITCHDEV_ATTR_ID_PORT_STP_STATE, -- cgit v1.2.3 From ad0bfc233ae2e7ee3bcb9a6089e4aa54e2b44fa1 Mon Sep 17 00:00:00 2001 From: Danilo Carvalho Date: Fri, 8 Jan 2021 22:21:04 +0000 Subject: Fix whitespace in uapi/linux/tcp.h. List of things fixed: - Two of the socket options were idented with spaces instead of tabs. - Trailing whitespace in some lines. - Improper spacing around parenthesis caught by checkpatch.pl. - Mix of space and tabs in tcp_word_hdr union. Signed-off-by: Danilo Carvalho Link: https://lore.kernel.org/r/20210108222104.2079472-1-doak@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 13ceeb395eb8..768e93bd5b51 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -51,7 +51,7 @@ struct tcphdr { fin:1; #else #error "Adjust your defines" -#endif +#endif __be16 window; __sum16 check; __be16 urg_ptr; @@ -62,14 +62,14 @@ struct tcphdr { * (union is compatible to any of its members) * This means this part of the code is -fstrict-aliasing safe now. */ -union tcp_word_hdr { +union tcp_word_hdr { struct tcphdr hdr; - __be32 words[5]; -}; + __be32 words[5]; +}; -#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) +#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) -enum { +enum { TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), @@ -80,7 +80,7 @@ enum { TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) -}; +}; /* * TCP general constants @@ -103,8 +103,8 @@ enum { #define TCP_QUICKACK 12 /* Block/reenable quick acks */ #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ -#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ -#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ +#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 -- cgit v1.2.3 From c6458e72f6fd6ac7e390da0d9abe8446084886e5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 12 Jan 2021 12:34:22 +0000 Subject: bpf: Clarify return value of probe str helpers When the buffer is too small to contain the input string, these helpers return the length of the buffer, not the length of the original string. This tries to make the docs totally clear about that, since "the length of the [copied ]string" could also refer to the length of the input. Signed-off-by: Brendan Jackman Signed-off-by: Daniel Borkmann Acked-by: KP Singh Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112123422.2011234-1-jackmanb@google.com --- include/uapi/linux/bpf.h | 10 +++++----- tools/include/uapi/linux/bpf.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77d7c1bb2923..a1ad32456f89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2993,10 +2993,10 @@ union bpf_attr { * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: * * :: * @@ -3024,7 +3024,7 @@ union bpf_attr { * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return - * On success, the strictly positive length of the string, + * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 77d7c1bb2923..a1ad32456f89 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2993,10 +2993,10 @@ union bpf_attr { * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: * * :: * @@ -3024,7 +3024,7 @@ union bpf_attr { * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return - * On success, the strictly positive length of the string, + * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. * -- cgit v1.2.3 From a643bff752dcf72a07e1b2ab2f8587e4f51118be Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:14 -0800 Subject: bpf: Add bpf_patch_call_args prototype to include/linux/bpf.h Add bpf_patch_call_args() prototype. This function is called from BPF verifier and only if CONFIG_BPF_JIT_ALWAYS_ON is not defined. This fixes compiler warning about missing prototype in some kernel configurations. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-2-andrii@kernel.org --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07cb5d15e743..ef9309604b3e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1403,7 +1403,10 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, union bpf_attr __user *uattr); + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +#endif struct btf *bpf_get_btf_vmlinux(void); -- cgit v1.2.3 From 6943c2b05bf09fd5c5729f7d7d803bf3f126cb9a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:15 -0800 Subject: bpf: Avoid warning when re-casting __bpf_call_base into __bpf_call_base_args BPF interpreter uses extra input argument, so re-casts __bpf_call_base into __bpf_call_base_args. Avoid compiler warning about incompatible function prototypes by casting to void * first. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-3-andrii@kernel.org --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 29c27656165b..5edf2b660881 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -886,7 +886,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - __bpf_call_base) + (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); -- cgit v1.2.3 From 936f8946bdb48239f4292812d4d2e26c6d328c95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:16 -0800 Subject: bpf: Declare __bpf_free_used_maps() unconditionally __bpf_free_used_maps() is always defined in kernel/bpf/core.c, while include/linux/bpf.h is guarding it behind CONFIG_BPF_SYSCALL. Move it out of that guard region and fix compiler warning. Fixes: a2ea07465c8d ("bpf: Fix missing prog untrack in release_maps") Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-4-andrii@kernel.org --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef9309604b3e..6e585dbc10df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1206,8 +1206,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -void __bpf_free_used_maps(struct bpf_prog_aux *aux, - struct bpf_map **used_maps, u32 len); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); @@ -1676,6 +1674,9 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, return bpf_prog_get_type_dev(ufd, type, false); } +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len); + bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); -- cgit v1.2.3 From 541c3bad8dc51b253ba8686d0cd7628e6b9b5f4c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:18 -0800 Subject: bpf: Support BPF ksym variables in kernel modules Add support for directly accessing kernel module variables from BPF programs using special ldimm64 instructions. This functionality builds upon vmlinux ksym support, but extends ldimm64 with src_reg=BPF_PSEUDO_BTF_ID to allow specifying kernel module BTF's FD in insn[1].imm field. During BPF program load time, verifier will resolve FD to BTF object and will take reference on BTF object itself and, for module BTFs, corresponding module as well, to make sure it won't be unloaded from under running BPF program. The mechanism used is similar to how bpf_prog keeps track of used bpf_maps. One interesting change is also in how per-CPU variable is determined. The logic is to find .data..percpu data section in provided BTF, but both vmlinux and module each have their own .data..percpu entries in BTF. So for module's case, the search for DATASEC record needs to look at only module's added BTF types. This is implemented with custom search function. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20210112075520.4103414-6-andrii@kernel.org --- include/linux/bpf.h | 10 +++ include/linux/bpf_verifier.h | 3 + include/linux/btf.h | 3 + kernel/bpf/btf.c | 31 ++++++++- kernel/bpf/core.c | 23 +++++++ kernel/bpf/verifier.c | 154 +++++++++++++++++++++++++++++++++++-------- 6 files changed, 194 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e585dbc10df..1aac2af12fed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -761,9 +761,15 @@ struct bpf_ctx_arg_aux { u32 btf_id; }; +struct btf_mod_pair { + struct btf *btf; + struct module *module; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; + u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; @@ -802,6 +808,7 @@ struct bpf_prog_aux { const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ + struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ @@ -1668,6 +1675,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) } #endif /* CONFIG_BPF_SYSCALL */ +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len); + static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e941fe1484e5..dfe6f85d97dd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,6 +340,7 @@ struct bpf_insn_aux_data { }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 @@ -398,7 +399,9 @@ struct bpf_verifier_env { struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ u32 used_map_cnt; /* number of used maps */ + u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool allow_ptr_to_map_access; diff --git a/include/linux/btf.h b/include/linux/btf.h index 4c200f5d242b..7fabf1428093 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -91,6 +91,9 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); bool btf_is_kernel(const struct btf *btf); +bool btf_is_module(const struct btf *btf); +struct module *btf_try_get_module(const struct btf *btf); +u32 btf_nr_types(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d6bdb4f4d61..7ccc0133723a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static u32 btf_nr_types_total(const struct btf *btf) +u32 btf_nr_types(const struct btf *btf) { u32 total = 0; @@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) const char *tname; u32 i, total; - total = btf_nr_types_total(btf); + total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) @@ -5743,6 +5743,11 @@ bool btf_is_kernel(const struct btf *btf) return btf->kernel_btf; } +bool btf_is_module(const struct btf *btf) +{ + return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -5877,3 +5882,25 @@ static int __init btf_module_init(void) fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + +struct module *btf_try_get_module(const struct btf *btf) +{ + struct module *res = NULL; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; + + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf != btf) + continue; + + if (try_module_get(btf_mod->module)) + res = btf_mod->module; + + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return res; +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 261f8692d0d2..69c3c308de5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,6 +2119,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct btf_mod_pair *btf_mod; + u32 i; + + for (i = 0; i < len; i++) { + btf_mod = &used_btfs[i]; + if (btf_mod->module) + module_put(btf_mod->module); + btf_put(btf_mod->btf); + } +#endif +} + +static void bpf_free_used_btfs(struct bpf_prog_aux *aux) +{ + __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); + kfree(aux->used_btfs); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; @@ -2126,6 +2148,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); bpf_free_used_maps(aux); + bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5534e667bdb1..ae2aee48cf82 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9703,6 +9703,36 @@ process_bpf_exit: return 0; } +static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. + */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -9710,48 +9740,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; - - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } + int i, btf_fd, err; - if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); } - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; } if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; } - sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; } - datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@ -9764,10 +9803,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, insn[1].imm = addr >> 32; type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9775,21 +9814,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, u32 tsize; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; } static int check_map_prealloc(struct bpf_map *map) @@ -10086,6 +10158,13 @@ static void release_maps(struct bpf_verifier_env *env) env->used_map_cnt); } +/* drop refcnt of maps used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@ -12098,7 +12177,10 @@ skip_full_check: goto err_release_maps; } - if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@ -12112,15 +12194,29 @@ skip_full_check: memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + } + memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } - if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env); err_release_maps: if (!env->prog->aux->used_maps) @@ -12128,6 +12224,8 @@ err_release_maps: * them now. Otherwise free_used_maps() will release them. */ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env); /* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning -- cgit v1.2.3 From c2ec5f2ecf6aeef60c63d1af9ddcacdcbf829d68 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 11 Jan 2021 11:46:57 +0100 Subject: net: dsa: add optional stats64 support Allow DSA drivers to export stats64 Signed-off-by: Oleksij Rempel Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 4 +++- net/dsa/slave.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index c9a3dd7588df..c3485ba6c312 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -482,7 +482,7 @@ struct dsa_switch_ops { void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* - * ethtool hardware statistics. + * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); @@ -491,6 +491,8 @@ struct dsa_switch_ops { int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); + void (*get_stats64)(struct dsa_switch *ds, int port, + struct rtnl_link_stats64 *s); /* * ethtool Wake-on-LAN diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5d7f6cada6a8..5a1769602e65 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1569,6 +1569,18 @@ static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) return dp->ds->devlink ? &dp->devlink_port : NULL; } +static void dsa_slave_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_stats64) + ds->ops->get_stats64(ds, dp->index, s); + else + dev_get_tstats64(dev, s); +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1588,7 +1600,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { #endif .ndo_get_phys_port_name = dsa_slave_get_phys_port_name, .ndo_setup_tc = dsa_slave_setup_tc, - .ndo_get_stats64 = dev_get_tstats64, + .ndo_get_stats64 = dsa_slave_get_stats64, .ndo_get_port_parent_id = dsa_slave_get_port_parent_id, .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, -- cgit v1.2.3 From 5a9d5ecd69ed65fc2d49cdecfc1c1ab1e4d7af34 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:18 +0100 Subject: can: dev: move bittiming related code into seperate file This patch moves the bittiming related code of the CAN device infrastructure into a separate file. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-4-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- MAINTAINERS | 1 + drivers/net/can/dev/Makefile | 1 + drivers/net/can/dev/bittiming.c | 261 ++++++++++++++++++++++++++++++++++++++++ drivers/net/can/dev/dev.c | 261 ---------------------------------------- include/linux/can/bittiming.h | 44 +++++++ include/linux/can/dev.h | 16 +-- 6 files changed, 308 insertions(+), 276 deletions(-) create mode 100644 drivers/net/can/dev/bittiming.c create mode 100644 include/linux/can/bittiming.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index c3091a91ebbf..d17662df1cd7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3943,6 +3943,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git F: Documentation/devicetree/bindings/net/can/ F: drivers/net/can/ +F: include/linux/can/bittiming.h F: include/linux/can/dev.h F: include/linux/can/led.h F: include/linux/can/platform/ diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile index cba92e6bcf6f..b5c6bb848d9d 100644 --- a/drivers/net/can/dev/Makefile +++ b/drivers/net/can/dev/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CAN_DEV) += can-dev.o +can-dev-y += bittiming.o can-dev-y += dev.o can-dev-y += rx-offload.o diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c new file mode 100644 index 000000000000..f7fe226bb395 --- /dev/null +++ b/drivers/net/can/dev/bittiming.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix + * Copyright (C) 2006 Andrey Volkov, Varma Electronics + * Copyright (C) 2008-2009 Wolfgang Grandegger + */ + +#include + +#ifdef CONFIG_CAN_CALC_BITTIMING +#define CAN_CALC_MAX_ERROR 50 /* in one-tenth of a percent */ + +/* Bit-timing calculation derived from: + * + * Code based on LinCAN sources and H8S2638 project + * Copyright 2004-2006 Pavel Pisa - DCE FELK CVUT cz + * Copyright 2005 Stanislav Marek + * email: pisa@cmp.felk.cvut.cz + * + * Calculates proper bit-timing parameters for a specified bit-rate + * and sample-point, which can then be used to set the bit-timing + * registers of the CAN controller. You can find more information + * in the header file linux/can/netlink.h. + */ +static int +can_update_sample_point(const struct can_bittiming_const *btc, + unsigned int sample_point_nominal, unsigned int tseg, + unsigned int *tseg1_ptr, unsigned int *tseg2_ptr, + unsigned int *sample_point_error_ptr) +{ + unsigned int sample_point_error, best_sample_point_error = UINT_MAX; + unsigned int sample_point, best_sample_point = 0; + unsigned int tseg1, tseg2; + int i; + + for (i = 0; i <= 1; i++) { + tseg2 = tseg + CAN_SYNC_SEG - + (sample_point_nominal * (tseg + CAN_SYNC_SEG)) / + 1000 - i; + tseg2 = clamp(tseg2, btc->tseg2_min, btc->tseg2_max); + tseg1 = tseg - tseg2; + if (tseg1 > btc->tseg1_max) { + tseg1 = btc->tseg1_max; + tseg2 = tseg - tseg1; + } + + sample_point = 1000 * (tseg + CAN_SYNC_SEG - tseg2) / + (tseg + CAN_SYNC_SEG); + sample_point_error = abs(sample_point_nominal - sample_point); + + if (sample_point <= sample_point_nominal && + sample_point_error < best_sample_point_error) { + best_sample_point = sample_point; + best_sample_point_error = sample_point_error; + *tseg1_ptr = tseg1; + *tseg2_ptr = tseg2; + } + } + + if (sample_point_error_ptr) + *sample_point_error_ptr = best_sample_point_error; + + return best_sample_point; +} + +int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, + const struct can_bittiming_const *btc) +{ + struct can_priv *priv = netdev_priv(dev); + unsigned int bitrate; /* current bitrate */ + unsigned int bitrate_error; /* difference between current and nominal value */ + unsigned int best_bitrate_error = UINT_MAX; + unsigned int sample_point_error; /* difference between current and nominal value */ + unsigned int best_sample_point_error = UINT_MAX; + unsigned int sample_point_nominal; /* nominal sample point */ + unsigned int best_tseg = 0; /* current best value for tseg */ + unsigned int best_brp = 0; /* current best value for brp */ + unsigned int brp, tsegall, tseg, tseg1 = 0, tseg2 = 0; + u64 v64; + + /* Use CiA recommended sample points */ + if (bt->sample_point) { + sample_point_nominal = bt->sample_point; + } else { + if (bt->bitrate > 800000) + sample_point_nominal = 750; + else if (bt->bitrate > 500000) + sample_point_nominal = 800; + else + sample_point_nominal = 875; + } + + /* tseg even = round down, odd = round up */ + for (tseg = (btc->tseg1_max + btc->tseg2_max) * 2 + 1; + tseg >= (btc->tseg1_min + btc->tseg2_min) * 2; tseg--) { + tsegall = CAN_SYNC_SEG + tseg / 2; + + /* Compute all possible tseg choices (tseg=tseg1+tseg2) */ + brp = priv->clock.freq / (tsegall * bt->bitrate) + tseg % 2; + + /* choose brp step which is possible in system */ + brp = (brp / btc->brp_inc) * btc->brp_inc; + if (brp < btc->brp_min || brp > btc->brp_max) + continue; + + bitrate = priv->clock.freq / (brp * tsegall); + bitrate_error = abs(bt->bitrate - bitrate); + + /* tseg brp biterror */ + if (bitrate_error > best_bitrate_error) + continue; + + /* reset sample point error if we have a better bitrate */ + if (bitrate_error < best_bitrate_error) + best_sample_point_error = UINT_MAX; + + can_update_sample_point(btc, sample_point_nominal, tseg / 2, + &tseg1, &tseg2, &sample_point_error); + if (sample_point_error > best_sample_point_error) + continue; + + best_sample_point_error = sample_point_error; + best_bitrate_error = bitrate_error; + best_tseg = tseg / 2; + best_brp = brp; + + if (bitrate_error == 0 && sample_point_error == 0) + break; + } + + if (best_bitrate_error) { + /* Error in one-tenth of a percent */ + v64 = (u64)best_bitrate_error * 1000; + do_div(v64, bt->bitrate); + bitrate_error = (u32)v64; + if (bitrate_error > CAN_CALC_MAX_ERROR) { + netdev_err(dev, + "bitrate error %d.%d%% too high\n", + bitrate_error / 10, bitrate_error % 10); + return -EDOM; + } + netdev_warn(dev, "bitrate error %d.%d%%\n", + bitrate_error / 10, bitrate_error % 10); + } + + /* real sample point */ + bt->sample_point = can_update_sample_point(btc, sample_point_nominal, + best_tseg, &tseg1, &tseg2, + NULL); + + v64 = (u64)best_brp * 1000 * 1000 * 1000; + do_div(v64, priv->clock.freq); + bt->tq = (u32)v64; + bt->prop_seg = tseg1 / 2; + bt->phase_seg1 = tseg1 - bt->prop_seg; + bt->phase_seg2 = tseg2; + + /* check for sjw user settings */ + if (!bt->sjw || !btc->sjw_max) { + bt->sjw = 1; + } else { + /* bt->sjw is at least 1 -> sanitize upper bound to sjw_max */ + if (bt->sjw > btc->sjw_max) + bt->sjw = btc->sjw_max; + /* bt->sjw must not be higher than tseg2 */ + if (tseg2 < bt->sjw) + bt->sjw = tseg2; + } + + bt->brp = best_brp; + + /* real bitrate */ + bt->bitrate = priv->clock.freq / + (bt->brp * (CAN_SYNC_SEG + tseg1 + tseg2)); + + return 0; +} +#endif /* CONFIG_CAN_CALC_BITTIMING */ + +/* Checks the validity of the specified bit-timing parameters prop_seg, + * phase_seg1, phase_seg2 and sjw and tries to determine the bitrate + * prescaler value brp. You can find more information in the header + * file linux/can/netlink.h. + */ +static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt, + const struct can_bittiming_const *btc) +{ + struct can_priv *priv = netdev_priv(dev); + int tseg1, alltseg; + u64 brp64; + + tseg1 = bt->prop_seg + bt->phase_seg1; + if (!bt->sjw) + bt->sjw = 1; + if (bt->sjw > btc->sjw_max || + tseg1 < btc->tseg1_min || tseg1 > btc->tseg1_max || + bt->phase_seg2 < btc->tseg2_min || bt->phase_seg2 > btc->tseg2_max) + return -ERANGE; + + brp64 = (u64)priv->clock.freq * (u64)bt->tq; + if (btc->brp_inc > 1) + do_div(brp64, btc->brp_inc); + brp64 += 500000000UL - 1; + do_div(brp64, 1000000000UL); /* the practicable BRP */ + if (btc->brp_inc > 1) + brp64 *= btc->brp_inc; + bt->brp = (u32)brp64; + + if (bt->brp < btc->brp_min || bt->brp > btc->brp_max) + return -EINVAL; + + alltseg = bt->prop_seg + bt->phase_seg1 + bt->phase_seg2 + 1; + bt->bitrate = priv->clock.freq / (bt->brp * alltseg); + bt->sample_point = ((tseg1 + 1) * 1000) / alltseg; + + return 0; +} + +/* Checks the validity of predefined bitrate settings */ +static int +can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt, + const u32 *bitrate_const, + const unsigned int bitrate_const_cnt) +{ + struct can_priv *priv = netdev_priv(dev); + unsigned int i; + + for (i = 0; i < bitrate_const_cnt; i++) { + if (bt->bitrate == bitrate_const[i]) + break; + } + + if (i >= priv->bitrate_const_cnt) + return -EINVAL; + + return 0; +} + +int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, + const struct can_bittiming_const *btc, + const u32 *bitrate_const, + const unsigned int bitrate_const_cnt) +{ + int err; + + /* Depending on the given can_bittiming parameter structure the CAN + * timing parameters are calculated based on the provided bitrate OR + * alternatively the CAN timing parameters (tq, prop_seg, etc.) are + * provided directly which are then checked and fixed up. + */ + if (!bt->tq && bt->bitrate && btc) + err = can_calc_bittiming(dev, bt, btc); + else if (bt->tq && !bt->bitrate && btc) + err = can_fixup_bittiming(dev, bt, btc); + else if (!bt->tq && bt->bitrate && bitrate_const) + err = can_validate_bitrate(dev, bt, bitrate_const, + bitrate_const_cnt); + else + err = -EINVAL; + + return err; +} diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 3486704c8a95..1b3ab95b3fd1 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -58,267 +58,6 @@ u8 can_fd_len2dlc(u8 len) } EXPORT_SYMBOL_GPL(can_fd_len2dlc); -#ifdef CONFIG_CAN_CALC_BITTIMING -#define CAN_CALC_MAX_ERROR 50 /* in one-tenth of a percent */ - -/* Bit-timing calculation derived from: - * - * Code based on LinCAN sources and H8S2638 project - * Copyright 2004-2006 Pavel Pisa - DCE FELK CVUT cz - * Copyright 2005 Stanislav Marek - * email: pisa@cmp.felk.cvut.cz - * - * Calculates proper bit-timing parameters for a specified bit-rate - * and sample-point, which can then be used to set the bit-timing - * registers of the CAN controller. You can find more information - * in the header file linux/can/netlink.h. - */ -static int -can_update_sample_point(const struct can_bittiming_const *btc, - unsigned int sample_point_nominal, unsigned int tseg, - unsigned int *tseg1_ptr, unsigned int *tseg2_ptr, - unsigned int *sample_point_error_ptr) -{ - unsigned int sample_point_error, best_sample_point_error = UINT_MAX; - unsigned int sample_point, best_sample_point = 0; - unsigned int tseg1, tseg2; - int i; - - for (i = 0; i <= 1; i++) { - tseg2 = tseg + CAN_SYNC_SEG - - (sample_point_nominal * (tseg + CAN_SYNC_SEG)) / - 1000 - i; - tseg2 = clamp(tseg2, btc->tseg2_min, btc->tseg2_max); - tseg1 = tseg - tseg2; - if (tseg1 > btc->tseg1_max) { - tseg1 = btc->tseg1_max; - tseg2 = tseg - tseg1; - } - - sample_point = 1000 * (tseg + CAN_SYNC_SEG - tseg2) / - (tseg + CAN_SYNC_SEG); - sample_point_error = abs(sample_point_nominal - sample_point); - - if (sample_point <= sample_point_nominal && - sample_point_error < best_sample_point_error) { - best_sample_point = sample_point; - best_sample_point_error = sample_point_error; - *tseg1_ptr = tseg1; - *tseg2_ptr = tseg2; - } - } - - if (sample_point_error_ptr) - *sample_point_error_ptr = best_sample_point_error; - - return best_sample_point; -} - -static int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, - const struct can_bittiming_const *btc) -{ - struct can_priv *priv = netdev_priv(dev); - unsigned int bitrate; /* current bitrate */ - unsigned int bitrate_error; /* difference between current and nominal value */ - unsigned int best_bitrate_error = UINT_MAX; - unsigned int sample_point_error; /* difference between current and nominal value */ - unsigned int best_sample_point_error = UINT_MAX; - unsigned int sample_point_nominal; /* nominal sample point */ - unsigned int best_tseg = 0; /* current best value for tseg */ - unsigned int best_brp = 0; /* current best value for brp */ - unsigned int brp, tsegall, tseg, tseg1 = 0, tseg2 = 0; - u64 v64; - - /* Use CiA recommended sample points */ - if (bt->sample_point) { - sample_point_nominal = bt->sample_point; - } else { - if (bt->bitrate > 800000) - sample_point_nominal = 750; - else if (bt->bitrate > 500000) - sample_point_nominal = 800; - else - sample_point_nominal = 875; - } - - /* tseg even = round down, odd = round up */ - for (tseg = (btc->tseg1_max + btc->tseg2_max) * 2 + 1; - tseg >= (btc->tseg1_min + btc->tseg2_min) * 2; tseg--) { - tsegall = CAN_SYNC_SEG + tseg / 2; - - /* Compute all possible tseg choices (tseg=tseg1+tseg2) */ - brp = priv->clock.freq / (tsegall * bt->bitrate) + tseg % 2; - - /* choose brp step which is possible in system */ - brp = (brp / btc->brp_inc) * btc->brp_inc; - if (brp < btc->brp_min || brp > btc->brp_max) - continue; - - bitrate = priv->clock.freq / (brp * tsegall); - bitrate_error = abs(bt->bitrate - bitrate); - - /* tseg brp biterror */ - if (bitrate_error > best_bitrate_error) - continue; - - /* reset sample point error if we have a better bitrate */ - if (bitrate_error < best_bitrate_error) - best_sample_point_error = UINT_MAX; - - can_update_sample_point(btc, sample_point_nominal, tseg / 2, - &tseg1, &tseg2, &sample_point_error); - if (sample_point_error > best_sample_point_error) - continue; - - best_sample_point_error = sample_point_error; - best_bitrate_error = bitrate_error; - best_tseg = tseg / 2; - best_brp = brp; - - if (bitrate_error == 0 && sample_point_error == 0) - break; - } - - if (best_bitrate_error) { - /* Error in one-tenth of a percent */ - v64 = (u64)best_bitrate_error * 1000; - do_div(v64, bt->bitrate); - bitrate_error = (u32)v64; - if (bitrate_error > CAN_CALC_MAX_ERROR) { - netdev_err(dev, - "bitrate error %d.%d%% too high\n", - bitrate_error / 10, bitrate_error % 10); - return -EDOM; - } - netdev_warn(dev, "bitrate error %d.%d%%\n", - bitrate_error / 10, bitrate_error % 10); - } - - /* real sample point */ - bt->sample_point = can_update_sample_point(btc, sample_point_nominal, - best_tseg, &tseg1, &tseg2, - NULL); - - v64 = (u64)best_brp * 1000 * 1000 * 1000; - do_div(v64, priv->clock.freq); - bt->tq = (u32)v64; - bt->prop_seg = tseg1 / 2; - bt->phase_seg1 = tseg1 - bt->prop_seg; - bt->phase_seg2 = tseg2; - - /* check for sjw user settings */ - if (!bt->sjw || !btc->sjw_max) { - bt->sjw = 1; - } else { - /* bt->sjw is at least 1 -> sanitize upper bound to sjw_max */ - if (bt->sjw > btc->sjw_max) - bt->sjw = btc->sjw_max; - /* bt->sjw must not be higher than tseg2 */ - if (tseg2 < bt->sjw) - bt->sjw = tseg2; - } - - bt->brp = best_brp; - - /* real bitrate */ - bt->bitrate = priv->clock.freq / - (bt->brp * (CAN_SYNC_SEG + tseg1 + tseg2)); - - return 0; -} -#else /* !CONFIG_CAN_CALC_BITTIMING */ -static int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, - const struct can_bittiming_const *btc) -{ - netdev_err(dev, "bit-timing calculation not available\n"); - return -EINVAL; -} -#endif /* CONFIG_CAN_CALC_BITTIMING */ - -/* Checks the validity of the specified bit-timing parameters prop_seg, - * phase_seg1, phase_seg2 and sjw and tries to determine the bitrate - * prescaler value brp. You can find more information in the header - * file linux/can/netlink.h. - */ -static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt, - const struct can_bittiming_const *btc) -{ - struct can_priv *priv = netdev_priv(dev); - int tseg1, alltseg; - u64 brp64; - - tseg1 = bt->prop_seg + bt->phase_seg1; - if (!bt->sjw) - bt->sjw = 1; - if (bt->sjw > btc->sjw_max || - tseg1 < btc->tseg1_min || tseg1 > btc->tseg1_max || - bt->phase_seg2 < btc->tseg2_min || bt->phase_seg2 > btc->tseg2_max) - return -ERANGE; - - brp64 = (u64)priv->clock.freq * (u64)bt->tq; - if (btc->brp_inc > 1) - do_div(brp64, btc->brp_inc); - brp64 += 500000000UL - 1; - do_div(brp64, 1000000000UL); /* the practicable BRP */ - if (btc->brp_inc > 1) - brp64 *= btc->brp_inc; - bt->brp = (u32)brp64; - - if (bt->brp < btc->brp_min || bt->brp > btc->brp_max) - return -EINVAL; - - alltseg = bt->prop_seg + bt->phase_seg1 + bt->phase_seg2 + 1; - bt->bitrate = priv->clock.freq / (bt->brp * alltseg); - bt->sample_point = ((tseg1 + 1) * 1000) / alltseg; - - return 0; -} - -/* Checks the validity of predefined bitrate settings */ -static int -can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt, - const u32 *bitrate_const, - const unsigned int bitrate_const_cnt) -{ - struct can_priv *priv = netdev_priv(dev); - unsigned int i; - - for (i = 0; i < bitrate_const_cnt; i++) { - if (bt->bitrate == bitrate_const[i]) - break; - } - - if (i >= priv->bitrate_const_cnt) - return -EINVAL; - - return 0; -} - -static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, - const struct can_bittiming_const *btc, - const u32 *bitrate_const, - const unsigned int bitrate_const_cnt) -{ - int err; - - /* Depending on the given can_bittiming parameter structure the CAN - * timing parameters are calculated based on the provided bitrate OR - * alternatively the CAN timing parameters (tq, prop_seg, etc.) are - * provided directly which are then checked and fixed up. - */ - if (!bt->tq && bt->bitrate && btc) - err = can_calc_bittiming(dev, bt, btc); - else if (bt->tq && !bt->bitrate && btc) - err = can_fixup_bittiming(dev, bt, btc); - else if (!bt->tq && bt->bitrate && bitrate_const) - err = can_validate_bitrate(dev, bt, bitrate_const, - bitrate_const_cnt); - else - err = -EINVAL; - - return err; -} - static void can_update_state_error_stats(struct net_device *dev, enum can_state new_state) { diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h new file mode 100644 index 000000000000..707575c668f4 --- /dev/null +++ b/include/linux/can/bittiming.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2020 Pengutronix, Marc Kleine-Budde + */ + +#ifndef _CAN_BITTIMING_H +#define _CAN_BITTIMING_H + +#include +#include + +#define CAN_SYNC_SEG 1 + +#ifdef CONFIG_CAN_CALC_BITTIMING +int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, + const struct can_bittiming_const *btc); +#else /* !CONFIG_CAN_CALC_BITTIMING */ +static inline int +can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, + const struct can_bittiming_const *btc) +{ + netdev_err(dev, "bit-timing calculation not available\n"); + return -EINVAL; +} +#endif /* CONFIG_CAN_CALC_BITTIMING */ + +int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, + const struct can_bittiming_const *btc, + const u32 *bitrate_const, + const unsigned int bitrate_const_cnt); + +/* + * can_bit_time() - Duration of one bit + * + * Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for + * additional information. + * + * Return: the number of time quanta in one bit. + */ +static inline unsigned int can_bit_time(const struct can_bittiming *bt) +{ + return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2; +} + +#endif /* !_CAN_BITTIMING_H */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 197a79535cc2..054c3bed190b 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -15,6 +15,7 @@ #define _CAN_DEV_H #include +#include #include #include #include @@ -82,21 +83,6 @@ struct can_priv { #endif }; -#define CAN_SYNC_SEG 1 - -/* - * can_bit_time() - Duration of one bit - * - * Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for - * additional information. - * - * Return: the number of time quanta in one bit. - */ -static inline unsigned int can_bit_time(const struct can_bittiming *bt) -{ - return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2; -} - /* * can_cc_dlc2len(value) - convert a given data length code (dlc) of a * Classical CAN frame into a valid data length of max. 8 bytes. -- cgit v1.2.3 From bdd2e413192dd5f2153d166cd907b048cce872e8 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:19 +0100 Subject: can: dev: move length related code into seperate file This patch moves all CAN frame length related code of the CAN device infrastructure into a separate file. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-5-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- MAINTAINERS | 1 + drivers/net/can/dev/Makefile | 1 + drivers/net/can/dev/dev.c | 33 ------------------------------ drivers/net/can/dev/length.c | 38 +++++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 41 +------------------------------------ include/linux/can/length.h | 48 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 73 deletions(-) create mode 100644 drivers/net/can/dev/length.c create mode 100644 include/linux/can/length.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index d17662df1cd7..21780c4e2e71 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3946,6 +3946,7 @@ F: drivers/net/can/ F: include/linux/can/bittiming.h F: include/linux/can/dev.h F: include/linux/can/led.h +F: include/linux/can/length.h F: include/linux/can/platform/ F: include/linux/can/rx-offload.h F: include/uapi/linux/can/error.h diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile index b5c6bb848d9d..5c647951e06d 100644 --- a/drivers/net/can/dev/Makefile +++ b/drivers/net/can/dev/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_CAN_DEV) += can-dev.o can-dev-y += bittiming.o can-dev-y += dev.o +can-dev-y += length.o can-dev-y += rx-offload.o can-dev-$(CONFIG_CAN_LEDS) += led.o diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 1b3ab95b3fd1..3372e99d5db7 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -25,39 +25,6 @@ MODULE_DESCRIPTION(MOD_DESC); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Wolfgang Grandegger "); -/* CAN DLC to real data length conversion helpers */ - -static const u8 dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 12, 16, 20, 24, 32, 48, 64}; - -/* get data length from raw data length code (DLC) */ -u8 can_fd_dlc2len(u8 dlc) -{ - return dlc2len[dlc & 0x0F]; -} -EXPORT_SYMBOL_GPL(can_fd_dlc2len); - -static const u8 len2dlc[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, /* 0 - 8 */ - 9, 9, 9, 9, /* 9 - 12 */ - 10, 10, 10, 10, /* 13 - 16 */ - 11, 11, 11, 11, /* 17 - 20 */ - 12, 12, 12, 12, /* 21 - 24 */ - 13, 13, 13, 13, 13, 13, 13, 13, /* 25 - 32 */ - 14, 14, 14, 14, 14, 14, 14, 14, /* 33 - 40 */ - 14, 14, 14, 14, 14, 14, 14, 14, /* 41 - 48 */ - 15, 15, 15, 15, 15, 15, 15, 15, /* 49 - 56 */ - 15, 15, 15, 15, 15, 15, 15, 15}; /* 57 - 64 */ - -/* map the sanitized data length to an appropriate data length code */ -u8 can_fd_len2dlc(u8 len) -{ - if (unlikely(len > 64)) - return 0xF; - - return len2dlc[len]; -} -EXPORT_SYMBOL_GPL(can_fd_len2dlc); - static void can_update_state_error_stats(struct net_device *dev, enum can_state new_state) { diff --git a/drivers/net/can/dev/length.c b/drivers/net/can/dev/length.c new file mode 100644 index 000000000000..6fe18aa23ec9 --- /dev/null +++ b/drivers/net/can/dev/length.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2012, 2020 Oliver Hartkopp + */ + +#include + +/* CAN DLC to real data length conversion helpers */ + +static const u8 dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 12, 16, 20, 24, 32, 48, 64}; + +/* get data length from raw data length code (DLC) */ +u8 can_fd_dlc2len(u8 dlc) +{ + return dlc2len[dlc & 0x0F]; +} +EXPORT_SYMBOL_GPL(can_fd_dlc2len); + +static const u8 len2dlc[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, /* 0 - 8 */ + 9, 9, 9, 9, /* 9 - 12 */ + 10, 10, 10, 10, /* 13 - 16 */ + 11, 11, 11, 11, /* 17 - 20 */ + 12, 12, 12, 12, /* 21 - 24 */ + 13, 13, 13, 13, 13, 13, 13, 13, /* 25 - 32 */ + 14, 14, 14, 14, 14, 14, 14, 14, /* 33 - 40 */ + 14, 14, 14, 14, 14, 14, 14, 14, /* 41 - 48 */ + 15, 15, 15, 15, 15, 15, 15, 15, /* 49 - 56 */ + 15, 15, 15, 15, 15, 15, 15, 15}; /* 57 - 64 */ + +/* map the sanitized data length to an appropriate data length code */ +u8 can_fd_len2dlc(u8 len) +{ + if (unlikely(len > 64)) + return 0xF; + + return len2dlc[len]; +} +EXPORT_SYMBOL_GPL(can_fd_len2dlc); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 054c3bed190b..d75fba1d030a 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -83,15 +84,6 @@ struct can_priv { #endif }; -/* - * can_cc_dlc2len(value) - convert a given data length code (dlc) of a - * Classical CAN frame into a valid data length of max. 8 bytes. - * - * To be used in the CAN netdriver receive path to ensure conformance with - * ISO 11898-1 Chapter 8.4.2.3 (DLC field) - */ -#define can_cc_dlc2len(dlc) (min_t(u8, (dlc), CAN_MAX_DLEN)) - /* Check for outgoing skbs that have not been created by the CAN subsystem */ static inline bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb) @@ -156,31 +148,6 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return skb->len == CANFD_MTU; } -/* helper to get the data length code (DLC) for Classical CAN raw DLC access */ -static inline u8 can_get_cc_dlc(const struct can_frame *cf, const u32 ctrlmode) -{ - /* return len8_dlc as dlc value only if all conditions apply */ - if ((ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC) && - (cf->len == CAN_MAX_DLEN) && - (cf->len8_dlc > CAN_MAX_DLEN && cf->len8_dlc <= CAN_MAX_RAW_DLC)) - return cf->len8_dlc; - - /* return the payload length as dlc value */ - return cf->len; -} - -/* helper to set len and len8_dlc value for Classical CAN raw DLC access */ -static inline void can_frame_set_cc_len(struct can_frame *cf, const u8 dlc, - const u32 ctrlmode) -{ - /* the caller already ensured that dlc is a value from 0 .. 15 */ - if (ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC && dlc > CAN_MAX_DLEN) - cf->len8_dlc = dlc; - - /* limit the payload length 'len' to CAN_MAX_DLEN */ - cf->len = can_cc_dlc2len(dlc); -} - /* helper to define static CAN controller features at device creation time */ static inline void can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) @@ -196,12 +163,6 @@ static inline void can_set_static_ctrlmode(struct net_device *dev, dev->mtu = CANFD_MTU; } -/* get data length from raw data length code (DLC) */ -u8 can_fd_dlc2len(u8 dlc); - -/* map the sanitized data length to an appropriate data length code */ -u8 can_fd_len2dlc(u8 len); - struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs); #define alloc_candev(sizeof_priv, echo_skb_max) \ diff --git a/include/linux/can/length.h b/include/linux/can/length.h new file mode 100644 index 000000000000..156b9d17969a --- /dev/null +++ b/include/linux/can/length.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 Oliver Hartkopp + */ + +#ifndef _CAN_LENGTH_H +#define _CAN_LENGTH_H + +/* + * can_cc_dlc2len(value) - convert a given data length code (dlc) of a + * Classical CAN frame into a valid data length of max. 8 bytes. + * + * To be used in the CAN netdriver receive path to ensure conformance with + * ISO 11898-1 Chapter 8.4.2.3 (DLC field) + */ +#define can_cc_dlc2len(dlc) (min_t(u8, (dlc), CAN_MAX_DLEN)) + +/* helper to get the data length code (DLC) for Classical CAN raw DLC access */ +static inline u8 can_get_cc_dlc(const struct can_frame *cf, const u32 ctrlmode) +{ + /* return len8_dlc as dlc value only if all conditions apply */ + if ((ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC) && + (cf->len == CAN_MAX_DLEN) && + (cf->len8_dlc > CAN_MAX_DLEN && cf->len8_dlc <= CAN_MAX_RAW_DLC)) + return cf->len8_dlc; + + /* return the payload length as dlc value */ + return cf->len; +} + +/* helper to set len and len8_dlc value for Classical CAN raw DLC access */ +static inline void can_frame_set_cc_len(struct can_frame *cf, const u8 dlc, + const u32 ctrlmode) +{ + /* the caller already ensured that dlc is a value from 0 .. 15 */ + if (ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC && dlc > CAN_MAX_DLEN) + cf->len8_dlc = dlc; + + /* limit the payload length 'len' to CAN_MAX_DLEN */ + cf->len = can_cc_dlc2len(dlc); +} + +/* get data length from raw data length code (DLC) */ +u8 can_fd_dlc2len(u8 dlc); + +/* map the sanitized data length to an appropriate data length code */ +u8 can_fd_len2dlc(u8 len); + +#endif /* !_CAN_LENGTH_H */ -- cgit v1.2.3 From 18f2dbfd2232212f53af9d249682f13a8335d54f Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:20 +0100 Subject: can: dev: move skb related into seperate file This patch moves the skb related code of the CAN device infrastructure into a separate file. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-6-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/Makefile | 1 + drivers/net/can/dev/dev.c | 213 ----------------------------------------- drivers/net/can/dev/skb.c | 220 +++++++++++++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 76 --------------- include/linux/can/skb.h | 77 +++++++++++++++ 5 files changed, 298 insertions(+), 289 deletions(-) create mode 100644 drivers/net/can/dev/skb.c (limited to 'include') diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile index 5c647951e06d..188bd53b113c 100644 --- a/drivers/net/can/dev/Makefile +++ b/drivers/net/can/dev/Makefile @@ -5,5 +5,6 @@ can-dev-y += bittiming.o can-dev-y += dev.o can-dev-y += length.o can-dev-y += rx-offload.o +can-dev-y += skb.o can-dev-$(CONFIG_CAN_LEDS) += led.o diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 3372e99d5db7..fe71ff9ef8c9 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -132,149 +132,6 @@ void can_change_state(struct net_device *dev, struct can_frame *cf, } EXPORT_SYMBOL_GPL(can_change_state); -/* Local echo of CAN messages - * - * CAN network devices *should* support a local echo functionality - * (see Documentation/networking/can.rst). To test the handling of CAN - * interfaces that do not support the local echo both driver types are - * implemented. In the case that the driver does not support the echo - * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core - * to perform the echo as a fallback solution. - */ -static void can_flush_echo_skb(struct net_device *dev) -{ - struct can_priv *priv = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - int i; - - for (i = 0; i < priv->echo_skb_max; i++) { - if (priv->echo_skb[i]) { - kfree_skb(priv->echo_skb[i]); - priv->echo_skb[i] = NULL; - stats->tx_dropped++; - stats->tx_aborted_errors++; - } - } -} - -/* Put the skb on the stack to be looped backed locally lateron - * - * The function is typically called in the start_xmit function - * of the device driver. The driver must protect access to - * priv->echo_skb, if necessary. - */ -int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, - unsigned int idx) -{ - struct can_priv *priv = netdev_priv(dev); - - BUG_ON(idx >= priv->echo_skb_max); - - /* check flag whether this packet has to be looped back */ - if (!(dev->flags & IFF_ECHO) || skb->pkt_type != PACKET_LOOPBACK || - (skb->protocol != htons(ETH_P_CAN) && - skb->protocol != htons(ETH_P_CANFD))) { - kfree_skb(skb); - return 0; - } - - if (!priv->echo_skb[idx]) { - skb = can_create_echo_skb(skb); - if (!skb) - return -ENOMEM; - - /* make settings for echo to reduce code in irq context */ - skb->pkt_type = PACKET_BROADCAST; - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->dev = dev; - - /* save this skb for tx interrupt echo handling */ - priv->echo_skb[idx] = skb; - } else { - /* locking problem with netif_stop_queue() ?? */ - netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx); - kfree_skb(skb); - return -EBUSY; - } - - return 0; -} -EXPORT_SYMBOL_GPL(can_put_echo_skb); - -struct sk_buff * -__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr) -{ - struct can_priv *priv = netdev_priv(dev); - - if (idx >= priv->echo_skb_max) { - netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", - __func__, idx, priv->echo_skb_max); - return NULL; - } - - if (priv->echo_skb[idx]) { - /* Using "struct canfd_frame::len" for the frame - * length is supported on both CAN and CANFD frames. - */ - struct sk_buff *skb = priv->echo_skb[idx]; - struct canfd_frame *cf = (struct canfd_frame *)skb->data; - - /* get the real payload length for netdev statistics */ - if (cf->can_id & CAN_RTR_FLAG) - *len_ptr = 0; - else - *len_ptr = cf->len; - - priv->echo_skb[idx] = NULL; - - return skb; - } - - return NULL; -} - -/* Get the skb from the stack and loop it back locally - * - * The function is typically called when the TX done interrupt - * is handled in the device driver. The driver must protect - * access to priv->echo_skb, if necessary. - */ -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) -{ - struct sk_buff *skb; - u8 len; - - skb = __can_get_echo_skb(dev, idx, &len); - if (!skb) - return 0; - - skb_get(skb); - if (netif_rx(skb) == NET_RX_SUCCESS) - dev_consume_skb_any(skb); - else - dev_kfree_skb_any(skb); - - return len; -} -EXPORT_SYMBOL_GPL(can_get_echo_skb); - -/* Remove the skb from the stack and free it. - * - * The function is typically called when TX failed. - */ -void can_free_echo_skb(struct net_device *dev, unsigned int idx) -{ - struct can_priv *priv = netdev_priv(dev); - - BUG_ON(idx >= priv->echo_skb_max); - - if (priv->echo_skb[idx]) { - dev_kfree_skb_any(priv->echo_skb[idx]); - priv->echo_skb[idx] = NULL; - } -} -EXPORT_SYMBOL_GPL(can_free_echo_skb); - /* CAN device restart for bus-off recovery */ static void can_restart(struct net_device *dev) { @@ -379,76 +236,6 @@ static void can_setup(struct net_device *dev) dev->features = NETIF_F_HW_CSUM; } -struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) -{ - struct sk_buff *skb; - - skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + - sizeof(struct can_frame)); - if (unlikely(!skb)) - return NULL; - - skb->protocol = htons(ETH_P_CAN); - skb->pkt_type = PACKET_BROADCAST; - skb->ip_summed = CHECKSUM_UNNECESSARY; - - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; - - *cf = skb_put_zero(skb, sizeof(struct can_frame)); - - return skb; -} -EXPORT_SYMBOL_GPL(alloc_can_skb); - -struct sk_buff *alloc_canfd_skb(struct net_device *dev, - struct canfd_frame **cfd) -{ - struct sk_buff *skb; - - skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + - sizeof(struct canfd_frame)); - if (unlikely(!skb)) - return NULL; - - skb->protocol = htons(ETH_P_CANFD); - skb->pkt_type = PACKET_BROADCAST; - skb->ip_summed = CHECKSUM_UNNECESSARY; - - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; - - *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); - - return skb; -} -EXPORT_SYMBOL_GPL(alloc_canfd_skb); - -struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) -{ - struct sk_buff *skb; - - skb = alloc_can_skb(dev, cf); - if (unlikely(!skb)) - return NULL; - - (*cf)->can_id = CAN_ERR_FLAG; - (*cf)->len = CAN_ERR_DLC; - - return skb; -} -EXPORT_SYMBOL_GPL(alloc_can_err_skb); - /* Allocate and setup space for the CAN network device */ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs) diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c new file mode 100644 index 000000000000..26cd597ff771 --- /dev/null +++ b/drivers/net/can/dev/skb.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix + * Copyright (C) 2006 Andrey Volkov, Varma Electronics + * Copyright (C) 2008-2009 Wolfgang Grandegger + */ + +#include + +/* Local echo of CAN messages + * + * CAN network devices *should* support a local echo functionality + * (see Documentation/networking/can.rst). To test the handling of CAN + * interfaces that do not support the local echo both driver types are + * implemented. In the case that the driver does not support the echo + * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core + * to perform the echo as a fallback solution. + */ +void can_flush_echo_skb(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; + int i; + + for (i = 0; i < priv->echo_skb_max; i++) { + if (priv->echo_skb[i]) { + kfree_skb(priv->echo_skb[i]); + priv->echo_skb[i] = NULL; + stats->tx_dropped++; + stats->tx_aborted_errors++; + } + } +} + +/* Put the skb on the stack to be looped backed locally lateron + * + * The function is typically called in the start_xmit function + * of the device driver. The driver must protect access to + * priv->echo_skb, if necessary. + */ +int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, + unsigned int idx) +{ + struct can_priv *priv = netdev_priv(dev); + + BUG_ON(idx >= priv->echo_skb_max); + + /* check flag whether this packet has to be looped back */ + if (!(dev->flags & IFF_ECHO) || skb->pkt_type != PACKET_LOOPBACK || + (skb->protocol != htons(ETH_P_CAN) && + skb->protocol != htons(ETH_P_CANFD))) { + kfree_skb(skb); + return 0; + } + + if (!priv->echo_skb[idx]) { + skb = can_create_echo_skb(skb); + if (!skb) + return -ENOMEM; + + /* make settings for echo to reduce code in irq context */ + skb->pkt_type = PACKET_BROADCAST; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->dev = dev; + + /* save this skb for tx interrupt echo handling */ + priv->echo_skb[idx] = skb; + } else { + /* locking problem with netif_stop_queue() ?? */ + netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx); + kfree_skb(skb); + return -EBUSY; + } + + return 0; +} +EXPORT_SYMBOL_GPL(can_put_echo_skb); + +struct sk_buff * +__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr) +{ + struct can_priv *priv = netdev_priv(dev); + + if (idx >= priv->echo_skb_max) { + netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", + __func__, idx, priv->echo_skb_max); + return NULL; + } + + if (priv->echo_skb[idx]) { + /* Using "struct canfd_frame::len" for the frame + * length is supported on both CAN and CANFD frames. + */ + struct sk_buff *skb = priv->echo_skb[idx]; + struct canfd_frame *cf = (struct canfd_frame *)skb->data; + + /* get the real payload length for netdev statistics */ + if (cf->can_id & CAN_RTR_FLAG) + *len_ptr = 0; + else + *len_ptr = cf->len; + + priv->echo_skb[idx] = NULL; + + return skb; + } + + return NULL; +} + +/* Get the skb from the stack and loop it back locally + * + * The function is typically called when the TX done interrupt + * is handled in the device driver. The driver must protect + * access to priv->echo_skb, if necessary. + */ +unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) +{ + struct sk_buff *skb; + u8 len; + + skb = __can_get_echo_skb(dev, idx, &len); + if (!skb) + return 0; + + skb_get(skb); + if (netif_rx(skb) == NET_RX_SUCCESS) + dev_consume_skb_any(skb); + else + dev_kfree_skb_any(skb); + + return len; +} +EXPORT_SYMBOL_GPL(can_get_echo_skb); + +/* Remove the skb from the stack and free it. + * + * The function is typically called when TX failed. + */ +void can_free_echo_skb(struct net_device *dev, unsigned int idx) +{ + struct can_priv *priv = netdev_priv(dev); + + BUG_ON(idx >= priv->echo_skb_max); + + if (priv->echo_skb[idx]) { + dev_kfree_skb_any(priv->echo_skb[idx]); + priv->echo_skb[idx] = NULL; + } +} +EXPORT_SYMBOL_GPL(can_free_echo_skb); + +struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) +{ + struct sk_buff *skb; + + skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + + sizeof(struct can_frame)); + if (unlikely(!skb)) + return NULL; + + skb->protocol = htons(ETH_P_CAN); + skb->pkt_type = PACKET_BROADCAST; + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + *cf = skb_put_zero(skb, sizeof(struct can_frame)); + + return skb; +} +EXPORT_SYMBOL_GPL(alloc_can_skb); + +struct sk_buff *alloc_canfd_skb(struct net_device *dev, + struct canfd_frame **cfd) +{ + struct sk_buff *skb; + + skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + + sizeof(struct canfd_frame)); + if (unlikely(!skb)) + return NULL; + + skb->protocol = htons(ETH_P_CANFD); + skb->pkt_type = PACKET_BROADCAST; + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); + + return skb; +} +EXPORT_SYMBOL_GPL(alloc_canfd_skb); + +struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) +{ + struct sk_buff *skb; + + skb = alloc_can_skb(dev, cf); + if (unlikely(!skb)) + return NULL; + + (*cf)->can_id = CAN_ERR_FLAG; + (*cf)->len = CAN_ERR_DLC; + + return skb; +} +EXPORT_SYMBOL_GPL(alloc_can_err_skb); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index d75fba1d030a..4a26e128af7f 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -84,69 +84,6 @@ struct can_priv { #endif }; -/* Check for outgoing skbs that have not been created by the CAN subsystem */ -static inline bool can_skb_headroom_valid(struct net_device *dev, - struct sk_buff *skb) -{ - /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ - if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) - return false; - - /* af_packet does not apply CAN skb specific settings */ - if (skb->ip_summed == CHECKSUM_NONE) { - /* init headroom */ - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; - - /* perform proper loopback on capable devices */ - if (dev->flags & IFF_ECHO) - skb->pkt_type = PACKET_LOOPBACK; - else - skb->pkt_type = PACKET_HOST; - - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - } - - return true; -} - -/* Drop a given socketbuffer if it does not contain a valid CAN frame. */ -static inline bool can_dropped_invalid_skb(struct net_device *dev, - struct sk_buff *skb) -{ - const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - - if (skb->protocol == htons(ETH_P_CAN)) { - if (unlikely(skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN)) - goto inval_skb; - } else if (skb->protocol == htons(ETH_P_CANFD)) { - if (unlikely(skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN)) - goto inval_skb; - } else - goto inval_skb; - - if (!can_skb_headroom_valid(dev, skb)) - goto inval_skb; - - return false; - -inval_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return true; -} - -static inline bool can_is_canfd_skb(const struct sk_buff *skb) -{ - /* the CAN specific type of skb is identified by its data length */ - return skb->len == CANFD_MTU; -} /* helper to define static CAN controller features at device creation time */ static inline void can_set_static_ctrlmode(struct net_device *dev, @@ -187,23 +124,10 @@ void can_bus_off(struct net_device *dev); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state); -int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, - unsigned int idx); -struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, - u8 *len_ptr); -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); -void can_free_echo_skb(struct net_device *dev, unsigned int idx); - #ifdef CONFIG_OF void of_can_transceiver(struct net_device *dev); #else static inline void of_can_transceiver(struct net_device *dev) { } #endif -struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); -struct sk_buff *alloc_canfd_skb(struct net_device *dev, - struct canfd_frame **cfd); -struct sk_buff *alloc_can_err_skb(struct net_device *dev, - struct can_frame **cf); - #endif /* !_CAN_DEV_H */ diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index fc61cf4eff1c..c90ebbd3008c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -16,6 +16,19 @@ #include #include +void can_flush_echo_skb(struct net_device *dev); +int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, + unsigned int idx); +struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, + u8 *len_ptr); +unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); +void can_free_echo_skb(struct net_device *dev, unsigned int idx); +struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); +struct sk_buff *alloc_canfd_skb(struct net_device *dev, + struct canfd_frame **cfd); +struct sk_buff *alloc_can_err_skb(struct net_device *dev, + struct can_frame **cf); + /* * The struct can_skb_priv is used to transport additional information along * with the stored struct can(fd)_frame that can not be contained in existing @@ -74,4 +87,68 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) return nskb; } +/* Check for outgoing skbs that have not been created by the CAN subsystem */ +static inline bool can_skb_headroom_valid(struct net_device *dev, + struct sk_buff *skb) +{ + /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ + if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) + return false; + + /* af_packet does not apply CAN skb specific settings */ + if (skb->ip_summed == CHECKSUM_NONE) { + /* init headroom */ + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* perform proper loopback on capable devices */ + if (dev->flags & IFF_ECHO) + skb->pkt_type = PACKET_LOOPBACK; + else + skb->pkt_type = PACKET_HOST; + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + } + + return true; +} + +/* Drop a given socketbuffer if it does not contain a valid CAN frame. */ +static inline bool can_dropped_invalid_skb(struct net_device *dev, + struct sk_buff *skb) +{ + const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + + if (skb->protocol == htons(ETH_P_CAN)) { + if (unlikely(skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) + goto inval_skb; + } else if (skb->protocol == htons(ETH_P_CANFD)) { + if (unlikely(skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) + goto inval_skb; + } else + goto inval_skb; + + if (!can_skb_headroom_valid(dev, skb)) + goto inval_skb; + + return false; + +inval_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return true; +} + +static inline bool can_is_canfd_skb(const struct sk_buff *skb) +{ + /* the CAN specific type of skb is identified by its data length */ + return skb->len == CANFD_MTU; +} + #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 0a042c6ec991e4d33062ee52e9e23f615bc659f9 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:21 +0100 Subject: can: dev: move netlink related code into seperate file This patch moves the netlink related code of the CAN device infrastructure into a separate file. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-7-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/Makefile | 1 + drivers/net/can/dev/dev.c | 370 +---------------------------------------- drivers/net/can/dev/netlink.c | 379 ++++++++++++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 6 + 4 files changed, 389 insertions(+), 367 deletions(-) create mode 100644 drivers/net/can/dev/netlink.c (limited to 'include') diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile index 188bd53b113c..3e2e207861fc 100644 --- a/drivers/net/can/dev/Makefile +++ b/drivers/net/can/dev/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_CAN_DEV) += can-dev.o can-dev-y += bittiming.o can-dev-y += dev.o can-dev-y += length.o +can-dev-y += netlink.o can-dev-y += rx-offload.o can-dev-y += skb.o diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index fe71ff9ef8c9..f580f0ac6497 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -14,10 +14,8 @@ #include #include #include -#include #include #include -#include #define MOD_DESC "CAN device driver interface" @@ -223,7 +221,7 @@ void can_bus_off(struct net_device *dev) } EXPORT_SYMBOL_GPL(can_bus_off); -static void can_setup(struct net_device *dev) +void can_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CAN_MTU; @@ -399,368 +397,6 @@ void close_candev(struct net_device *dev) } EXPORT_SYMBOL_GPL(close_candev); -/* CAN netlink interface */ -static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = { - [IFLA_CAN_STATE] = { .type = NLA_U32 }, - [IFLA_CAN_CTRLMODE] = { .len = sizeof(struct can_ctrlmode) }, - [IFLA_CAN_RESTART_MS] = { .type = NLA_U32 }, - [IFLA_CAN_RESTART] = { .type = NLA_U32 }, - [IFLA_CAN_BITTIMING] = { .len = sizeof(struct can_bittiming) }, - [IFLA_CAN_BITTIMING_CONST] - = { .len = sizeof(struct can_bittiming_const) }, - [IFLA_CAN_CLOCK] = { .len = sizeof(struct can_clock) }, - [IFLA_CAN_BERR_COUNTER] = { .len = sizeof(struct can_berr_counter) }, - [IFLA_CAN_DATA_BITTIMING] - = { .len = sizeof(struct can_bittiming) }, - [IFLA_CAN_DATA_BITTIMING_CONST] - = { .len = sizeof(struct can_bittiming_const) }, - [IFLA_CAN_TERMINATION] = { .type = NLA_U16 }, -}; - -static int can_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - bool is_can_fd = false; - - /* Make sure that valid CAN FD configurations always consist of - * - nominal/arbitration bittiming - * - data bittiming - * - control mode with CAN_CTRLMODE_FD set - */ - - if (!data) - return 0; - - if (data[IFLA_CAN_CTRLMODE]) { - struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); - - is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD; - } - - if (is_can_fd) { - if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING]) - return -EOPNOTSUPP; - } - - if (data[IFLA_CAN_DATA_BITTIMING]) { - if (!is_can_fd || !data[IFLA_CAN_BITTIMING]) - return -EOPNOTSUPP; - } - - return 0; -} - -static int can_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - struct can_priv *priv = netdev_priv(dev); - int err; - - /* We need synchronization with dev->stop() */ - ASSERT_RTNL(); - - if (data[IFLA_CAN_BITTIMING]) { - struct can_bittiming bt; - - /* Do not allow changing bittiming while running */ - if (dev->flags & IFF_UP) - return -EBUSY; - - /* Calculate bittiming parameters based on - * bittiming_const if set, otherwise pass bitrate - * directly via do_set_bitrate(). Bail out if neither - * is given. - */ - if (!priv->bittiming_const && !priv->do_set_bittiming) - return -EOPNOTSUPP; - - memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt)); - err = can_get_bittiming(dev, &bt, - priv->bittiming_const, - priv->bitrate_const, - priv->bitrate_const_cnt); - if (err) - return err; - - if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) { - netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n", - priv->bitrate_max); - return -EINVAL; - } - - memcpy(&priv->bittiming, &bt, sizeof(bt)); - - if (priv->do_set_bittiming) { - /* Finally, set the bit-timing registers */ - err = priv->do_set_bittiming(dev); - if (err) - return err; - } - } - - if (data[IFLA_CAN_CTRLMODE]) { - struct can_ctrlmode *cm; - u32 ctrlstatic; - u32 maskedflags; - - /* Do not allow changing controller mode while running */ - if (dev->flags & IFF_UP) - return -EBUSY; - cm = nla_data(data[IFLA_CAN_CTRLMODE]); - ctrlstatic = priv->ctrlmode_static; - maskedflags = cm->flags & cm->mask; - - /* check whether provided bits are allowed to be passed */ - if (cm->mask & ~(priv->ctrlmode_supported | ctrlstatic)) - return -EOPNOTSUPP; - - /* do not check for static fd-non-iso if 'fd' is disabled */ - if (!(maskedflags & CAN_CTRLMODE_FD)) - ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO; - - /* make sure static options are provided by configuration */ - if ((maskedflags & ctrlstatic) != ctrlstatic) - return -EOPNOTSUPP; - - /* clear bits to be modified and copy the flag values */ - priv->ctrlmode &= ~cm->mask; - priv->ctrlmode |= maskedflags; - - /* CAN_CTRLMODE_FD can only be set when driver supports FD */ - if (priv->ctrlmode & CAN_CTRLMODE_FD) - dev->mtu = CANFD_MTU; - else - dev->mtu = CAN_MTU; - } - - if (data[IFLA_CAN_RESTART_MS]) { - /* Do not allow changing restart delay while running */ - if (dev->flags & IFF_UP) - return -EBUSY; - priv->restart_ms = nla_get_u32(data[IFLA_CAN_RESTART_MS]); - } - - if (data[IFLA_CAN_RESTART]) { - /* Do not allow a restart while not running */ - if (!(dev->flags & IFF_UP)) - return -EINVAL; - err = can_restart_now(dev); - if (err) - return err; - } - - if (data[IFLA_CAN_DATA_BITTIMING]) { - struct can_bittiming dbt; - - /* Do not allow changing bittiming while running */ - if (dev->flags & IFF_UP) - return -EBUSY; - - /* Calculate bittiming parameters based on - * data_bittiming_const if set, otherwise pass bitrate - * directly via do_set_bitrate(). Bail out if neither - * is given. - */ - if (!priv->data_bittiming_const && !priv->do_set_data_bittiming) - return -EOPNOTSUPP; - - memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]), - sizeof(dbt)); - err = can_get_bittiming(dev, &dbt, - priv->data_bittiming_const, - priv->data_bitrate_const, - priv->data_bitrate_const_cnt); - if (err) - return err; - - if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) { - netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n", - priv->bitrate_max); - return -EINVAL; - } - - memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); - - if (priv->do_set_data_bittiming) { - /* Finally, set the bit-timing registers */ - err = priv->do_set_data_bittiming(dev); - if (err) - return err; - } - } - - if (data[IFLA_CAN_TERMINATION]) { - const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]); - const unsigned int num_term = priv->termination_const_cnt; - unsigned int i; - - if (!priv->do_set_termination) - return -EOPNOTSUPP; - - /* check whether given value is supported by the interface */ - for (i = 0; i < num_term; i++) { - if (termval == priv->termination_const[i]) - break; - } - if (i >= num_term) - return -EINVAL; - - /* Finally, set the termination value */ - err = priv->do_set_termination(dev, termval); - if (err) - return err; - - priv->termination = termval; - } - - return 0; -} - -static size_t can_get_size(const struct net_device *dev) -{ - struct can_priv *priv = netdev_priv(dev); - size_t size = 0; - - if (priv->bittiming.bitrate) /* IFLA_CAN_BITTIMING */ - size += nla_total_size(sizeof(struct can_bittiming)); - if (priv->bittiming_const) /* IFLA_CAN_BITTIMING_CONST */ - size += nla_total_size(sizeof(struct can_bittiming_const)); - size += nla_total_size(sizeof(struct can_clock)); /* IFLA_CAN_CLOCK */ - size += nla_total_size(sizeof(u32)); /* IFLA_CAN_STATE */ - size += nla_total_size(sizeof(struct can_ctrlmode)); /* IFLA_CAN_CTRLMODE */ - size += nla_total_size(sizeof(u32)); /* IFLA_CAN_RESTART_MS */ - if (priv->do_get_berr_counter) /* IFLA_CAN_BERR_COUNTER */ - size += nla_total_size(sizeof(struct can_berr_counter)); - if (priv->data_bittiming.bitrate) /* IFLA_CAN_DATA_BITTIMING */ - size += nla_total_size(sizeof(struct can_bittiming)); - if (priv->data_bittiming_const) /* IFLA_CAN_DATA_BITTIMING_CONST */ - size += nla_total_size(sizeof(struct can_bittiming_const)); - if (priv->termination_const) { - size += nla_total_size(sizeof(priv->termination)); /* IFLA_CAN_TERMINATION */ - size += nla_total_size(sizeof(*priv->termination_const) * /* IFLA_CAN_TERMINATION_CONST */ - priv->termination_const_cnt); - } - if (priv->bitrate_const) /* IFLA_CAN_BITRATE_CONST */ - size += nla_total_size(sizeof(*priv->bitrate_const) * - priv->bitrate_const_cnt); - if (priv->data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ - size += nla_total_size(sizeof(*priv->data_bitrate_const) * - priv->data_bitrate_const_cnt); - size += sizeof(priv->bitrate_max); /* IFLA_CAN_BITRATE_MAX */ - - return size; -} - -static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct can_priv *priv = netdev_priv(dev); - struct can_ctrlmode cm = {.flags = priv->ctrlmode}; - struct can_berr_counter bec; - enum can_state state = priv->state; - - if (priv->do_get_state) - priv->do_get_state(dev, &state); - - if ((priv->bittiming.bitrate && - nla_put(skb, IFLA_CAN_BITTIMING, - sizeof(priv->bittiming), &priv->bittiming)) || - - (priv->bittiming_const && - nla_put(skb, IFLA_CAN_BITTIMING_CONST, - sizeof(*priv->bittiming_const), priv->bittiming_const)) || - - nla_put(skb, IFLA_CAN_CLOCK, sizeof(priv->clock), &priv->clock) || - nla_put_u32(skb, IFLA_CAN_STATE, state) || - nla_put(skb, IFLA_CAN_CTRLMODE, sizeof(cm), &cm) || - nla_put_u32(skb, IFLA_CAN_RESTART_MS, priv->restart_ms) || - - (priv->do_get_berr_counter && - !priv->do_get_berr_counter(dev, &bec) && - nla_put(skb, IFLA_CAN_BERR_COUNTER, sizeof(bec), &bec)) || - - (priv->data_bittiming.bitrate && - nla_put(skb, IFLA_CAN_DATA_BITTIMING, - sizeof(priv->data_bittiming), &priv->data_bittiming)) || - - (priv->data_bittiming_const && - nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST, - sizeof(*priv->data_bittiming_const), - priv->data_bittiming_const)) || - - (priv->termination_const && - (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) || - nla_put(skb, IFLA_CAN_TERMINATION_CONST, - sizeof(*priv->termination_const) * - priv->termination_const_cnt, - priv->termination_const))) || - - (priv->bitrate_const && - nla_put(skb, IFLA_CAN_BITRATE_CONST, - sizeof(*priv->bitrate_const) * - priv->bitrate_const_cnt, - priv->bitrate_const)) || - - (priv->data_bitrate_const && - nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST, - sizeof(*priv->data_bitrate_const) * - priv->data_bitrate_const_cnt, - priv->data_bitrate_const)) || - - (nla_put(skb, IFLA_CAN_BITRATE_MAX, - sizeof(priv->bitrate_max), - &priv->bitrate_max)) - ) - - return -EMSGSIZE; - - return 0; -} - -static size_t can_get_xstats_size(const struct net_device *dev) -{ - return sizeof(struct can_device_stats); -} - -static int can_fill_xstats(struct sk_buff *skb, const struct net_device *dev) -{ - struct can_priv *priv = netdev_priv(dev); - - if (nla_put(skb, IFLA_INFO_XSTATS, - sizeof(priv->can_stats), &priv->can_stats)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static int can_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - return -EOPNOTSUPP; -} - -static void can_dellink(struct net_device *dev, struct list_head *head) -{ -} - -static struct rtnl_link_ops can_link_ops __read_mostly = { - .kind = "can", - .maxtype = IFLA_CAN_MAX, - .policy = can_policy, - .setup = can_setup, - .validate = can_validate, - .newlink = can_newlink, - .changelink = can_changelink, - .dellink = can_dellink, - .get_size = can_get_size, - .fill_info = can_fill_info, - .get_xstats_size = can_get_xstats_size, - .fill_xstats = can_fill_xstats, -}; - /* Register the CAN network device */ int register_candev(struct net_device *dev) { @@ -812,7 +448,7 @@ static __init int can_dev_init(void) can_led_notifier_init(); - err = rtnl_link_register(&can_link_ops); + err = can_netlink_register(); if (!err) pr_info(MOD_DESC "\n"); @@ -822,7 +458,7 @@ module_init(can_dev_init); static __exit void can_dev_exit(void) { - rtnl_link_unregister(&can_link_ops); + can_netlink_unregister(); can_led_notifier_exit(); } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c new file mode 100644 index 000000000000..3ae884cdf677 --- /dev/null +++ b/drivers/net/can/dev/netlink.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix + * Copyright (C) 2006 Andrey Volkov, Varma Electronics + * Copyright (C) 2008-2009 Wolfgang Grandegger + */ + +#include +#include + +static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = { + [IFLA_CAN_STATE] = { .type = NLA_U32 }, + [IFLA_CAN_CTRLMODE] = { .len = sizeof(struct can_ctrlmode) }, + [IFLA_CAN_RESTART_MS] = { .type = NLA_U32 }, + [IFLA_CAN_RESTART] = { .type = NLA_U32 }, + [IFLA_CAN_BITTIMING] = { .len = sizeof(struct can_bittiming) }, + [IFLA_CAN_BITTIMING_CONST] + = { .len = sizeof(struct can_bittiming_const) }, + [IFLA_CAN_CLOCK] = { .len = sizeof(struct can_clock) }, + [IFLA_CAN_BERR_COUNTER] = { .len = sizeof(struct can_berr_counter) }, + [IFLA_CAN_DATA_BITTIMING] + = { .len = sizeof(struct can_bittiming) }, + [IFLA_CAN_DATA_BITTIMING_CONST] + = { .len = sizeof(struct can_bittiming_const) }, + [IFLA_CAN_TERMINATION] = { .type = NLA_U16 }, +}; + +static int can_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + bool is_can_fd = false; + + /* Make sure that valid CAN FD configurations always consist of + * - nominal/arbitration bittiming + * - data bittiming + * - control mode with CAN_CTRLMODE_FD set + */ + + if (!data) + return 0; + + if (data[IFLA_CAN_CTRLMODE]) { + struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); + + is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD; + } + + if (is_can_fd) { + if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING]) + return -EOPNOTSUPP; + } + + if (data[IFLA_CAN_DATA_BITTIMING]) { + if (!is_can_fd || !data[IFLA_CAN_BITTIMING]) + return -EOPNOTSUPP; + } + + return 0; +} + +static int can_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct can_priv *priv = netdev_priv(dev); + int err; + + /* We need synchronization with dev->stop() */ + ASSERT_RTNL(); + + if (data[IFLA_CAN_BITTIMING]) { + struct can_bittiming bt; + + /* Do not allow changing bittiming while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + + /* Calculate bittiming parameters based on + * bittiming_const if set, otherwise pass bitrate + * directly via do_set_bitrate(). Bail out if neither + * is given. + */ + if (!priv->bittiming_const && !priv->do_set_bittiming) + return -EOPNOTSUPP; + + memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt)); + err = can_get_bittiming(dev, &bt, + priv->bittiming_const, + priv->bitrate_const, + priv->bitrate_const_cnt); + if (err) + return err; + + if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) { + netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n", + priv->bitrate_max); + return -EINVAL; + } + + memcpy(&priv->bittiming, &bt, sizeof(bt)); + + if (priv->do_set_bittiming) { + /* Finally, set the bit-timing registers */ + err = priv->do_set_bittiming(dev); + if (err) + return err; + } + } + + if (data[IFLA_CAN_CTRLMODE]) { + struct can_ctrlmode *cm; + u32 ctrlstatic; + u32 maskedflags; + + /* Do not allow changing controller mode while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + cm = nla_data(data[IFLA_CAN_CTRLMODE]); + ctrlstatic = priv->ctrlmode_static; + maskedflags = cm->flags & cm->mask; + + /* check whether provided bits are allowed to be passed */ + if (cm->mask & ~(priv->ctrlmode_supported | ctrlstatic)) + return -EOPNOTSUPP; + + /* do not check for static fd-non-iso if 'fd' is disabled */ + if (!(maskedflags & CAN_CTRLMODE_FD)) + ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO; + + /* make sure static options are provided by configuration */ + if ((maskedflags & ctrlstatic) != ctrlstatic) + return -EOPNOTSUPP; + + /* clear bits to be modified and copy the flag values */ + priv->ctrlmode &= ~cm->mask; + priv->ctrlmode |= maskedflags; + + /* CAN_CTRLMODE_FD can only be set when driver supports FD */ + if (priv->ctrlmode & CAN_CTRLMODE_FD) + dev->mtu = CANFD_MTU; + else + dev->mtu = CAN_MTU; + } + + if (data[IFLA_CAN_RESTART_MS]) { + /* Do not allow changing restart delay while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + priv->restart_ms = nla_get_u32(data[IFLA_CAN_RESTART_MS]); + } + + if (data[IFLA_CAN_RESTART]) { + /* Do not allow a restart while not running */ + if (!(dev->flags & IFF_UP)) + return -EINVAL; + err = can_restart_now(dev); + if (err) + return err; + } + + if (data[IFLA_CAN_DATA_BITTIMING]) { + struct can_bittiming dbt; + + /* Do not allow changing bittiming while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + + /* Calculate bittiming parameters based on + * data_bittiming_const if set, otherwise pass bitrate + * directly via do_set_bitrate(). Bail out if neither + * is given. + */ + if (!priv->data_bittiming_const && !priv->do_set_data_bittiming) + return -EOPNOTSUPP; + + memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]), + sizeof(dbt)); + err = can_get_bittiming(dev, &dbt, + priv->data_bittiming_const, + priv->data_bitrate_const, + priv->data_bitrate_const_cnt); + if (err) + return err; + + if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) { + netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n", + priv->bitrate_max); + return -EINVAL; + } + + memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); + + if (priv->do_set_data_bittiming) { + /* Finally, set the bit-timing registers */ + err = priv->do_set_data_bittiming(dev); + if (err) + return err; + } + } + + if (data[IFLA_CAN_TERMINATION]) { + const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]); + const unsigned int num_term = priv->termination_const_cnt; + unsigned int i; + + if (!priv->do_set_termination) + return -EOPNOTSUPP; + + /* check whether given value is supported by the interface */ + for (i = 0; i < num_term; i++) { + if (termval == priv->termination_const[i]) + break; + } + if (i >= num_term) + return -EINVAL; + + /* Finally, set the termination value */ + err = priv->do_set_termination(dev, termval); + if (err) + return err; + + priv->termination = termval; + } + + return 0; +} + +static size_t can_get_size(const struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + size_t size = 0; + + if (priv->bittiming.bitrate) /* IFLA_CAN_BITTIMING */ + size += nla_total_size(sizeof(struct can_bittiming)); + if (priv->bittiming_const) /* IFLA_CAN_BITTIMING_CONST */ + size += nla_total_size(sizeof(struct can_bittiming_const)); + size += nla_total_size(sizeof(struct can_clock)); /* IFLA_CAN_CLOCK */ + size += nla_total_size(sizeof(u32)); /* IFLA_CAN_STATE */ + size += nla_total_size(sizeof(struct can_ctrlmode)); /* IFLA_CAN_CTRLMODE */ + size += nla_total_size(sizeof(u32)); /* IFLA_CAN_RESTART_MS */ + if (priv->do_get_berr_counter) /* IFLA_CAN_BERR_COUNTER */ + size += nla_total_size(sizeof(struct can_berr_counter)); + if (priv->data_bittiming.bitrate) /* IFLA_CAN_DATA_BITTIMING */ + size += nla_total_size(sizeof(struct can_bittiming)); + if (priv->data_bittiming_const) /* IFLA_CAN_DATA_BITTIMING_CONST */ + size += nla_total_size(sizeof(struct can_bittiming_const)); + if (priv->termination_const) { + size += nla_total_size(sizeof(priv->termination)); /* IFLA_CAN_TERMINATION */ + size += nla_total_size(sizeof(*priv->termination_const) * /* IFLA_CAN_TERMINATION_CONST */ + priv->termination_const_cnt); + } + if (priv->bitrate_const) /* IFLA_CAN_BITRATE_CONST */ + size += nla_total_size(sizeof(*priv->bitrate_const) * + priv->bitrate_const_cnt); + if (priv->data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ + size += nla_total_size(sizeof(*priv->data_bitrate_const) * + priv->data_bitrate_const_cnt); + size += sizeof(priv->bitrate_max); /* IFLA_CAN_BITRATE_MAX */ + + return size; +} + +static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + struct can_ctrlmode cm = {.flags = priv->ctrlmode}; + struct can_berr_counter bec; + enum can_state state = priv->state; + + if (priv->do_get_state) + priv->do_get_state(dev, &state); + + if ((priv->bittiming.bitrate && + nla_put(skb, IFLA_CAN_BITTIMING, + sizeof(priv->bittiming), &priv->bittiming)) || + + (priv->bittiming_const && + nla_put(skb, IFLA_CAN_BITTIMING_CONST, + sizeof(*priv->bittiming_const), priv->bittiming_const)) || + + nla_put(skb, IFLA_CAN_CLOCK, sizeof(priv->clock), &priv->clock) || + nla_put_u32(skb, IFLA_CAN_STATE, state) || + nla_put(skb, IFLA_CAN_CTRLMODE, sizeof(cm), &cm) || + nla_put_u32(skb, IFLA_CAN_RESTART_MS, priv->restart_ms) || + + (priv->do_get_berr_counter && + !priv->do_get_berr_counter(dev, &bec) && + nla_put(skb, IFLA_CAN_BERR_COUNTER, sizeof(bec), &bec)) || + + (priv->data_bittiming.bitrate && + nla_put(skb, IFLA_CAN_DATA_BITTIMING, + sizeof(priv->data_bittiming), &priv->data_bittiming)) || + + (priv->data_bittiming_const && + nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST, + sizeof(*priv->data_bittiming_const), + priv->data_bittiming_const)) || + + (priv->termination_const && + (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) || + nla_put(skb, IFLA_CAN_TERMINATION_CONST, + sizeof(*priv->termination_const) * + priv->termination_const_cnt, + priv->termination_const))) || + + (priv->bitrate_const && + nla_put(skb, IFLA_CAN_BITRATE_CONST, + sizeof(*priv->bitrate_const) * + priv->bitrate_const_cnt, + priv->bitrate_const)) || + + (priv->data_bitrate_const && + nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST, + sizeof(*priv->data_bitrate_const) * + priv->data_bitrate_const_cnt, + priv->data_bitrate_const)) || + + (nla_put(skb, IFLA_CAN_BITRATE_MAX, + sizeof(priv->bitrate_max), + &priv->bitrate_max)) + ) + + return -EMSGSIZE; + + return 0; +} + +static size_t can_get_xstats_size(const struct net_device *dev) +{ + return sizeof(struct can_device_stats); +} + +static int can_fill_xstats(struct sk_buff *skb, const struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + if (nla_put(skb, IFLA_INFO_XSTATS, + sizeof(priv->can_stats), &priv->can_stats)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int can_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static void can_dellink(struct net_device *dev, struct list_head *head) +{ +} + +struct rtnl_link_ops can_link_ops __read_mostly = { + .kind = "can", + .maxtype = IFLA_CAN_MAX, + .policy = can_policy, + .setup = can_setup, + .validate = can_validate, + .newlink = can_newlink, + .changelink = can_changelink, + .dellink = can_dellink, + .get_size = can_get_size, + .fill_info = can_fill_info, + .get_xstats_size = can_get_xstats_size, + .fill_xstats = can_fill_xstats, +}; + +int can_netlink_register(void) +{ + return rtnl_link_register(&can_link_ops); +} + +void can_netlink_unregister(void) +{ + rtnl_link_unregister(&can_link_ops); +} diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 4a26e128af7f..7faf6a37d5b2 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -100,6 +100,8 @@ static inline void can_set_static_ctrlmode(struct net_device *dev, dev->mtu = CANFD_MTU; } +void can_setup(struct net_device *dev); + struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs); #define alloc_candev(sizeof_priv, echo_skb_max) \ @@ -130,4 +132,8 @@ void of_can_transceiver(struct net_device *dev); static inline void of_can_transceiver(struct net_device *dev) { } #endif +extern struct rtnl_link_ops can_link_ops; +int can_netlink_register(void); +void can_netlink_unregister(void); + #endif /* !_CAN_DEV_H */ -- cgit v1.2.3 From 838b00a2260ab6ec104a2a5696e619c19e8cd9be Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Mon, 11 Jan 2021 23:05:24 -0800 Subject: net/mlx5: Add HW definition of reg_c_preserve Add capability bit to test whether reg_c value is preserved on recirculation. Signed-off-by: Paul Blakey Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 442c0160caab..823411e288c0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1278,7 +1278,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_a0[0x3]; u8 ece_support[0x1]; - u8 reserved_at_a4[0x7]; + u8 reserved_at_a4[0x5]; + u8 reg_c_preserve[0x1]; + u8 reserved_at_aa[0x1]; u8 log_max_srq[0x5]; u8 reserved_at_b0[0x1]; u8 uplink_follow[0x1]; -- cgit v1.2.3 From 99b7beb0431a4b9728023e9169ed15af678d2c7f Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:24 +0100 Subject: can: length: canfd_sanitize_len(): add function to sanitize CAN-FD data length The data field in CAN-FD frames have specifig frame length (0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 16, 20, 24, 32, 48, 64). This function "rounds" up a given length to the next valid CAN-FD frame length. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-10-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/length.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/can/length.h b/include/linux/can/length.h index 156b9d17969a..b2313b2a0b02 100644 --- a/include/linux/can/length.h +++ b/include/linux/can/length.h @@ -45,4 +45,10 @@ u8 can_fd_dlc2len(u8 dlc); /* map the sanitized data length to an appropriate data length code */ u8 can_fd_len2dlc(u8 len); +/* map the data length to an appropriate data link layer length */ +static inline u8 canfd_sanitize_len(u8 len) +{ + return can_fd_dlc2len(can_fd_len2dlc(len)); +} + #endif /* !_CAN_LENGTH_H */ -- cgit v1.2.3 From 85d99c3e2a13d542445342a1383a686dadeaac3d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Mon, 11 Jan 2021 15:19:25 +0100 Subject: can: length: can_skb_get_frame_len(): introduce function to get data length of frame in data link layer This patch adds the function can_skb_get_frame_len() which returns the length of a CAN frame on the data link layer, including Start-of-frame, Identifier, various other bits, the actual data, the CRC, the End-of-frame, the Inter frame spacing. Co-developed-by: Arunachalam Santhanam Signed-off-by: Arunachalam Santhanam Co-developed-by: Vincent Mailhol Signed-off-by: Vincent Mailhol Acked-by: Vincent Mailhol Reviewed-by: Vincent Mailhol Co-developed-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20210111141930.693847-11-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/length.c | 50 ++++++++++++++++++ include/linux/can/length.h | 120 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) (limited to 'include') diff --git a/drivers/net/can/dev/length.c b/drivers/net/can/dev/length.c index 64673a8d1234..d35c4e82314d 100644 --- a/drivers/net/can/dev/length.c +++ b/drivers/net/can/dev/length.c @@ -38,3 +38,53 @@ u8 can_fd_len2dlc(u8 len) return len2dlc[len]; } EXPORT_SYMBOL_GPL(can_fd_len2dlc); + +/** + * can_skb_get_frame_len() - Calculate the CAN Frame length in bytes + * of a given skb. + * @skb: socket buffer of a CAN message. + * + * Do a rough calculation: bit stuffing is ignored and length in bits + * is rounded up to a length in bytes. + * + * Rationale: this function is to be used for the BQL functions + * (netdev_sent_queue() and netdev_completed_queue()) which expect a + * value in bytes. Just using skb->len is insufficient because it will + * return the constant value of CAN(FD)_MTU. Doing the bit stuffing + * calculation would be too expensive in term of computing resources + * for no noticeable gain. + * + * Remarks: The payload of CAN FD frames with BRS flag are sent at a + * different bitrate. Currently, the can-utils canbusload tool does + * not support CAN-FD yet and so we could not run any benchmark to + * measure the impact. There might be possible improvement here. + * + * Return: length in bytes. + */ +unsigned int can_skb_get_frame_len(const struct sk_buff *skb) +{ + const struct canfd_frame *cf = (const struct canfd_frame *)skb->data; + u8 len; + + if (can_is_canfd_skb(skb)) + len = canfd_sanitize_len(cf->len); + else if (cf->can_id & CAN_RTR_FLAG) + len = 0; + else + len = cf->len; + + if (can_is_canfd_skb(skb)) { + if (cf->can_id & CAN_EFF_FLAG) + len += CANFD_FRAME_OVERHEAD_EFF; + else + len += CANFD_FRAME_OVERHEAD_SFF; + } else { + if (cf->can_id & CAN_EFF_FLAG) + len += CAN_FRAME_OVERHEAD_EFF; + else + len += CAN_FRAME_OVERHEAD_SFF; + } + + return len; +} +EXPORT_SYMBOL_GPL(can_skb_get_frame_len); diff --git a/include/linux/can/length.h b/include/linux/can/length.h index b2313b2a0b02..6995092b774e 100644 --- a/include/linux/can/length.h +++ b/include/linux/can/length.h @@ -1,10 +1,127 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2020 Oliver Hartkopp + * Copyright (C) 2020 Marc Kleine-Budde */ #ifndef _CAN_LENGTH_H #define _CAN_LENGTH_H +/* + * Size of a Classical CAN Standard Frame + * + * Name of Field Bits + * --------------------------------------------------------- + * Start-of-frame 1 + * Identifier 11 + * Remote transmission request (RTR) 1 + * Identifier extension bit (IDE) 1 + * Reserved bit (r0) 1 + * Data length code (DLC) 4 + * Data field 0...64 + * CRC 15 + * CRC delimiter 1 + * ACK slot 1 + * ACK delimiter 1 + * End-of-frame (EOF) 7 + * Inter frame spacing 3 + * + * rounded up and ignoring bitstuffing + */ +#define CAN_FRAME_OVERHEAD_SFF DIV_ROUND_UP(47, 8) + +/* + * Size of a Classical CAN Extended Frame + * + * Name of Field Bits + * --------------------------------------------------------- + * Start-of-frame 1 + * Identifier A 11 + * Substitute remote request (SRR) 1 + * Identifier extension bit (IDE) 1 + * Identifier B 18 + * Remote transmission request (RTR) 1 + * Reserved bits (r1, r0) 2 + * Data length code (DLC) 4 + * Data field 0...64 + * CRC 15 + * CRC delimiter 1 + * ACK slot 1 + * ACK delimiter 1 + * End-of-frame (EOF) 7 + * Inter frame spacing 3 + * + * rounded up and ignoring bitstuffing + */ +#define CAN_FRAME_OVERHEAD_EFF DIV_ROUND_UP(67, 8) + +/* + * Size of a CAN-FD Standard Frame + * + * Name of Field Bits + * --------------------------------------------------------- + * Start-of-frame 1 + * Identifier 11 + * Reserved bit (r1) 1 + * Identifier extension bit (IDE) 1 + * Flexible data rate format (FDF) 1 + * Reserved bit (r0) 1 + * Bit Rate Switch (BRS) 1 + * Error Status Indicator (ESI) 1 + * Data length code (DLC) 4 + * Data field 0...512 + * Stuff Bit Count (SBC) 0...16: 4 20...64:5 + * CRC 0...16: 17 20...64:21 + * CRC delimiter (CD) 1 + * ACK slot (AS) 1 + * ACK delimiter (AD) 1 + * End-of-frame (EOF) 7 + * Inter frame spacing 3 + * + * assuming CRC21, rounded up and ignoring bitstuffing + */ +#define CANFD_FRAME_OVERHEAD_SFF DIV_ROUND_UP(61, 8) + +/* + * Size of a CAN-FD Extended Frame + * + * Name of Field Bits + * --------------------------------------------------------- + * Start-of-frame 1 + * Identifier A 11 + * Substitute remote request (SRR) 1 + * Identifier extension bit (IDE) 1 + * Identifier B 18 + * Reserved bit (r1) 1 + * Flexible data rate format (FDF) 1 + * Reserved bit (r0) 1 + * Bit Rate Switch (BRS) 1 + * Error Status Indicator (ESI) 1 + * Data length code (DLC) 4 + * Data field 0...512 + * Stuff Bit Count (SBC) 0...16: 4 20...64:5 + * CRC 0...16: 17 20...64:21 + * CRC delimiter (CD) 1 + * ACK slot (AS) 1 + * ACK delimiter (AD) 1 + * End-of-frame (EOF) 7 + * Inter frame spacing 3 + * + * assuming CRC21, rounded up and ignoring bitstuffing + */ +#define CANFD_FRAME_OVERHEAD_EFF DIV_ROUND_UP(80, 8) + +/* + * Maximum size of a Classical CAN frame + * (rounded up and ignoring bitstuffing) + */ +#define CAN_FRAME_LEN_MAX (CAN_FRAME_OVERHEAD_EFF + CAN_MAX_DLEN) + +/* + * Maximum size of a CAN-FD frame + * (rounded up and ignoring bitstuffing) + */ +#define CANFD_FRAME_LEN_MAX (CANFD_FRAME_OVERHEAD_EFF + CANFD_MAX_DLEN) + /* * can_cc_dlc2len(value) - convert a given data length code (dlc) of a * Classical CAN frame into a valid data length of max. 8 bytes. @@ -45,6 +162,9 @@ u8 can_fd_dlc2len(u8 dlc); /* map the sanitized data length to an appropriate data length code */ u8 can_fd_len2dlc(u8 len); +/* calculate the CAN Frame length in bytes of a given skb */ +unsigned int can_skb_get_frame_len(const struct sk_buff *skb); + /* map the data length to an appropriate data link layer length */ static inline u8 canfd_sanitize_len(u8 len) { -- cgit v1.2.3 From f0ef72febc9a6a569d92cdf6c7996015dfa8e8bb Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:26 +0100 Subject: can: dev: extend struct can_skb_priv to hold CAN frame length In order to implement byte queue limits (bql) in CAN drivers, the length of the CAN frame needs to be passed into the networking stack after queueing and after transmission completion. To avoid to calculate this length twice, extend the struct can_skb_priv to hold the length of the CAN frame and extend __can_get_echo_skb() to return that value. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-12-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/rx-offload.c | 2 +- drivers/net/can/dev/skb.c | 9 +++++++-- include/linux/can/skb.h | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c index 3c1912c0430b..6a26b5282df1 100644 --- a/drivers/net/can/dev/rx-offload.c +++ b/drivers/net/can/dev/rx-offload.c @@ -271,7 +271,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, u8 len; int err; - skb = __can_get_echo_skb(dev, idx, &len); + skb = __can_get_echo_skb(dev, idx, &len, NULL); if (!skb) return 0; diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index 26cd597ff771..24f782a23409 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -76,7 +76,8 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, EXPORT_SYMBOL_GPL(can_put_echo_skb); struct sk_buff * -__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr) +__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, + unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); @@ -91,6 +92,7 @@ __can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr) * length is supported on both CAN and CANFD frames. */ struct sk_buff *skb = priv->echo_skb[idx]; + struct can_skb_priv *can_skb_priv = can_skb_prv(skb); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* get the real payload length for netdev statistics */ @@ -99,6 +101,9 @@ __can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr) else *len_ptr = cf->len; + if (frame_len_ptr) + *frame_len_ptr = can_skb_priv->frame_len; + priv->echo_skb[idx] = NULL; return skb; @@ -118,7 +123,7 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) struct sk_buff *skb; u8 len; - skb = __can_get_echo_skb(dev, idx, &len); + skb = __can_get_echo_skb(dev, idx, &len, NULL); if (!skb) return 0; diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index c90ebbd3008c..5db9da30843c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -20,7 +20,7 @@ void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, - u8 *len_ptr); + u8 *len_ptr, unsigned int *frame_len_ptr); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); void can_free_echo_skb(struct net_device *dev, unsigned int idx); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); @@ -42,11 +42,13 @@ struct sk_buff *alloc_can_err_skb(struct net_device *dev, * struct can_skb_priv - private additional data inside CAN sk_buffs * @ifindex: ifindex of the first interface the CAN frame appeared on * @skbcnt: atomic counter to have an unique id together with skb pointer + * @frame_len: length of CAN frame in data link layer * @cf: align to the following CAN frame at skb->data */ struct can_skb_priv { int ifindex; int skbcnt; + unsigned int frame_len; struct can_frame cf[]; }; -- cgit v1.2.3 From 1dcb6e57db833419483d0df2d956b1cc2a802683 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Mon, 11 Jan 2021 15:19:27 +0100 Subject: can: dev: can_put_echo_skb(): extend to handle frame_len Add a frame_len argument to can_put_echo_skb() which is used to save length of the CAN frame into field frame_len of struct can_skb_priv so that it can be later used after transmission completion. Convert all users of this function, too. Drivers which implement BQL call can_put_echo_skb() with the output of can_skb_get_frame_len(skb) and drivers which do not simply pass zero as an input (in the same way that NULL would be given to can_get_echo_skb()). This way, we have a nice symmetry between the two echo functions. Link: https://lore.kernel.org/r/20210111061335.39983-1-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-13-mkl@pengutronix.de Signed-off-by: Vincent Mailhol --- drivers/net/can/at91_can.c | 2 +- drivers/net/can/c_can/c_can.c | 2 +- drivers/net/can/cc770/cc770.c | 2 +- drivers/net/can/dev/skb.c | 5 ++++- drivers/net/can/flexcan.c | 2 +- drivers/net/can/grcan.c | 2 +- drivers/net/can/ifi_canfd/ifi_canfd.c | 2 +- drivers/net/can/kvaser_pciefd.c | 2 +- drivers/net/can/m_can/m_can.c | 4 ++-- drivers/net/can/mscan/mscan.c | 2 +- drivers/net/can/pch_can.c | 2 +- drivers/net/can/peak_canfd/peak_canfd.c | 2 +- drivers/net/can/rcar/rcar_can.c | 2 +- drivers/net/can/rcar/rcar_canfd.c | 2 +- drivers/net/can/sja1000/sja1000.c | 2 +- drivers/net/can/softing/softing_main.c | 2 +- drivers/net/can/spi/hi311x.c | 2 +- drivers/net/can/spi/mcp251x.c | 2 +- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- drivers/net/can/sun4i_can.c | 2 +- drivers/net/can/ti_hecc.c | 2 +- drivers/net/can/usb/ems_usb.c | 2 +- drivers/net/can/usb/esd_usb2.c | 2 +- drivers/net/can/usb/gs_usb.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 2 +- drivers/net/can/usb/mcba_usb.c | 2 +- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 2 +- drivers/net/can/usb/ucan.c | 2 +- drivers/net/can/usb/usb_8dev.c | 2 +- drivers/net/can/xilinx_can.c | 4 ++-- include/linux/can/skb.h | 2 +- 31 files changed, 36 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 5284f0ab3b06..90b223a80ed4 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -484,7 +484,7 @@ static netdev_tx_t at91_start_xmit(struct sk_buff *skb, struct net_device *dev) stats->tx_bytes += cf->len; /* _NOTE_: subtract AT91_MB_TX_FIRST offset from mb! */ - can_put_echo_skb(skb, dev, mb - get_mb_tx_first(priv)); + can_put_echo_skb(skb, dev, mb - get_mb_tx_first(priv), 0); /* * we have to stop the queue and deliver all messages in case diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index 63f48b016ecd..13638954a25c 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -476,7 +476,7 @@ static netdev_tx_t c_can_start_xmit(struct sk_buff *skb, */ c_can_setup_tx_object(dev, IF_TX, frame, idx); priv->dlc[idx] = frame->len; - can_put_echo_skb(skb, dev, idx); + can_put_echo_skb(skb, dev, idx, 0); /* Update the active bits */ atomic_add((1 << idx), &priv->tx_active); diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c index 8d9f332c35e0..e53ca338368a 100644 --- a/drivers/net/can/cc770/cc770.c +++ b/drivers/net/can/cc770/cc770.c @@ -702,7 +702,7 @@ static void cc770_tx_interrupt(struct net_device *dev, unsigned int o) stats->tx_bytes += cf->len; stats->tx_packets++; - can_put_echo_skb(priv->tx_skb, dev, 0); + can_put_echo_skb(priv->tx_skb, dev, 0, 0); can_get_echo_skb(dev, 0); priv->tx_skb = NULL; diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index 24f782a23409..c184b4dce19e 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -38,7 +38,7 @@ void can_flush_echo_skb(struct net_device *dev) * priv->echo_skb, if necessary. */ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, - unsigned int idx) + unsigned int idx, unsigned int frame_len) { struct can_priv *priv = netdev_priv(dev); @@ -62,6 +62,9 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, skb->ip_summed = CHECKSUM_UNNECESSARY; skb->dev = dev; + /* save frame_len to reuse it when transmission is completed */ + can_skb_prv(skb)->frame_len = frame_len; + /* save this skb for tx interrupt echo handling */ priv->echo_skb[idx] = skb; } else { diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 7ab20a6b0d1d..202d08f8e1a4 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -815,7 +815,7 @@ static netdev_tx_t flexcan_start_xmit(struct sk_buff *skb, struct net_device *de priv->write(data, &priv->tx_mb->data[i / sizeof(u32)]); } - can_put_echo_skb(skb, dev, 0); + can_put_echo_skb(skb, dev, 0, 0); priv->write(can_id, &priv->tx_mb->can_id); priv->write(ctrl, &priv->tx_mb->can_ctrl); diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index f5d94a692576..8086cdc10000 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1448,7 +1448,7 @@ static netdev_tx_t grcan_start_xmit(struct sk_buff *skb, * taken. */ priv->txdlc[slotindex] = cf->len; /* Store dlc for statistics */ - can_put_echo_skb(skb, dev, slotindex); + can_put_echo_skb(skb, dev, slotindex, 0); /* Make sure everything is written before allowing hardware to * read from the memory diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 86b0e1406a21..56ac9e1dace7 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -922,7 +922,7 @@ static netdev_tx_t ifi_canfd_start_xmit(struct sk_buff *skb, writel(0, priv->base + IFI_CANFD_TXFIFO_REPEATCOUNT); writel(0, priv->base + IFI_CANFD_TXFIFO_SUSPEND_US); - can_put_echo_skb(skb, ndev, 0); + can_put_echo_skb(skb, ndev, 0, 0); /* Start the transmission */ writel(IFI_CANFD_TXSTCMD_ADD_MSG, priv->base + IFI_CANFD_TXSTCMD); diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index 969cedb9b0b6..0cf82f0646a3 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -778,7 +778,7 @@ static netdev_tx_t kvaser_pciefd_start_xmit(struct sk_buff *skb, spin_lock_irqsave(&can->echo_lock, irq_flags); /* Prepare and save echo skb in internal slot */ - can_put_echo_skb(skb, netdev, can->echo_idx); + can_put_echo_skb(skb, netdev, can->echo_idx, 0); /* Move echo index to the next slot */ can->echo_idx = (can->echo_idx + 1) % can->can.echo_skb_max; diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index da551fd0f502..fff7432103cb 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1483,7 +1483,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) M_CAN_FIFO_DATA(i / 4), *(u32 *)(cf->data + i)); - can_put_echo_skb(skb, dev, 0); + can_put_echo_skb(skb, dev, 0, 0); if (cdev->can.ctrlmode & CAN_CTRLMODE_FD) { cccr = m_can_read(cdev, M_CAN_CCCR); @@ -1554,7 +1554,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) /* Push loopback echo. * Will be looped back on TX interrupt based on message marker */ - can_put_echo_skb(skb, dev, putidx); + can_put_echo_skb(skb, dev, putidx, 0); /* Enable TX FIFO element to start transfer */ m_can_write(cdev, M_CAN_TXBAR, (1 << putidx)); diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index 5ed00a1558e1..a28fdaa411c6 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -270,7 +270,7 @@ static netdev_tx_t mscan_start_xmit(struct sk_buff *skb, struct net_device *dev) list_add_tail(&priv->tx_queue[buf_id].list, &priv->tx_head); - can_put_echo_skb(skb, dev, buf_id); + can_put_echo_skb(skb, dev, buf_id, 0); /* Enable interrupt. */ priv->tx_active |= 1 << buf_id; diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c index 4f9e7ec192aa..a4c35b48d8e9 100644 --- a/drivers/net/can/pch_can.c +++ b/drivers/net/can/pch_can.c @@ -924,7 +924,7 @@ static netdev_tx_t pch_xmit(struct sk_buff *skb, struct net_device *ndev) &priv->regs->ifregs[1].data[i / 2]); } - can_put_echo_skb(skb, ndev, tx_obj_no - PCH_RX_OBJ_END - 1); + can_put_echo_skb(skb, ndev, tx_obj_no - PCH_RX_OBJ_END - 1, 0); /* Set the size of the data. Update if2_mcont */ iowrite32(cf->len | PCH_IF_MCONT_NEWDAT | PCH_IF_MCONT_TXRQXT | diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index c5334b0c3038..179a8e10fbb8 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -716,7 +716,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, spin_lock_irqsave(&priv->echo_lock, flags); /* prepare and save echo skb in internal slot */ - can_put_echo_skb(skb, ndev, priv->echo_idx); + can_put_echo_skb(skb, ndev, priv->echo_idx, 0); /* move echo index to the next slot */ priv->echo_idx = (priv->echo_idx + 1) % priv->can.echo_skb_max; diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index c803327f8f79..0b7e488bc4fe 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -617,7 +617,7 @@ static netdev_tx_t rcar_can_start_xmit(struct sk_buff *skb, writeb(cf->len, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].dlc); priv->tx_dlc[priv->tx_head % RCAR_CAN_FIFO_DEPTH] = cf->len; - can_put_echo_skb(skb, ndev, priv->tx_head % RCAR_CAN_FIFO_DEPTH); + can_put_echo_skb(skb, ndev, priv->tx_head % RCAR_CAN_FIFO_DEPTH, 0); priv->tx_head++; /* Start Tx: write 0xff to the TFPCR register to increment * the CPU-side pointer for the transmit FIFO to the next diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 2778ed5c61d1..38376f29bc56 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1390,7 +1390,7 @@ static netdev_tx_t rcar_canfd_start_xmit(struct sk_buff *skb, } priv->tx_len[priv->tx_head % RCANFD_FIFO_DEPTH] = cf->len; - can_put_echo_skb(skb, ndev, priv->tx_head % RCANFD_FIFO_DEPTH); + can_put_echo_skb(skb, ndev, priv->tx_head % RCANFD_FIFO_DEPTH, 0); spin_lock_irqsave(&priv->tx_lock, flags); priv->tx_head++; diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index b6a7003c51d2..e98482c7bf33 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -318,7 +318,7 @@ static netdev_tx_t sja1000_start_xmit(struct sk_buff *skb, for (i = 0; i < cf->len; i++) priv->write_reg(priv, dreg++, cf->data[i]); - can_put_echo_skb(skb, dev, 0); + can_put_echo_skb(skb, dev, 0, 0); if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) cmd_reg_val |= CMD_AT; diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index 40070c930202..a5314448c5ae 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -104,7 +104,7 @@ static netdev_tx_t softing_netdev_start_xmit(struct sk_buff *skb, card->tx.last_bus = priv->index; ++card->tx.pending; ++priv->tx.pending; - can_put_echo_skb(skb, dev, priv->tx.echo_put); + can_put_echo_skb(skb, dev, priv->tx.echo_put, 0); ++priv->tx.echo_put; if (priv->tx.echo_put >= TX_ECHO_SKB_MAX) priv->tx.echo_put = 0; diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index f9455de94786..8c83a9e5a9e4 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -586,7 +586,7 @@ static void hi3110_tx_work_handler(struct work_struct *ws) frame = (struct can_frame *)priv->tx_skb->data; hi3110_hw_tx(spi, frame); priv->tx_len = 1 + frame->len; - can_put_echo_skb(priv->tx_skb, net, 0); + can_put_echo_skb(priv->tx_skb, net, 0, 0); priv->tx_skb = NULL; } } diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 25859d16d06f..40866754aafc 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -1002,7 +1002,7 @@ static void mcp251x_tx_work_handler(struct work_struct *ws) frame->len = CAN_FRAME_MAX_DATA_LEN; mcp251x_hw_tx(spi, frame, 0); priv->tx_len = 1 + frame->len; - can_put_echo_skb(priv->tx_skb, net, 0); + can_put_echo_skb(priv->tx_skb, net, 0, 0); priv->tx_skb = NULL; } } diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 36235afb0bc6..95bba456a4cd 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -2436,7 +2436,7 @@ static netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb, if (tx_ring->head - tx_ring->tail >= tx_ring->obj_num) netif_stop_queue(ndev); - can_put_echo_skb(skb, ndev, tx_head); + can_put_echo_skb(skb, ndev, tx_head, 0); err = mcp251xfd_tx_obj_write(priv, tx_obj); if (err) diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 783b63218b7b..b75175d59104 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -448,7 +448,7 @@ static netdev_tx_t sun4ican_start_xmit(struct sk_buff *skb, struct net_device *d writel(msg_flag_n, priv->base + SUN4I_REG_BUF0_ADDR); - can_put_echo_skb(skb, dev, 0); + can_put_echo_skb(skb, dev, 0, 0); if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) sun4i_can_write_cmdreg(priv, SUN4I_CMD_SELF_RCV_REQ); diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index a6850ff0b55b..485c19bc98c2 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -513,7 +513,7 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev) be32_to_cpu(*(__be32 *)(cf->data + 4))); else *(u32 *)(cf->data + 4) = 0; - can_put_echo_skb(skb, ndev, mbxno); + can_put_echo_skb(skb, ndev, mbxno, 0); spin_lock_irqsave(&priv->mbx_lock, flags); --priv->tx_head; diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 25eee4466364..5e5330060464 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -801,7 +801,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &dev->tx_submitted); - can_put_echo_skb(skb, netdev, context->echo_index); + can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&dev->active_tx_urbs); diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index 9eed75a4b678..68d8a85f00c4 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -783,7 +783,7 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb, usb_anchor_urb(urb, &priv->tx_submitted); - can_put_echo_skb(skb, netdev, context->echo_index); + can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_jobs); diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 0487095e1fd0..5ce9ba5d29d6 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -525,7 +525,7 @@ static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &dev->tx_submitted); - can_put_echo_skb(skb, netdev, idx); + can_put_echo_skb(skb, netdev, idx, 0); atomic_inc(&dev->active_tx_urbs); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index e2d58846c40c..2b7efd296758 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -578,7 +578,7 @@ static netdev_tx_t kvaser_usb_start_xmit(struct sk_buff *skb, context->priv = priv; - can_put_echo_skb(skb, netdev, context->echo_index); + can_put_echo_skb(skb, netdev, context->echo_index, 0); usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index df54eb7d4b36..5347c89992ce 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -355,7 +355,7 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb, if (cf->can_id & CAN_RTR_FLAG) usb_msg.dlc |= MCBA_DLC_RTR_MASK; - can_put_echo_skb(skb, priv->netdev, ctx->ndx); + can_put_echo_skb(skb, priv->netdev, ctx->ndx, 0); err = mcba_usb_xmit(priv, (struct mcba_usb_msg *)&usb_msg, ctx); if (err) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 251835ea15aa..95672750419a 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -365,7 +365,7 @@ static netdev_tx_t peak_usb_ndo_start_xmit(struct sk_buff *skb, usb_anchor_urb(urb, &dev->tx_submitted); - can_put_echo_skb(skb, netdev, context->echo_index); + can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&dev->active_tx_urbs); diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index 7d92da8954fe..5add27614e2b 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -1137,7 +1137,7 @@ static netdev_tx_t ucan_start_xmit(struct sk_buff *skb, /* put the skb on can loopback stack */ spin_lock_irqsave(&up->echo_skb_lock, flags); - can_put_echo_skb(skb, up->netdev, echo_index); + can_put_echo_skb(skb, up->netdev, echo_index, 0); spin_unlock_irqrestore(&up->echo_skb_lock, flags); /* transmit it */ diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 44478304ff46..2e824d9d8167 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -664,7 +664,7 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); - can_put_echo_skb(skb, netdev, context->echo_index); + can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_urbs); diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 3f54edee92eb..8d5132a3f2c9 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -592,9 +592,9 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && (priv->devtype.flags & XCAN_FLAG_TXFEMP)) - can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max); + can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); else - can_put_echo_skb(skb, ndev, 0); + can_put_echo_skb(skb, ndev, 0, 0); priv->tx_head++; diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 5db9da30843c..eaac4a637ae0 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -18,7 +18,7 @@ void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, - unsigned int idx); + unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, unsigned int *frame_len_ptr); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); -- cgit v1.2.3 From 9420e1d495e2a3b5f673148b7e3ebc861b1441f7 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:28 +0100 Subject: can: dev: can_get_echo_skb(): extend to return can frame length In order to implement byte queue limits (bql) in CAN drivers, the length of the CAN frame needs to be passed into the networking stack after queueing and after transmission completion. To avoid to calculate this length twice, extend can_get_echo_skb() to return that value. Convert all users of this function, too. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-14-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/at91_can.c | 2 +- drivers/net/can/c_can/c_can.c | 2 +- drivers/net/can/cc770/cc770.c | 2 +- drivers/net/can/dev/skb.c | 5 +++-- drivers/net/can/grcan.c | 2 +- drivers/net/can/ifi_canfd/ifi_canfd.c | 2 +- drivers/net/can/kvaser_pciefd.c | 4 ++-- drivers/net/can/m_can/m_can.c | 4 ++-- drivers/net/can/mscan/mscan.c | 2 +- drivers/net/can/pch_can.c | 2 +- drivers/net/can/peak_canfd/peak_canfd.c | 2 +- drivers/net/can/rcar/rcar_can.c | 2 +- drivers/net/can/rcar/rcar_canfd.c | 2 +- drivers/net/can/sja1000/sja1000.c | 2 +- drivers/net/can/softing/softing_main.c | 2 +- drivers/net/can/spi/hi311x.c | 2 +- drivers/net/can/spi/mcp251x.c | 2 +- drivers/net/can/sun4i_can.c | 2 +- drivers/net/can/usb/ems_usb.c | 2 +- drivers/net/can/usb/esd_usb2.c | 2 +- drivers/net/can/usb/gs_usb.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 2 +- drivers/net/can/usb/mcba_usb.c | 2 +- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 2 +- drivers/net/can/usb/ucan.c | 2 +- drivers/net/can/usb/usb_8dev.c | 2 +- drivers/net/can/xilinx_can.c | 2 +- include/linux/can/skb.h | 3 ++- 29 files changed, 34 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c index 90b223a80ed4..9ad9b39f480e 100644 --- a/drivers/net/can/at91_can.c +++ b/drivers/net/can/at91_can.c @@ -856,7 +856,7 @@ static void at91_irq_tx(struct net_device *dev, u32 reg_sr) if (likely(reg_msr & AT91_MSR_MRDY && ~reg_msr & AT91_MSR_MABT)) { /* _NOTE_: subtract AT91_MB_TX_FIRST offset from mb! */ - can_get_echo_skb(dev, mb - get_mb_tx_first(priv)); + can_get_echo_skb(dev, mb - get_mb_tx_first(priv), NULL); dev->stats.tx_packets++; can_led_event(dev, CAN_LED_EVENT_TX); } diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c index 13638954a25c..ef474bae47a1 100644 --- a/drivers/net/can/c_can/c_can.c +++ b/drivers/net/can/c_can/c_can.c @@ -733,7 +733,7 @@ static void c_can_do_tx(struct net_device *dev) pend &= ~(1 << idx); obj = idx + C_CAN_MSG_OBJ_TX_FIRST; c_can_inval_tx_object(dev, IF_RX, obj); - can_get_echo_skb(dev, idx); + can_get_echo_skb(dev, idx, NULL); bytes += priv->dlc[idx]; pkts++; } diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c index e53ca338368a..f8a130f594e2 100644 --- a/drivers/net/can/cc770/cc770.c +++ b/drivers/net/can/cc770/cc770.c @@ -703,7 +703,7 @@ static void cc770_tx_interrupt(struct net_device *dev, unsigned int o) stats->tx_packets++; can_put_echo_skb(priv->tx_skb, dev, 0, 0); - can_get_echo_skb(dev, 0); + can_get_echo_skb(dev, 0, NULL); priv->tx_skb = NULL; netif_wake_queue(dev); diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index c184b4dce19e..53683d4312f1 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -121,12 +121,13 @@ __can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, * is handled in the device driver. The driver must protect * access to priv->echo_skb, if necessary. */ -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) +unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, + unsigned int *frame_len_ptr) { struct sk_buff *skb; u8 len; - skb = __can_get_echo_skb(dev, idx, &len, NULL); + skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index 8086cdc10000..4a8453290530 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -517,7 +517,7 @@ static int catch_up_echo_skb(struct net_device *dev, int budget, bool echo) stats->tx_packets++; stats->tx_bytes += priv->txdlc[i]; priv->txdlc[i] = 0; - can_get_echo_skb(dev, i); + can_get_echo_skb(dev, i, NULL); } else { /* For cleanup of untransmitted messages */ can_free_echo_skb(dev, i); diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 56ac9e1dace7..5bb957a26bc6 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -629,7 +629,7 @@ static irqreturn_t ifi_canfd_isr(int irq, void *dev_id) /* TX IRQ */ if (isr & IFI_CANFD_INTERRUPT_TXFIFO_REMOVE) { - stats->tx_bytes += can_get_echo_skb(ndev, 0); + stats->tx_bytes += can_get_echo_skb(ndev, 0, NULL); stats->tx_packets++; can_led_event(ndev, CAN_LED_EVENT_TX); } diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index 0cf82f0646a3..37e05010ca91 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -1467,7 +1467,7 @@ static int kvaser_pciefd_handle_eack_packet(struct kvaser_pciefd *pcie, can->reg_base + KVASER_PCIEFD_KCAN_CTRL_REG); } else { int echo_idx = p->header[0] & KVASER_PCIEFD_PACKET_SEQ_MSK; - int dlc = can_get_echo_skb(can->can.dev, echo_idx); + int dlc = can_get_echo_skb(can->can.dev, echo_idx, NULL); struct net_device_stats *stats = &can->can.dev->stats; stats->tx_bytes += dlc; @@ -1533,7 +1533,7 @@ static int kvaser_pciefd_handle_ack_packet(struct kvaser_pciefd *pcie, netdev_dbg(can->can.dev, "Packet was flushed\n"); } else { int echo_idx = p->header[0] & KVASER_PCIEFD_PACKET_SEQ_MSK; - int dlc = can_get_echo_skb(can->can.dev, echo_idx); + int dlc = can_get_echo_skb(can->can.dev, echo_idx, NULL); u8 count = ioread32(can->reg_base + KVASER_PCIEFD_KCAN_TX_NPACKETS_REG) & 0xff; diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index fff7432103cb..3752520a7d4b 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -930,7 +930,7 @@ static void m_can_echo_tx_event(struct net_device *dev) (fgi << TXEFA_EFAI_SHIFT))); /* update stats */ - stats->tx_bytes += can_get_echo_skb(dev, msg_mark); + stats->tx_bytes += can_get_echo_skb(dev, msg_mark, NULL); stats->tx_packets++; } } @@ -972,7 +972,7 @@ static irqreturn_t m_can_isr(int irq, void *dev_id) if (cdev->version == 30) { if (ir & IR_TC) { /* Transmission Complete Interrupt*/ - stats->tx_bytes += can_get_echo_skb(dev, 0); + stats->tx_bytes += can_get_echo_skb(dev, 0, NULL); stats->tx_packets++; can_led_event(dev, CAN_LED_EVENT_TX); netif_wake_queue(dev); diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c index a28fdaa411c6..fa32e418eb29 100644 --- a/drivers/net/can/mscan/mscan.c +++ b/drivers/net/can/mscan/mscan.c @@ -448,7 +448,7 @@ static irqreturn_t mscan_isr(int irq, void *dev_id) out_8(®s->cantbsel, mask); stats->tx_bytes += in_8(®s->tx.dlr); stats->tx_packets++; - can_get_echo_skb(dev, entry->id); + can_get_echo_skb(dev, entry->id, NULL); priv->tx_active &= ~mask; list_del(pos); } diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c index a4c35b48d8e9..92a54a5fd4c5 100644 --- a/drivers/net/can/pch_can.c +++ b/drivers/net/can/pch_can.c @@ -711,7 +711,7 @@ static void pch_can_tx_complete(struct net_device *ndev, u32 int_stat) struct net_device_stats *stats = &(priv->ndev->stats); u32 dlc; - can_get_echo_skb(ndev, int_stat - PCH_RX_OBJ_END - 1); + can_get_echo_skb(ndev, int_stat - PCH_RX_OBJ_END - 1, NULL); iowrite32(PCH_CMASK_RX_TX_GET | PCH_CMASK_CLRINTPND, &priv->regs->ifregs[1].cmask); pch_can_rw_msg_obj(&priv->regs->ifregs[1].creq, int_stat); diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 179a8e10fbb8..00847cbaf7b6 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -266,7 +266,7 @@ static int pucan_handle_can_rx(struct peak_canfd_priv *priv, unsigned long flags; spin_lock_irqsave(&priv->echo_lock, flags); - can_get_echo_skb(priv->ndev, msg->client); + can_get_echo_skb(priv->ndev, msg->client, NULL); /* count bytes of the echo instead of skb */ stats->tx_bytes += cf_len; diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 0b7e488bc4fe..4870c4ea190a 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -386,7 +386,7 @@ static void rcar_can_tx_done(struct net_device *ndev) stats->tx_bytes += priv->tx_dlc[priv->tx_tail % RCAR_CAN_FIFO_DEPTH]; priv->tx_dlc[priv->tx_tail % RCAR_CAN_FIFO_DEPTH] = 0; - can_get_echo_skb(ndev, priv->tx_tail % RCAR_CAN_FIFO_DEPTH); + can_get_echo_skb(ndev, priv->tx_tail % RCAR_CAN_FIFO_DEPTH, NULL); priv->tx_tail++; netif_wake_queue(ndev); } diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 38376f29bc56..d8d233e62990 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1044,7 +1044,7 @@ static void rcar_canfd_tx_done(struct net_device *ndev) stats->tx_packets++; stats->tx_bytes += priv->tx_len[sent]; priv->tx_len[sent] = 0; - can_get_echo_skb(ndev, sent); + can_get_echo_skb(ndev, sent, NULL); spin_lock_irqsave(&priv->tx_lock, flags); priv->tx_tail++; diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index e98482c7bf33..9e86488ba55f 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -531,7 +531,7 @@ irqreturn_t sja1000_interrupt(int irq, void *dev_id) stats->tx_bytes += priv->read_reg(priv, SJA1000_FI) & 0xf; stats->tx_packets++; - can_get_echo_skb(dev, 0); + can_get_echo_skb(dev, 0, NULL); } netif_wake_queue(dev); can_led_event(dev, CAN_LED_EVENT_TX); diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index a5314448c5ae..c44f3411e561 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -284,7 +284,7 @@ static int softing_handle_1(struct softing *card) skb = priv->can.echo_skb[priv->tx.echo_get]; if (skb) skb->tstamp = ktime; - can_get_echo_skb(netdev, priv->tx.echo_get); + can_get_echo_skb(netdev, priv->tx.echo_get, NULL); ++priv->tx.echo_get; if (priv->tx.echo_get >= TX_ECHO_SKB_MAX) priv->tx.echo_get = 0; diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 8c83a9e5a9e4..c3e020c90111 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -725,7 +725,7 @@ static irqreturn_t hi3110_can_ist(int irq, void *dev_id) net->stats.tx_bytes += priv->tx_len - 1; can_led_event(net, CAN_LED_EVENT_TX); if (priv->tx_len) { - can_get_echo_skb(net, 0); + can_get_echo_skb(net, 0, NULL); priv->tx_len = 0; } netif_wake_queue(net); diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 40866754aafc..f69fb4238a65 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -1171,7 +1171,7 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id) net->stats.tx_bytes += priv->tx_len - 1; can_led_event(net, CAN_LED_EVENT_TX); if (priv->tx_len) { - can_get_echo_skb(net, 0); + can_get_echo_skb(net, 0, NULL); priv->tx_len = 0; } netif_wake_queue(net); diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index b75175d59104..54aa7c25c4de 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -655,7 +655,7 @@ static irqreturn_t sun4i_can_interrupt(int irq, void *dev_id) readl(priv->base + SUN4I_REG_RBUF_RBACK_START_ADDR) & 0xf; stats->tx_packets++; - can_get_echo_skb(dev, 0); + can_get_echo_skb(dev, 0, NULL); netif_wake_queue(dev); can_led_event(dev, CAN_LED_EVENT_TX); } diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 5e5330060464..18f40eb20360 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -518,7 +518,7 @@ static void ems_usb_write_bulk_callback(struct urb *urb) netdev->stats.tx_packets++; netdev->stats.tx_bytes += context->dlc; - can_get_echo_skb(netdev, context->echo_index); + can_get_echo_skb(netdev, context->echo_index, NULL); /* Release context */ context->echo_index = MAX_TX_URBS; diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index 68d8a85f00c4..562acbf454fd 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -357,7 +357,7 @@ static void esd_usb2_tx_done_msg(struct esd_usb2_net_priv *priv, if (!msg->msg.txdone.status) { stats->tx_packets++; stats->tx_bytes += context->len; - can_get_echo_skb(netdev, context->echo_index); + can_get_echo_skb(netdev, context->echo_index, NULL); } else { stats->tx_errors++; can_free_echo_skb(netdev, context->echo_index); diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 5ce9ba5d29d6..a00dc1904415 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -370,7 +370,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) goto resubmit_urb; } - can_get_echo_skb(netdev, hf->echo_id); + can_get_echo_skb(netdev, hf->echo_id, NULL); gs_free_tx_context(txc); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 480bd2ecb296..dcee8dc828ec 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -1151,7 +1151,7 @@ static void kvaser_usb_hydra_tx_acknowledge(const struct kvaser_usb *dev, spin_lock_irqsave(&priv->tx_contexts_lock, irq_flags); - can_get_echo_skb(priv->netdev, context->echo_index); + can_get_echo_skb(priv->netdev, context->echo_index, NULL); context->echo_index = dev->max_tx_urbs; --priv->active_tx_contexts; netif_wake_queue(priv->netdev); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 98c016ef0607..59ba7c7beec0 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -594,7 +594,7 @@ static void kvaser_usb_leaf_tx_acknowledge(const struct kvaser_usb *dev, spin_lock_irqsave(&priv->tx_contexts_lock, flags); - can_get_echo_skb(priv->netdev, context->echo_index); + can_get_echo_skb(priv->netdev, context->echo_index, NULL); context->echo_index = dev->max_tx_urbs; --priv->active_tx_contexts; netif_wake_queue(priv->netdev); diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 5347c89992ce..4232a7126c1b 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -237,7 +237,7 @@ static void mcba_usb_write_bulk_callback(struct urb *urb) netdev->stats.tx_bytes += ctx->dlc; can_led_event(netdev, CAN_LED_EVENT_TX); - can_get_echo_skb(netdev, ctx->ndx); + can_get_echo_skb(netdev, ctx->ndx, NULL); } if (urb->status) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 95672750419a..573b11559d73 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -309,7 +309,7 @@ static void peak_usb_write_bulk_callback(struct urb *urb) } /* should always release echo skb and corresponding context */ - can_get_echo_skb(netdev, context->echo_index); + can_get_echo_skb(netdev, context->echo_index, NULL); context->echo_index = PCAN_USB_MAX_TX_URBS; /* do wakeup tx queue in case of success only */ diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index 5add27614e2b..fa403c080871 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -672,7 +672,7 @@ static void ucan_tx_complete_msg(struct ucan_priv *up, /* update statistics */ up->netdev->stats.tx_packets++; up->netdev->stats.tx_bytes += dlc; - can_get_echo_skb(up->netdev, echo_index); + can_get_echo_skb(up->netdev, echo_index, NULL); } else { up->netdev->stats.tx_dropped++; can_free_echo_skb(up->netdev, echo_index); diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 2e824d9d8167..e8c42430a4fc 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -585,7 +585,7 @@ static void usb_8dev_write_bulk_callback(struct urb *urb) netdev->stats.tx_packets++; netdev->stats.tx_bytes += context->dlc; - can_get_echo_skb(netdev, context->echo_index); + can_get_echo_skb(netdev, context->echo_index, NULL); can_led_event(netdev, CAN_LED_EVENT_TX); diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 8d5132a3f2c9..37fa19c62d73 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -1292,7 +1292,7 @@ static void xcan_tx_interrupt(struct net_device *ndev, u32 isr) while (frames_sent--) { stats->tx_bytes += can_get_echo_skb(ndev, priv->tx_tail % - priv->tx_max); + priv->tx_max, NULL); priv->tx_tail++; stats->tx_packets++; } diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index eaac4a637ae0..685f34cfba20 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -21,7 +21,8 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, unsigned int *frame_len_ptr); -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); +unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, + unsigned int *frame_len_ptr); void can_free_echo_skb(struct net_device *dev, unsigned int idx); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, -- cgit v1.2.3 From 99842c9685ab00fa8689e8bd12bde62b706b6198 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 11 Jan 2021 15:19:29 +0100 Subject: can: dev: can_rx_offload_get_echo_skb(): extend to return can frame length In order to implement byte queue limits (bql) in CAN drivers, the length of the CAN frame needs to be passed into the networking stack after queueing and after transmission completion. To avoid to calculate this length twice, extend can_rx_offload_get_echo_skb() to return that value. Convert all users of this function, too. Reviewed-by: Vincent Mailhol Link: https://lore.kernel.org/r/20210111141930.693847-15-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/rx-offload.c | 5 +++-- drivers/net/can/flexcan.c | 5 +++-- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- drivers/net/can/ti_hecc.c | 2 +- include/linux/can/rx-offload.h | 3 ++- 5 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c index 6a26b5282df1..ab2c1543786c 100644 --- a/drivers/net/can/dev/rx-offload.c +++ b/drivers/net/can/dev/rx-offload.c @@ -263,7 +263,8 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload, EXPORT_SYMBOL_GPL(can_rx_offload_queue_sorted); unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, - unsigned int idx, u32 timestamp) + unsigned int idx, u32 timestamp, + unsigned int *frame_len_ptr) { struct net_device *dev = offload->dev; struct net_device_stats *stats = &dev->stats; @@ -271,7 +272,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, u8 len; int err; - skb = __can_get_echo_skb(dev, idx, &len, NULL); + skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 202d08f8e1a4..5d9157c655e9 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -1122,8 +1122,9 @@ static irqreturn_t flexcan_irq(int irq, void *dev_id) u32 reg_ctrl = priv->read(&priv->tx_mb->can_ctrl); handled = IRQ_HANDLED; - stats->tx_bytes += can_rx_offload_get_echo_skb(&priv->offload, - 0, reg_ctrl << 16); + stats->tx_bytes += + can_rx_offload_get_echo_skb(&priv->offload, 0, + reg_ctrl << 16, NULL); stats->tx_packets++; can_led_event(dev, CAN_LED_EVENT_TX); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 95bba456a4cd..63bbe0930e53 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1271,7 +1271,7 @@ mcp251xfd_handle_tefif_one(struct mcp251xfd_priv *priv, stats->tx_bytes += can_rx_offload_get_echo_skb(&priv->offload, mcp251xfd_get_tef_tail(priv), - hw_tef_obj->ts); + hw_tef_obj->ts, NULL); stats->tx_packets++; priv->tef->tail++; diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 485c19bc98c2..73245d8836a9 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -757,7 +757,7 @@ static irqreturn_t ti_hecc_interrupt(int irq, void *dev_id) stamp = hecc_read_stamp(priv, mbxno); stats->tx_bytes += can_rx_offload_get_echo_skb(&priv->offload, - mbxno, stamp); + mbxno, stamp, NULL); stats->tx_packets++; can_led_event(ndev, CAN_LED_EVENT_TX); --priv->tx_tail; diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index f1b38088b765..40882df7105e 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -44,7 +44,8 @@ int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload); int can_rx_offload_queue_sorted(struct can_rx_offload *offload, struct sk_buff *skb, u32 timestamp); unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, - unsigned int idx, u32 timestamp); + unsigned int idx, u32 timestamp, + unsigned int *frame_len_ptr); int can_rx_offload_queue_tail(struct can_rx_offload *offload, struct sk_buff *skb); void can_rx_offload_del(struct can_rx_offload *offload); -- cgit v1.2.3 From b1ae3587d16a8c8fc9453e147c8708d6f006ffbb Mon Sep 17 00:00:00 2001 From: Bjarni Jonasson Date: Wed, 13 Jan 2021 12:56:25 +0100 Subject: net: phy: Add 100 base-x mode Sparx-5 supports this mode and it is missing in the PHY core. Signed-off-by: Bjarni Jonasson Reviewed-by: Russell King Signed-off-by: Jakub Kicinski --- Documentation/networking/phy.rst | 5 +++++ include/linux/phy.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index b2f7ec794bc8..399f17976a6c 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -286,6 +286,11 @@ Some of the interface modes are described below: Note: due to legacy usage, some 10GBASE-R usage incorrectly makes use of this definition. +``PHY_INTERFACE_MODE_100BASEX`` + This defines IEEE 802.3 Clause 24. The link operates at a fixed data + rate of 125Mpbs using a 4B/5B encoding scheme, resulting in an underlying + data rate of 100Mpbs. + Pause frames / flow control =========================== diff --git a/include/linux/phy.h b/include/linux/phy.h index 9effb511acde..24fcc6456a9e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -104,6 +104,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII + * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI @@ -135,6 +136,7 @@ typedef enum { PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, + PHY_INTERFACE_MODE_100BASEX, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_RXAUI, @@ -217,6 +219,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "usxgmii"; case PHY_INTERFACE_MODE_10GKR: return "10gbase-kr"; + case PHY_INTERFACE_MODE_100BASEX: + return "100base-x"; default: return "unknown"; } -- cgit v1.2.3 From 058102a6e9eb1905a7da41b7a5a7a1f278a3828a Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 13 Jan 2021 09:42:53 +0100 Subject: net: dsa: Link aggregation support Monitor the following events and notify the driver when: - A DSA port joins/leaves a LAG. - A LAG, made up of DSA ports, joins/leaves a bridge. - A DSA port in a LAG is enabled/disabled (enabled meaning "distributing" in 802.3ad LACP terms). When a LAG joins a bridge, the DSA subsystem will treat that as each individual port joining the bridge. The driver may look at the port's LAG device pointer to see if it is associated with any LAG, if that is required. This is analogue to how switchdev events are replicated out to all lower devices when reaching e.g. a LAG. Drivers can optionally request that DSA maintain a linear mapping from a LAG ID to the corresponding netdev by setting ds->num_lag_ids to the desired size. In the event that the hardware is not capable of offloading a particular LAG for any reason (the typical case being use of exotic modes like broadcast), DSA will take a hands-off approach, allowing the LAG to be formed as a pure software construct. This is reported back through the extended ACK, but is otherwise transparent to the user. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 60 +++++++++++++++++++++++++++++++++++ net/dsa/dsa2.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa_priv.h | 36 +++++++++++++++++++++ net/dsa/port.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 70 ++++++++++++++++++++++++++++++++++++---- net/dsa/switch.c | 50 +++++++++++++++++++++++++++++ 6 files changed, 381 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index c3485ba6c312..06e3784ec55c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -149,8 +149,41 @@ struct dsa_switch_tree { /* List of DSA links composing the routing table */ struct list_head rtable; + + /* Maps offloaded LAG netdevs to a zero-based linear ID for + * drivers that need it. + */ + struct net_device **lags; + unsigned int lags_len; }; +#define dsa_lags_foreach_id(_id, _dst) \ + for ((_id) = 0; (_id) < (_dst)->lags_len; (_id)++) \ + if ((_dst)->lags[(_id)]) + +#define dsa_lag_foreach_port(_dp, _dst, _lag) \ + list_for_each_entry((_dp), &(_dst)->ports, list) \ + if ((_dp)->lag_dev == (_lag)) + +static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, + unsigned int id) +{ + return dst->lags[id]; +} + +static inline int dsa_lag_id(struct dsa_switch_tree *dst, + struct net_device *lag) +{ + unsigned int id; + + dsa_lags_foreach_id(id, dst) { + if (dsa_lag_dev(dst, id) == lag) + return id; + } + + return -ENODEV; +} + /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, @@ -220,6 +253,8 @@ struct dsa_port { bool devlink_port_setup; struct phylink *pl; struct phylink_config pl_config; + struct net_device *lag_dev; + bool lag_tx_enabled; struct list_head list; @@ -340,6 +375,14 @@ struct dsa_switch { */ bool mtu_enforcement_ingress; + /* Drivers that benefit from having an ID associated with each + * offloaded LAG should set this to the maximum number of + * supported IDs. DSA will then maintain a mapping of _at + * least_ these many IDs, accessible to drivers via + * dsa_lag_id(). + */ + unsigned int num_lag_ids; + size_t num_ports; }; @@ -626,6 +669,13 @@ struct dsa_switch_ops { void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct net_device *br); + int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, + int port); + int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, + int port, struct net_device *lag, + struct netdev_lag_upper_info *info); + int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, + int port, struct net_device *lag); /* * PTP functionality @@ -657,6 +707,16 @@ struct dsa_switch_ops { int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); + + /* + * LAG integration + */ + int (*port_lag_change)(struct dsa_switch *ds, int port); + int (*port_lag_join)(struct dsa_switch *ds, int port, + struct net_device *lag, + struct netdev_lag_upper_info *info); + int (*port_lag_leave)(struct dsa_switch *ds, int port, + struct net_device *lag); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 01f21b0b379a..fd343466df27 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,6 +21,65 @@ static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); +/** + * dsa_lag_map() - Map LAG netdev to a linear LAG ID + * @dst: Tree in which to record the mapping. + * @lag: Netdev that is to be mapped to an ID. + * + * dsa_lag_id/dsa_lag_dev can then be used to translate between the + * two spaces. The size of the mapping space is determined by the + * driver by setting ds->num_lag_ids. It is perfectly legal to leave + * it unset if it is not needed, in which case these functions become + * no-ops. + */ +void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag) +{ + unsigned int id; + + if (dsa_lag_id(dst, lag) >= 0) + /* Already mapped */ + return; + + for (id = 0; id < dst->lags_len; id++) { + if (!dsa_lag_dev(dst, id)) { + dst->lags[id] = lag; + return; + } + } + + /* No IDs left, which is OK. Some drivers do not need it. The + * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id + * returns an error for this device when joining the LAG. The + * driver can then return -EOPNOTSUPP back to DSA, which will + * fall back to a software LAG. + */ +} + +/** + * dsa_lag_unmap() - Remove a LAG ID mapping + * @dst: Tree in which the mapping is recorded. + * @lag: Netdev that was mapped. + * + * As there may be multiple users of the mapping, it is only removed + * if there are no other references to it. + */ +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag) +{ + struct dsa_port *dp; + unsigned int id; + + dsa_lag_foreach_port(dp, dst, lag) + /* There are remaining users of this mapping */ + return; + + dsa_lags_foreach_id(id, dst) { + if (dsa_lag_dev(dst, id) == lag) { + dst->lags[id] = NULL; + break; + } + } +} + struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) { struct dsa_switch_tree *dst; @@ -578,6 +637,32 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) dsa_master_teardown(dp->master); } +static int dsa_tree_setup_lags(struct dsa_switch_tree *dst) +{ + unsigned int len = 0; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->num_lag_ids > len) + len = dp->ds->num_lag_ids; + } + + if (!len) + return 0; + + dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL); + if (!dst->lags) + return -ENOMEM; + + dst->lags_len = len; + return 0; +} + +static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) +{ + kfree(dst->lags); +} + static int dsa_tree_setup(struct dsa_switch_tree *dst) { bool complete; @@ -605,12 +690,18 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) if (err) goto teardown_switches; + err = dsa_tree_setup_lags(dst); + if (err) + goto teardown_master; + dst->setup = true; pr_info("DSA: tree %d setup\n", dst->index); return 0; +teardown_master: + dsa_tree_teardown_master(dst); teardown_switches: dsa_tree_teardown_switches(dst); teardown_default_cpu: @@ -626,6 +717,8 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) if (!dst->setup) return; + dsa_tree_teardown_lags(dst); + dsa_tree_teardown_master(dst); dsa_tree_teardown_switches(dst); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 89143cc049db..2ce46bb87703 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -20,6 +20,9 @@ enum { DSA_NOTIFIER_BRIDGE_LEAVE, DSA_NOTIFIER_FDB_ADD, DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_LAG_CHANGE, + DSA_NOTIFIER_LAG_JOIN, + DSA_NOTIFIER_LAG_LEAVE, DSA_NOTIFIER_MDB_ADD, DSA_NOTIFIER_MDB_DEL, DSA_NOTIFIER_VLAN_ADD, @@ -55,6 +58,15 @@ struct dsa_notifier_mdb_info { int port; }; +/* DSA_NOTIFIER_LAG_* */ +struct dsa_notifier_lag_info { + struct net_device *lag; + int sw_index; + int port; + + struct netdev_lag_upper_info *info; +}; + /* DSA_NOTIFIER_VLAN_* */ struct dsa_notifier_vlan_info { const struct switchdev_obj_port_vlan *vlan; @@ -134,6 +146,11 @@ void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); +int dsa_port_lag_change(struct dsa_port *dp, + struct netdev_lag_lower_state_info *linfo); +int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, + struct netdev_lag_upper_info *uinfo); +void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering); bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); @@ -159,6 +176,22 @@ int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; +static inline bool dsa_port_offloads_netdev(struct dsa_port *dp, + struct net_device *dev) +{ + /* Switchdev offloading can be configured on: */ + + if (dev == dp->slave) + /* DSA ports directly connected to a bridge. */ + return true; + + if (dp->lag_dev == dev) + /* DSA ports connected to a bridge via a LAG */ + return true; + + return false; +} + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); @@ -248,6 +281,9 @@ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); /* dsa2.c */ +void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag); +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag); + extern struct list_head dsa_tree_list; #endif diff --git a/net/dsa/port.c b/net/dsa/port.c index e59bf66c4c0d..f5b0f72ee7cd 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -191,6 +191,85 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) dsa_port_set_state_now(dp, BR_STATE_FORWARDING); } +int dsa_port_lag_change(struct dsa_port *dp, + struct netdev_lag_lower_state_info *linfo) +{ + struct dsa_notifier_lag_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + }; + bool tx_enabled; + + if (!dp->lag_dev) + return 0; + + /* On statically configured aggregates (e.g. loadbalance + * without LACP) ports will always be tx_enabled, even if the + * link is down. Thus we require both link_up and tx_enabled + * in order to include it in the tx set. + */ + tx_enabled = linfo->link_up && linfo->tx_enabled; + + if (tx_enabled == dp->lag_tx_enabled) + return 0; + + dp->lag_tx_enabled = tx_enabled; + + return dsa_port_notify(dp, DSA_NOTIFIER_LAG_CHANGE, &info); +} + +int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag, + struct netdev_lag_upper_info *uinfo) +{ + struct dsa_notifier_lag_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .lag = lag, + .info = uinfo, + }; + int err; + + dsa_lag_map(dp->ds->dst, lag); + dp->lag_dev = lag; + + err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info); + if (err) { + dp->lag_dev = NULL; + dsa_lag_unmap(dp->ds->dst, lag); + } + + return err; +} + +void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) +{ + struct dsa_notifier_lag_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .lag = lag, + }; + int err; + + if (!dp->lag_dev) + return; + + /* Port might have been part of a LAG that in turn was + * attached to a bridge. + */ + if (dp->bridge_dev) + dsa_port_bridge_leave(dp, dp->bridge_dev); + + dp->lag_tx_enabled = false; + dp->lag_dev = NULL; + + err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); + if (err) + pr_err("DSA: failed to notify DSA_NOTIFIER_LAG_LEAVE: %d\n", + err); + + dsa_lag_unmap(dp->ds->dst, lag); +} + /* Must be called under rcu_read_lock() */ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e53c8ca6eb66..c5c81cba8259 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -273,7 +273,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev, struct dsa_port *dp = dsa_slave_to_port(dev); int ret; - if (attr->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, attr->orig_dev)) return -EOPNOTSUPP; switch (attr->id) { @@ -333,7 +333,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, struct switchdev_obj_port_vlan vlan; int err; - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) @@ -378,7 +378,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; @@ -407,7 +407,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, struct switchdev_obj_port_vlan *vlan; int err; - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) @@ -435,7 +435,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: - if (obj->orig_dev != dev) + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; @@ -1907,6 +1907,46 @@ static int dsa_slave_changeupper(struct net_device *dev, dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } + } else if (netif_is_lag_master(info->upper_dev)) { + if (info->linking) { + err = dsa_port_lag_join(dp, info->upper_dev, + info->upper_info); + if (err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(info->info.extack, + "Offloading not supported"); + err = 0; + } + err = notifier_from_errno(err); + } else { + dsa_port_lag_leave(dp, info->upper_dev); + err = NOTIFY_OK; + } + } + + return err; +} + +static int +dsa_slave_lag_changeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *lower; + struct list_head *iter; + int err = NOTIFY_DONE; + struct dsa_port *dp; + + netdev_for_each_lower_dev(dev, lower, iter) { + if (!dsa_slave_dev_check(lower)) + continue; + + dp = dsa_slave_to_port(lower); + if (!dp->lag_dev) + /* Software LAG */ + continue; + + err = dsa_slave_changeupper(lower, info); + if (notifier_to_errno(err)) + break; } return err; @@ -2004,10 +2044,26 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, break; } case NETDEV_CHANGEUPPER: + if (dsa_slave_dev_check(dev)) + return dsa_slave_changeupper(dev, ptr); + + if (netif_is_lag_master(dev)) + return dsa_slave_lag_changeupper(dev, ptr); + + break; + case NETDEV_CHANGELOWERSTATE: { + struct netdev_notifier_changelowerstate_info *info = ptr; + struct dsa_port *dp; + int err; + if (!dsa_slave_dev_check(dev)) - return NOTIFY_DONE; + break; - return dsa_slave_changeupper(dev, ptr); + dp = dsa_slave_to_port(dev); + + err = dsa_port_lag_change(dp, info->lower_state_info); + return notifier_from_errno(err); + } } return NOTIFY_DONE; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 21d2f842d068..cc0b25f3adea 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -166,6 +166,47 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); } +static int dsa_switch_lag_change(struct dsa_switch *ds, + struct dsa_notifier_lag_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_lag_change) + return ds->ops->port_lag_change(ds, info->port); + + if (ds->index != info->sw_index && ds->ops->crosschip_lag_change) + return ds->ops->crosschip_lag_change(ds, info->sw_index, + info->port); + + return 0; +} + +static int dsa_switch_lag_join(struct dsa_switch *ds, + struct dsa_notifier_lag_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_lag_join) + return ds->ops->port_lag_join(ds, info->port, info->lag, + info->info); + + if (ds->index != info->sw_index && ds->ops->crosschip_lag_join) + return ds->ops->crosschip_lag_join(ds, info->sw_index, + info->port, info->lag, + info->info); + + return 0; +} + +static int dsa_switch_lag_leave(struct dsa_switch *ds, + struct dsa_notifier_lag_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_lag_leave) + return ds->ops->port_lag_leave(ds, info->port, info->lag); + + if (ds->index != info->sw_index && ds->ops->crosschip_lag_leave) + return ds->ops->crosschip_lag_leave(ds, info->sw_index, + info->port, info->lag); + + return 0; +} + static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, struct dsa_notifier_mdb_info *info) { @@ -278,6 +319,15 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_FDB_DEL: err = dsa_switch_fdb_del(ds, info); break; + case DSA_NOTIFIER_LAG_CHANGE: + err = dsa_switch_lag_change(ds, info); + break; + case DSA_NOTIFIER_LAG_JOIN: + err = dsa_switch_lag_join(ds, info); + break; + case DSA_NOTIFIER_LAG_LEAVE: + err = dsa_switch_lag_leave(ds, info); + break; case DSA_NOTIFIER_MDB_ADD: err = dsa_switch_mdb_add(ds, info); break; -- cgit v1.2.3 From 91c960b0056672e74627776655c926388350fa30 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:44 +0000 Subject: bpf: Rename BPF_XADD and prepare to encode other atomics in .imm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A subsequent patch will add additional atomic operations. These new operations will use the same opcode field as the existing XADD, with the immediate discriminating different operations. In preparation, rename the instruction mode BPF_ATOMIC and start calling the zero immediate BPF_ADD. This is possible (doesn't break existing valid BPF progs) because the immediate field is currently reserved MBZ and BPF_ADD is zero. All uses are removed from the tree but the BPF_XADD definition is kept around to avoid breaking builds for people including kernel headers. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210114181751.768687-5-jackmanb@google.com --- Documentation/networking/filter.rst | 30 ++++++++------ arch/arm/net/bpf_jit_32.c | 7 ++-- arch/arm64/net/bpf_jit_comp.c | 16 ++++++-- arch/mips/net/ebpf_jit.c | 11 ++++-- arch/powerpc/net/bpf_jit_comp64.c | 25 +++++++++--- arch/riscv/net/bpf_jit_comp32.c | 20 ++++++++-- arch/riscv/net/bpf_jit_comp64.c | 16 ++++++-- arch/s390/net/bpf_jit_comp.c | 27 +++++++------ arch/sparc/net/bpf_jit_comp_64.c | 17 ++++++-- arch/x86/net/bpf_jit_comp.c | 46 ++++++++++++++++------ arch/x86/net/bpf_jit_comp32.c | 6 +-- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 14 +++++-- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 +- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 15 ++++--- include/linux/filter.h | 16 ++++++-- include/uapi/linux/bpf.h | 5 ++- kernel/bpf/core.c | 31 ++++++++++----- kernel/bpf/disasm.c | 6 ++- kernel/bpf/verifier.c | 24 ++++++----- lib/test_bpf.c | 14 +++---- samples/bpf/bpf_insn.h | 4 +- samples/bpf/cookie_uid_helper_example.c | 8 ++-- samples/bpf/sock_example.c | 2 +- samples/bpf/test_cgrp2_attach.c | 5 ++- tools/include/linux/filter.h | 15 +++++-- tools/include/uapi/linux/bpf.h | 5 ++- .../selftests/bpf/prog_tests/cgroup_attach_multi.c | 4 +- tools/testing/selftests/bpf/test_cgroup_storage.c | 2 +- tools/testing/selftests/bpf/verifier/ctx.c | 7 ++-- .../selftests/bpf/verifier/direct_packet_access.c | 4 +- tools/testing/selftests/bpf/verifier/leak_ptr.c | 10 ++--- tools/testing/selftests/bpf/verifier/meta_access.c | 4 +- tools/testing/selftests/bpf/verifier/unpriv.c | 3 +- .../selftests/bpf/verifier/value_illegal_alu.c | 2 +- tools/testing/selftests/bpf/verifier/xadd.c | 18 ++++----- 35 files changed, 291 insertions(+), 152 deletions(-) (limited to 'include') diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index debb59e374de..1583d59d806d 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1006,13 +1006,13 @@ Size modifier is one of ... Mode modifier is one of:: - BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 - BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ - BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ - BPF_XADD 0xc0 /* eBPF only, exclusive add */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ + BPF_ABS 0x20 + BPF_IND 0x40 + BPF_MEM 0x60 + BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ + BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ + BPF_ATOMIC 0xc0 /* eBPF only, atomic operations */ eBPF has two non-generic instructions: (BPF_ABS | | BPF_LD) and (BPF_IND | | BPF_LD) which are used to access packet data. @@ -1044,11 +1044,19 @@ Unlike classic BPF instruction set, eBPF has generic load/store operations:: BPF_MEM | | BPF_STX: *(size *) (dst_reg + off) = src_reg BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 BPF_MEM | | BPF_LDX: dst_reg = *(size *) (src_reg + off) - BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg - BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg -Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and -2 byte atomic increments are not supported. +Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. + +It also includes atomic operations, which use the immediate field for extra +encoding. + + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + +Note that 1 and 2 byte atomic operations are not supported. + +You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to +the exclusive-add operation encoded when the immediate field is zero. eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 0207b6ea6e8a..897634d0a67c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1620,10 +1620,9 @@ exit: } emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code)); break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + /* Atomic ops */ + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_W: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index ef9f1d5e989d..f7b194878a99 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -875,10 +875,18 @@ emit_cond_jmp: } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op code %02x\n", insn->imm); + return -EINVAL; + } + + /* STX XADD: lock *(u32 *)(dst + off) += src + * and + * STX XADD: lock *(u64 *)(dst + off) += src + */ + if (!off) { reg = dst; } else { diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 561154cbcc40..939dd06764bc 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1423,8 +1423,8 @@ jeq_common: case BPF_STX | BPF_H | BPF_MEM: case BPF_STX | BPF_W | BPF_MEM: case BPF_STX | BPF_DW | BPF_MEM: - case BPF_STX | BPF_W | BPF_XADD: - case BPF_STX | BPF_DW | BPF_XADD: + case BPF_STX | BPF_W | BPF_ATOMIC: + case BPF_STX | BPF_DW | BPF_ATOMIC: if (insn->dst_reg == BPF_REG_10) { ctx->flags |= EBPF_SEEN_FP; dst = MIPS_R_SP; @@ -1438,7 +1438,12 @@ jeq_common: src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); if (src < 0) return src; - if (BPF_MODE(insn->code) == BPF_XADD) { + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + if (insn->imm != BPF_ADD) { + pr_err("ATOMIC OP %02x NOT HANDLED\n", insn->imm); + return -EINVAL; + } + /* * If mem_off does not fit within the 9 bit ll/sc * instruction immediate field, use a temp reg. diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 022103c6a201..aaf1a887f653 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -683,10 +683,18 @@ emit_clear: break; /* - * BPF_STX XADD (atomic_add) + * BPF_STX ATOMIC (atomic ops) */ - /* *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -ENOTSUPP; + } + + /* *(u32 *)(dst + off) += src */ + /* Get EA into TMP_REG_1 */ EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off)); tmp_idx = ctx->idx * 4; @@ -699,8 +707,15 @@ emit_clear: /* we're done if this succeeded */ PPC_BCC_SHORT(COND_NE, tmp_idx); break; - /* *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -ENOTSUPP; + } + /* *(u64 *)(dst + off) += src */ + EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off)); tmp_idx = ctx->idx * 4; EMIT(PPC_RAW_LDARX(b2p[TMP_REG_2], 0, b2p[TMP_REG_1], 0)); diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 579575f9cdae..81de865f4c7c 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -881,7 +881,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off, const s8 *rd = bpf_get_reg64(dst, tmp1, ctx); const s8 *rs = bpf_get_reg64(src, tmp2, ctx); - if (mode == BPF_XADD && size != BPF_W) + if (mode == BPF_ATOMIC && size != BPF_W) return -1; emit_imm(RV_REG_T0, off, ctx); @@ -899,7 +899,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off, case BPF_MEM: emit(rv_sw(RV_REG_T0, 0, lo(rs)), ctx); break; - case BPF_XADD: + case BPF_ATOMIC: /* Only BPF_ADD supported */ emit(rv_amoadd_w(RV_REG_ZERO, lo(rs), RV_REG_T0, 0, 0), ctx); break; @@ -1260,7 +1260,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_W: case BPF_STX | BPF_MEM | BPF_DW: - case BPF_STX | BPF_XADD | BPF_W: if (BPF_CLASS(code) == BPF_ST) { emit_imm32(tmp2, imm, ctx); src = tmp2; @@ -1271,8 +1270,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_info_once( + "bpf-jit: not supported: atomic operation %02x ***\n", + insn->imm); + return -EFAULT; + } + + if (emit_store_r64(dst, src, off, ctx, BPF_SIZE(code), + BPF_MODE(code))) + return -1; + break; + /* No hardware support for 8-byte atomics in RV32. */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_DW: /* Fallthrough. */ notsupported: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 8a56b5293117..b44ff52f84a6 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1027,10 +1027,18 @@ out_be: emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); emit_sd(RV_REG_T1, 0, rs, ctx); break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err("bpf-jit: not supported: atomic operation %02x ***\n", + insn->imm); + return -EINVAL; + } + + /* atomic_add: lock *(u32 *)(dst + off) += src + * atomic_add: lock *(u64 *)(dst + off) += src + */ + if (off) { if (is_12b_int(off)) { emit_addi(RV_REG_T1, rd, off, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0a4182792876..f973e2ead197 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1205,18 +1205,23 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_MEM; break; /* - * BPF_STX XADD (atomic_add) + * BPF_ATOMIC */ - case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */ - /* laal %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg, - dst_reg, off); - jit->seen |= SEEN_MEM; - break; - case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */ - /* laalg %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg, - dst_reg, off); + case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_err("Unknown atomic operation %02x\n", insn->imm); + return -1; + } + + /* *(u32/u64 *)(dst + off) += src + * + * BFW_W: laal %w0,%src,off(%dst) + * BPF_DW: laalg %w0,%src,off(%dst) + */ + EMIT6_DISP_LH(0xeb000000, + BPF_SIZE(insn->code) == BPF_W ? 0x00fa : 0x00ea, + REG_W0, src_reg, dst_reg, off); jit->seen |= SEEN_MEM; break; /* diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 3364e2a00989..4b8d3c65d266 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1366,12 +1366,18 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: { + case BPF_STX | BPF_ATOMIC | BPF_W: { const u8 tmp = bpf2sparc[TMP_REG_1]; const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op %02x\n", insn->imm); + return -EINVAL; + } + + /* lock *(u32 *)(dst + off) += src */ + if (insn->dst_reg == BPF_REG_FP) ctx->saw_frame_pointer = true; @@ -1390,11 +1396,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: { + case BPF_STX | BPF_ATOMIC | BPF_DW: { const u8 tmp = bpf2sparc[TMP_REG_1]; const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op %02x\n", insn->imm); + return -EINVAL; + } + if (insn->dst_reg == BPF_REG_FP) ctx->saw_frame_pointer = true; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 93f32e0ba0ef..b1829a534da1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -795,6 +795,33 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +static int emit_atomic(u8 **pprog, u8 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +{ + u8 *prog = *pprog; + int cnt = 0; + + EMIT1(0xF0); /* lock prefix */ + + maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + /* lock *(u32/u64*)(dst_reg + off) = src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + + emit_insn_suffix(&prog, dst_reg, src_reg, off); + + *pprog = prog; + return 0; +} + static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) @@ -839,6 +866,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + int err; detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -1250,18 +1278,12 @@ st: if (is_imm8(insn->off)) } break; - /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ - case BPF_STX | BPF_XADD | BPF_W: - /* Emit 'lock add dword ptr [rax + off], eax' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); - else - EMIT2(0xF0, 0x01); - goto xadd; - case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: - emit_modrm_dstoff(&prog, dst_reg, src_reg, insn->off); + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + if (err) + return err; break; /* call */ diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 96fde03aa987..d17b67c69f89 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2243,10 +2243,8 @@ emit_jmp: return -EFAULT; } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; case BPF_JMP | BPF_EXIT: if (seen_exit) { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 0a721f6e8676..e31f8fbbc696 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -3109,13 +3109,19 @@ mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64) return 0; } -static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + if (meta->insn.imm != BPF_ADD) + return -EOPNOTSUPP; + return mem_xadd(nfp_prog, meta, false); } -static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + if (meta->insn.imm != BPF_ADD) + return -EOPNOTSUPP; + return mem_xadd(nfp_prog, meta, true); } @@ -3475,8 +3481,8 @@ static const instr_cb_t instr_cb[256] = { [BPF_STX | BPF_MEM | BPF_H] = mem_stx2, [BPF_STX | BPF_MEM | BPF_W] = mem_stx4, [BPF_STX | BPF_MEM | BPF_DW] = mem_stx8, - [BPF_STX | BPF_XADD | BPF_W] = mem_xadd4, - [BPF_STX | BPF_XADD | BPF_DW] = mem_xadd8, + [BPF_STX | BPF_ATOMIC | BPF_W] = mem_atomic4, + [BPF_STX | BPF_ATOMIC | BPF_DW] = mem_atomic8, [BPF_ST | BPF_MEM | BPF_B] = mem_st1, [BPF_ST | BPF_MEM | BPF_H] = mem_st2, [BPF_ST | BPF_MEM | BPF_W] = mem_st4, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index fac9c6f9e197..d0e17eebddd9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -428,9 +428,9 @@ static inline bool is_mbpf_classic_store_pkt(const struct nfp_insn_meta *meta) return is_mbpf_classic_store(meta) && meta->ptr.type == PTR_TO_PACKET; } -static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) +static inline bool is_mbpf_atomic(const struct nfp_insn_meta *meta) { - return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD); + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_ATOMIC); } static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index e92ee510fd52..9d235c0ce46a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -479,7 +479,7 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, pr_vlog(env, "map writes not supported\n"); return -EOPNOTSUPP; } - if (is_mbpf_xadd(meta)) { + if (is_mbpf_atomic(meta)) { err = nfp_bpf_map_mark_used(env, meta, reg, NFP_MAP_USE_ATOMIC_CNT); if (err) @@ -523,12 +523,17 @@ exit_check_ptr: } static int -nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, - struct bpf_verifier_env *env) +nfp_bpf_check_atomic(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + struct bpf_verifier_env *env) { const struct bpf_reg_state *sreg = cur_regs(env) + meta->insn.src_reg; const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg; + if (meta->insn.imm != BPF_ADD) { + pr_vlog(env, "atomic op not implemented: %d\n", meta->insn.imm); + return -EOPNOTSUPP; + } + if (dreg->type != PTR_TO_MAP_VALUE) { pr_vlog(env, "atomic add not to a map value pointer: %d\n", dreg->type); @@ -655,8 +660,8 @@ int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, if (is_mbpf_store(meta)) return nfp_bpf_check_store(nfp_prog, meta, env); - if (is_mbpf_xadd(meta)) - return nfp_bpf_check_xadd(nfp_prog, meta, env); + if (is_mbpf_atomic(meta)) + return nfp_bpf_check_atomic(nfp_prog, meta, env); if (is_mbpf_alu(meta)) return nfp_bpf_check_alu(nfp_prog, meta, env); diff --git a/include/linux/filter.h b/include/linux/filter.h index 5edf2b660881..392e94b79668 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -259,15 +259,23 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 69c3c308de5e..4836ebf459cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1309,8 +1309,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, XADD, W), \ - INSN_3(STX, XADD, DW), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -1618,13 +1618,25 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); + STX_ATOMIC_W: + switch (IMM) { + case BPF_ADD: + /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + default: + goto default_label; + } CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); + STX_ATOMIC_DW: + switch (IMM) { + case BPF_ADD: + /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + default: + goto default_label; + } CONT; default_label: @@ -1634,7 +1646,8 @@ out: * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ - pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", + insn->code, insn->imm); BUG_ON(1); return 0; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index b44d8c447afd..37c8d6e9b4cc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -153,14 +153,16 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) + else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_ADD) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else + } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ae2aee48cf82..cfc137b81ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3604,13 +3604,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int err; - if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + if (insn->imm != BPF_ADD) { + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; } @@ -3633,19 +3637,19 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } - /* check whether atomic_add can read the memory */ + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true); if (err) return err; - /* check whether atomic_add can write into the same memory */ + /* check whether we can write into the same memory */ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true); } @@ -9524,8 +9528,8 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; @@ -10010,7 +10014,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { + BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index ca7d635bccd9..49ec9e8d8aed 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4295,13 +4295,13 @@ static struct bpf_test tests[] = { { { 0, 0xffffffff } }, .stack_depth = 40, }, - /* BPF_STX | BPF_XADD | BPF_W/DW */ + /* BPF_STX | BPF_ATOMIC | BPF_W/DW */ { "STX_XADD_W: Test: 0x12 + 0x10 = 0x22", .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_LDX_MEM(BPF_W, R0, R10, -40), BPF_EXIT_INSN(), }, @@ -4316,7 +4316,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, R1, R10), BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_ALU64_REG(BPF_MOV, R0, R10), BPF_ALU64_REG(BPF_SUB, R0, R1), BPF_EXIT_INSN(), @@ -4331,7 +4331,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_EXIT_INSN(), }, INTERNAL, @@ -4352,7 +4352,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_LDX_MEM(BPF_DW, R0, R10, -40), BPF_EXIT_INSN(), }, @@ -4367,7 +4367,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, R1, R10), BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_ALU64_REG(BPF_MOV, R0, R10), BPF_ALU64_REG(BPF_SUB, R0, R1), BPF_EXIT_INSN(), @@ -4382,7 +4382,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_EXIT_INSN(), }, INTERNAL, diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index 544237980582..db67a2847395 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -138,11 +138,11 @@ struct bpf_insn; #define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = BPF_ADD }) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index deb0e3e0324d..c5ff7a13918c 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -147,12 +147,12 @@ static void prog_load(void) */ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, packets)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, packets)), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, bytes)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, bytes)), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct __sk_buff, len)), BPF_EXIT_INSN(), diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 00aae1d33fca..23d1930e1927 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -54,7 +54,7 @@ static int test_sock(void) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), }; diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 20fbd1241db3..390ff38d2ac6 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -53,7 +53,7 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), /* Count bytes */ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ @@ -64,7 +64,8 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ BPF_EXIT_INSN(), diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index ca28b6ab8db7..e870c9039f0d 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -169,15 +169,22 @@ .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + */ -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index b549fcfacc0b..0a1fc9816cef 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -45,13 +45,13 @@ static int prog_load_cnt(int verdict, int val) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, val), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index d946252a25bb..0cda61da5d39 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -29,7 +29,7 @@ int main(int argc, char **argv) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c index 93d6b1641481..23080862aafd 100644 --- a/tools/testing/selftests/bpf/verifier/ctx.c +++ b/tools/testing/selftests/bpf/verifier/ctx.c @@ -10,14 +10,13 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "context stores via XADD", + "context stores via BPF_ATOMIC", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1, - BPF_REG_0, offsetof(struct __sk_buff, mark), 0), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index ae72536603fe..ac1e19d0f520 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -333,7 +333,7 @@ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_4, BPF_REG_5, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0), BPF_MOV64_IMM(BPF_REG_0, 0), @@ -488,7 +488,7 @@ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 11), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_4, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_4, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 49), BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2), diff --git a/tools/testing/selftests/bpf/verifier/leak_ptr.c b/tools/testing/selftests/bpf/verifier/leak_ptr.c index d6eec17f2cd2..73f0dea95546 100644 --- a/tools/testing/selftests/bpf/verifier/leak_ptr.c +++ b/tools/testing/selftests/bpf/verifier/leak_ptr.c @@ -5,7 +5,7 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), BPF_LD_MAP_FD(BPF_REG_2, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2, + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_2, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, @@ -13,7 +13,7 @@ .errstr_unpriv = "R2 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", }, { "leak pointer into ctx 2", @@ -21,14 +21,14 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), - BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10, + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_10, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, .errstr_unpriv = "R10 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", }, { "leak pointer into ctx 3", @@ -56,7 +56,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_MOV64_IMM(BPF_REG_3, 0), BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_6, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/meta_access.c b/tools/testing/selftests/bpf/verifier/meta_access.c index 205292b8dd65..b45e8af41420 100644 --- a/tools/testing/selftests/bpf/verifier/meta_access.c +++ b/tools/testing/selftests/bpf/verifier/meta_access.c @@ -171,7 +171,7 @@ BPF_MOV64_IMM(BPF_REG_5, 42), BPF_MOV64_IMM(BPF_REG_6, 24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5), @@ -196,7 +196,7 @@ BPF_MOV64_IMM(BPF_REG_5, 42), BPF_MOV64_IMM(BPF_REG_6, 24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5), diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index a3fe0fbaed41..ee298627abae 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -207,7 +207,8 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_10, BPF_REG_0, -8, 0), + BPF_RAW_INSN(BPF_STX | BPF_ATOMIC | BPF_DW, + BPF_REG_10, BPF_REG_0, -8, BPF_ADD), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c index ed1c2cea1dea..489062867218 100644 --- a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c +++ b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c @@ -82,7 +82,7 @@ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_2, BPF_REG_3, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_2, BPF_REG_3, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/xadd.c b/tools/testing/selftests/bpf/verifier/xadd.c index c5de2e62cc8b..b96ef3526815 100644 --- a/tools/testing/selftests/bpf/verifier/xadd.c +++ b/tools/testing/selftests/bpf/verifier/xadd.c @@ -3,7 +3,7 @@ .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -7), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_EXIT_INSN(), }, @@ -22,7 +22,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 3), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), BPF_EXIT_INSN(), }, @@ -45,13 +45,13 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0), - BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1), - BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 1), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1), BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_XADD stores into R2 pkt is not allowed", + .errstr = "BPF_ATOMIC stores into R2 pkt is not allowed", .prog_type = BPF_PROG_TYPE_XDP, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -62,8 +62,8 @@ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3), BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), @@ -82,8 +82,8 @@ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3), BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), -- cgit v1.2.3 From 5ca419f2864a2c60940dcf4bbaeb69546200e36f Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:46 +0000 Subject: bpf: Add BPF_FETCH field / create atomic_fetch_add instruction The BPF_FETCH field can be set in bpf_insn.imm, for BPF_ATOMIC instructions, in order to have the previous value of the atomically-modified memory location loaded into the src register after an atomic op is carried out. Suggested-by: Yonghong Song Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-7-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 4 ++++ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/core.c | 13 +++++++++++++ kernel/bpf/disasm.c | 7 +++++++ kernel/bpf/verifier.c | 33 ++++++++++++++++++++++++--------- tools/include/linux/filter.h | 1 + tools/include/uapi/linux/bpf.h | 3 +++ 8 files changed, 56 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b1829a534da1..eea7d8b0bb12 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -811,6 +811,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index 392e94b79668..23fca41b8540 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,6 +264,7 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4836ebf459cf..28d6000463e4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1624,16 +1624,29 @@ out: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ atomic_add((u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); + break; + case BPF_ADD | BPF_FETCH: + SRC = (u32) atomic_fetch_add( + (u32) SRC, + (atomic_t *)(unsigned long) (DST + insn->off)); + break; default: goto default_label; } CONT; + STX_ATOMIC_DW: switch (IMM) { case BPF_ADD: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); + break; + case BPF_ADD | BPF_FETCH: + SRC = (u64) atomic64_fetch_add( + (u64) SRC, + (atomic64_t *)(unsigned long) (DST + insn->off)); + break; default: goto default_label; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 37c8d6e9b4cc..d2e20f6d0516 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -160,6 +160,13 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == (BPF_ADD | BPF_FETCH)) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8a85f4e5b95..6aa1fc919761 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3608,7 +3608,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i { int err; - if (insn->imm != BPF_ADD) { + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + break; + default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); return -EINVAL; } @@ -3650,8 +3654,20 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; /* check whether we can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err; + + if (!(insn->imm & BPF_FETCH)) + return 0; + + /* check and record load of old value into src reg */ + err = check_reg_arg(env, insn->src_reg, DST_OP); + if (err) + return err; + + return 0; } static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, @@ -9528,12 +9544,6 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); if (err) @@ -9542,6 +9552,11 @@ static int do_check(struct bpf_verifier_env *env) continue; } + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index e870c9039f0d..7211ce9fba53 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,6 +173,7 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, -- cgit v1.2.3 From 5ffa25502b5ab3d639829a2d1e316cff7f59a41e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:47 +0000 Subject: bpf: Add instructions for atomic_[cmp]xchg This adds two atomic opcodes, both of which include the BPF_FETCH flag. XCHG without the BPF_FETCH flag would naturally encode atomic_set. This is not supported because it would be of limited value to userspace (it doesn't imply any barriers). CMPXCHG without BPF_FETCH woulud be an atomic compare-and-write. We don't have such an operation in the kernel so it isn't provided to BPF either. There are two significant design decisions made for the CMPXCHG instruction: - To solve the issue that this operation fundamentally has 3 operands, but we only have two register fields. Therefore the operand we compare against (the kernel's API calls it 'old') is hard-coded to be R0. x86 has similar design (and A64 doesn't have this problem). A potential alternative might be to encode the other operand's register number in the immediate field. - The kernel's atomic_cmpxchg returns the old value, while the C11 userspace APIs return a boolean indicating the comparison result. Which should BPF do? A64 returns the old value. x86 returns the old value in the hard-coded register (and also sets a flag). That means return-old-value is easier to JIT, so that's what we use. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-8-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 8 ++++++++ include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 4 +++- kernel/bpf/core.c | 20 ++++++++++++++++++++ kernel/bpf/disasm.c | 15 +++++++++++++++ kernel/bpf/verifier.c | 19 +++++++++++++++++-- tools/include/linux/filter.h | 2 ++ tools/include/uapi/linux/bpf.h | 4 +++- 8 files changed, 70 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index eea7d8b0bb12..308241187582 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ EMIT2(0x0F, 0xC1); break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index 23fca41b8540..d563820f197d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 28d6000463e4..4df6daba43ef 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1630,6 +1630,16 @@ out: (u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); break; + case BPF_XCHG: + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + break; + case BPF_CMPXCHG: + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + break; default: goto default_label; } @@ -1647,6 +1657,16 @@ out: (u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); break; + case BPF_XCHG: + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); + break; + case BPF_CMPXCHG: + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); + break; default: goto default_label; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d2e20f6d0516..ee8d1132767b 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -167,6 +167,21 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG) { + verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", + insn->code, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_XCHG) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6aa1fc919761..89a4d154ab37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3606,11 +3606,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err; switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: break; default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); @@ -3632,6 +3635,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@ -3662,8 +3672,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (!(insn->imm & BPF_FETCH)) return 0; - /* check and record load of old value into src reg */ - err = check_reg_arg(env, insn->src_reg, DST_OP); + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); if (err) return err; diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 7211ce9fba53..d75998b0d5ac 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -174,6 +174,8 @@ * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { -- cgit v1.2.3 From 981f94c3e92146705baf97fb417a5ed1ab1a79a5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:49 +0000 Subject: bpf: Add bitwise atomic instructions This adds instructions for atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-10-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 50 +++++++++++++++++++++++++++++++++++++++++++- include/linux/filter.h | 6 ++++++ kernel/bpf/core.c | 3 +++ kernel/bpf/disasm.c | 21 +++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/linux/filter.h | 6 ++++++ 6 files changed, 87 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 308241187582..1d4d50199293 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -808,6 +808,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* emit opcode */ switch (atomic_op) { case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; @@ -1292,8 +1296,52 @@ st: if (is_imm8(insn->off)) case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) { + u8 *branch_target; + bool is64 = BPF_SIZE(insn->code) == BPF_DW; + + /* + * Can't be implemented with a single x86 insn. + * Need to do a CMPXCHG loop. + */ + + /* Will need RAX as a CMPXCHG operand so save R0 */ + emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + branch_target = prog; + /* Load old value */ + emit_ldx(&prog, BPF_SIZE(insn->code), + BPF_REG_0, dst_reg, insn->off); + /* + * Perform the (commutative) operation locally, + * put the result in the AUX_REG. + */ + emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); + maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], + add_2reg(0xC0, AUX_REG, src_reg)); + /* Attempt to swap in new value */ + err = emit_atomic(&prog, BPF_CMPXCHG, + dst_reg, AUX_REG, insn->off, + BPF_SIZE(insn->code)); + if (WARN_ON(err)) + return err; + /* + * ZF tells us whether we won the race. If it's + * cleared we need to try again. + */ + EMIT2(X86_JNE, -(prog - branch_target) - 2); + /* Return the pre-modification value */ + emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); + /* Restore R0 after clobbering RAX */ + emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); + break; + + } + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; diff --git a/include/linux/filter.h b/include/linux/filter.h index d563820f197d..7fdce5407214 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,7 +264,13 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8669e685825f..5bbd4884ff7a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1642,6 +1642,9 @@ out: STX_ATOMIC_W: switch (IMM) { ATOMIC_ALU_OP(BPF_ADD, add) + ATOMIC_ALU_OP(BPF_AND, and) + ATOMIC_ALU_OP(BPF_OR, or) + ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ee8d1132767b..19ff8fed7f4b 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_atomic_alu_string[16] = { + [BPF_ADD >> 4] = "add", + [BPF_AND >> 4] = "and", + [BPF_OR >> 4] = "or", + [BPF_XOR >> 4] = "or", +}; + static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", @@ -154,17 +161,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == BPF_ADD) { - verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + (insn->imm == BPF_ADD || insn->imm == BPF_ADD || + insn->imm == BPF_OR || insn->imm == BPF_XOR)) { + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, + bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == (BPF_ADD | BPF_FETCH)) { - verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n", + (insn->imm == (BPF_ADD | BPF_FETCH) || + insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH))) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89a4d154ab37..0f82d5d46e2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3612,6 +3612,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: case BPF_XCHG: case BPF_CMPXCHG: break; diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index d75998b0d5ac..736bdeccdfe4 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,7 +173,13 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ -- cgit v1.2.3 From bd7525dacd7e204f8cae061941fb9001c89d6988 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:42 +0100 Subject: bpf: Move stack_map_get_build_id into lib Moving stack_map_get_build_id into lib with declaration in linux/buildid.h header: int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); This function returns build id for given struct vm_area_struct. There is no functional change to stack_map_get_build_id function. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210114134044.1418404-2-jolsa@kernel.org --- include/linux/buildid.h | 11 ++++ kernel/bpf/stackmap.c | 143 ++---------------------------------------------- lib/Makefile | 3 +- lib/buildid.c | 136 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 140 deletions(-) create mode 100644 include/linux/buildid.h create mode 100644 lib/buildid.c (limited to 'include') diff --git a/include/linux/buildid.h b/include/linux/buildid.h new file mode 100644 index 000000000000..08028a212589 --- /dev/null +++ b/include/linux/buildid.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BUILDID_H +#define _LINUX_BUILDID_H + +#include + +#define BUILD_ID_SIZE_MAX 20 + +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..55d254a59f07 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,9 @@ #include #include #include -#include -#include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -143,140 +142,6 @@ free_smap: return ERR_PTR(err); } -#define BPF_BUILD_ID 3 -/* - * Parse build id from the note segment. This logic can be shared between - * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are - * identical. - */ -static inline int stack_map_parse_build_id(void *page_addr, - unsigned char *build_id, - void *note_start, - Elf32_Word note_size) -{ - Elf32_Word note_offs = 0, new_offs; - - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); - - if (nhdr->n_type == BPF_BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BPF_BUILD_ID_SIZE - nhdr->n_descsz); - return 0; - } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; - } - return -EINVAL; -} - -/* Parse build ID from 32-bit ELF */ -static int stack_map_get_build_id_32(void *page_addr, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) - return -EINVAL; - - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID from 64-bit ELF */ -static int stack_map_get_build_id_64(void *page_addr, - unsigned char *build_id) -{ - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) - return -EINVAL; - - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID of ELF file mapped to vma */ -static int stack_map_get_build_id(struct vm_area_struct *vma, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; - int ret; - - /* only works for page backed storage */ - if (!vma->vm_file) - return -EINVAL; - - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ - - ret = -EINVAL; - page_addr = kmap_atomic(page); - ehdr = (Elf32_Ehdr *)page_addr; - - /* compare magic x7f "ELF" */ - if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) - goto out; - - /* only support executable file and shared object file */ - if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) - goto out; - - if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = stack_map_get_build_id_32(page_addr, build_id); - else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = stack_map_get_build_id_64(page_addr, build_id); -out: - kunmap_atomic(page_addr); - put_page(page); - return ret; -} - static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { @@ -317,18 +182,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] diff --git a/lib/Makefile b/lib/Makefile index afeff05fa8c5..a6b160c3a4fa 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -36,7 +36,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \ + buildid.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/buildid.c b/lib/buildid.c new file mode 100644 index 000000000000..4a4f520c0e29 --- /dev/null +++ b/lib/buildid.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#define BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BUILD_ID_SIZE_MAX - nhdr->n_descsz); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int get_build_id_32(void *page_addr, unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int get_build_id_64(void *page_addr, unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = kmap_atomic(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64(page_addr, build_id); +out: + kunmap_atomic(page_addr); + put_page(page); + return ret; +} -- cgit v1.2.3 From 921f88fc891922b325b3668cd026a386571ed602 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:43 +0100 Subject: bpf: Add size arg to build_id_parse function It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Adding size argument to build_id_parse function, that returns (if defined) size of the parsed build id, so we can recognize the build id type. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210114134044.1418404-3-jolsa@kernel.org --- include/linux/buildid.h | 3 ++- kernel/bpf/stackmap.c | 2 +- lib/buildid.c | 29 +++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 08028a212589..40232f90db6e 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -6,6 +6,7 @@ #define BUILD_ID_SIZE_MAX 20 -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size); #endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 55d254a59f07..cabaf7db8efc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -189,7 +189,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/lib/buildid.c b/lib/buildid.c index 4a4f520c0e29..6156997c3895 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -12,6 +12,7 @@ */ static inline int parse_build_id(void *page_addr, unsigned char *build_id, + __u32 *size, void *note_start, Elf32_Word note_size) { @@ -38,6 +39,8 @@ static inline int parse_build_id(void *page_addr, nhdr->n_descsz); memset(build_id + nhdr->n_descsz, 0, BUILD_ID_SIZE_MAX - nhdr->n_descsz); + if (size) + *size = nhdr->n_descsz; return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ -50,7 +53,8 @@ static inline int parse_build_id(void *page_addr, } /* Parse build ID from 32-bit ELF */ -static int get_build_id_32(void *page_addr, unsigned char *build_id) +static int get_build_id_32(void *page_addr, unsigned char *build_id, + __u32 *size) { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; Elf32_Phdr *phdr; @@ -65,7 +69,7 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id) for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, + !parse_build_id(page_addr, build_id, size, page_addr + phdr[i].p_offset, phdr[i].p_filesz)) return 0; @@ -74,7 +78,8 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id) } /* Parse build ID from 64-bit ELF */ -static int get_build_id_64(void *page_addr, unsigned char *build_id) +static int get_build_id_64(void *page_addr, unsigned char *build_id, + __u32 *size) { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; Elf64_Phdr *phdr; @@ -89,7 +94,7 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id) for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, + !parse_build_id(page_addr, build_id, size, page_addr + phdr[i].p_offset, phdr[i].p_filesz)) return 0; @@ -97,8 +102,16 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id) return -EINVAL; } -/* Parse build ID of ELF file mapped to vma */ -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) +/* + * Parse build ID of ELF file mapped to vma + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Returns 0 on success, otherwise error (< 0). + */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size) { Elf32_Ehdr *ehdr; struct page *page; @@ -126,9 +139,9 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) goto out; if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(page_addr, build_id); + ret = get_build_id_32(page_addr, build_id, size); else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(page_addr, build_id); + ret = get_build_id_64(page_addr, build_id, size); out: kunmap_atomic(page_addr); put_page(page); -- cgit v1.2.3 From 88a16a1309333e43d328621ece3e9fa37027e8eb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:44 +0100 Subject: perf: Add build id data in mmap2 event Adding support to carry build id data in mmap2 event. The build id data replaces maj/min/ino/ino_generation fields, which are also used to identify map's binary, so it's ok to replace them with build id data: union { struct { u32 maj; u32 min; u64 ino; u64 ino_generation; }; struct { u8 build_id_size; u8 __reserved_1; u16 __reserved_2; u8 build_id[20]; }; }; Replaced maj/min/ino/ino_generation fields give us size of 24 bytes. We use 20 bytes for build id data, 1 byte for size and rest is unused. There's new misc bit for mmap2 to signal there's build id data in it: #define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210114134044.1418404-4-jolsa@kernel.org --- include/uapi/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++----- kernel/events/core.c | 32 +++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b15e3447cd9f..cb6f84103560 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -386,7 +386,8 @@ struct perf_event_attr { aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - __reserved_1 : 30; + build_id : 1, /* use build id in mmap2 events */ + __reserved_1 : 29; union { __u32 wakeup_events; /* wakeup every n events */ @@ -659,6 +660,22 @@ struct perf_event_mmap_page { __u64 aux_size; }; +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) @@ -690,6 +707,7 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event * * * PERF_RECORD_MISC_EXACT_IP: @@ -699,9 +717,13 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ @@ -915,10 +937,20 @@ enum perf_event_type { * u64 addr; * u64 len; * u64 pgoff; - * u32 maj; - * u32 min; - * u64 ino; - * u64 ino_generation; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; * u32 prot, flags; * char filename[]; * struct sample_id sample_id; diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..c37401e3e5f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "internal.h" @@ -397,6 +398,7 @@ static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4673,6 +4675,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) @@ -8046,6 +8050,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -8077,6 +8083,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -8101,13 +8108,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -8236,6 +8255,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -11172,6 +11194,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if (event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) -- cgit v1.2.3 From 54a52823a2d606871c33ef9e2793299768d5d896 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Thu, 14 Jan 2021 13:57:32 -0600 Subject: dsa: add support for Arrow XRS700x tag trailer Add support for Arrow SpeedChips XRS700x single byte tag trailer. This is modeled on tag_trailer.c which works in a similar way. Signed-off-by: George McCollister Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 ++ net/dsa/Kconfig | 6 +++++ net/dsa/Makefile | 1 + net/dsa/tag_xrs700x.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100644 net/dsa/tag_xrs700x.c (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 06e3784ec55c..fa537afcab53 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -46,6 +46,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 +#define DSA_TAG_PROTO_XRS700X_VALUE 19 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -67,6 +68,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, + DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, }; struct packet_type; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index dfecd7b22fd7..2d226a5c085f 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -139,4 +139,10 @@ config NET_DSA_TAG_TRAILER Say Y or M if you want to enable support for tagging frames at with a trailed. e.g. Marvell 88E6060. +config NET_DSA_TAG_XRS700X + tristate "Tag driver for XRS700x switches" + help + Say Y or M if you want to enable support for tagging frames for + Arrow SpeedChips XRS700x switches that use a single byte tag trailer. + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 0fb2b75a7ae3..92cea2132241 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c new file mode 100644 index 000000000000..db0ed1a5fcb7 --- /dev/null +++ b/net/dsa/tag_xrs700x.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * XRS700x tag format handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2020 NovaTech LLC + */ + +#include + +#include "dsa_priv.h" + +static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *trailer; + + trailer = skb_put(skb, 1); + trailer[0] = BIT(dp->index); + + return skb; +} + +static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + int source_port; + u8 *trailer; + + trailer = skb_tail_pointer(skb) - 1; + + source_port = ffs((int)trailer[0]) - 1; + + if (source_port < 0) + return NULL; + + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) + return NULL; + + if (pskb_trim_rcsum(skb, skb->len - 1)) + return NULL; + + /* Frame is forwarded by hardware, don't forward in software. */ + skb->offload_fwd_mark = 1; + + return skb; +} + +static const struct dsa_device_ops xrs700x_netdev_ops = { + .name = "xrs700x", + .proto = DSA_TAG_PROTO_XRS700X, + .xmit = xrs700x_xmit, + .rcv = xrs700x_rcv, + .overhead = 1, + .tail_tag = true, +}; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X); + +module_dsa_tag_driver(xrs700x_netdev_ops); -- cgit v1.2.3 From f6fe01d6fa24dd3c89996ad82780872441e86bfa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jan 2021 04:11:11 +0200 Subject: net: mscc: ocelot: auto-detect packet buffer size and number of frame references Instead of reading these values from the reference manual and writing them down into the driver, it appears that the hardware gives us the option of detecting them dynamically. The number of frame references corresponds to what the reference manual notes, however it seems that the frame buffers are reported as slightly less than the books would indicate. On VSC9959 (Felix), the books say it should have 128KB of packet buffer, but the registers indicate only 129840 bytes (126.79 KB). Also, the unit of measurement for FREECNT from the documentation of all these devices is incorrect (taken from an older generation). This was confirmed by Younes Leroul from Microchip support. Not having anything better to do with these values at the moment* (this will change soon), let's just print them. *The frame buffer size is, in fact, used to calculate the tail dropping watermarks. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 1 - drivers/net/dsa/ocelot/felix.h | 1 - drivers/net/dsa/ocelot/felix_vsc9959.c | 1 - drivers/net/dsa/ocelot/seville_vsc9953.c | 1 - drivers/net/ethernet/mscc/ocelot.c | 22 +++++++++++++++++++++- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 1 - include/soc/mscc/ocelot.h | 3 ++- include/soc/mscc/ocelot_qsys.h | 3 +++ 8 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 8492151f8133..cf9317110591 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -422,7 +422,6 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) ocelot->map = felix->info->map; ocelot->stats_layout = felix->info->stats_layout; ocelot->num_stats = felix->info->num_stats; - ocelot->shared_queue_sz = felix->info->shared_queue_sz; ocelot->num_mact_rows = felix->info->num_mact_rows; ocelot->vcap = felix->info->vcap; ocelot->ops = felix->info->ops; diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 4c717324ac2f..5434fe278d2c 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -15,7 +15,6 @@ struct felix_info { const struct reg_field *regfields; const u32 *const *map; const struct ocelot_ops *ops; - int shared_queue_sz; int num_mact_rows; const struct ocelot_stat_layout *stats_layout; unsigned int num_stats; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index a87597eef8cf..b68df85e01a1 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1356,7 +1356,6 @@ static const struct felix_info felix_info_vsc9959 = { .stats_layout = vsc9959_stats_layout, .num_stats = ARRAY_SIZE(vsc9959_stats_layout), .vcap = vsc9959_vcap_props, - .shared_queue_sz = 128 * 1024, .num_mact_rows = 2048, .num_ports = 6, .num_tx_queues = FELIX_NUM_TC, diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index ebbaf6817ec8..b72813da6d9f 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1181,7 +1181,6 @@ static const struct felix_info seville_info_vsc9953 = { .stats_layout = vsc9953_stats_layout, .num_stats = ARRAY_SIZE(vsc9953_stats_layout), .vcap = vsc9953_vcap_props, - .shared_queue_sz = 256 * 1024, .num_mact_rows = 2048, .num_ports = 10, .mdio_bus_alloc = vsc9953_mdio_bus_alloc, diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index af620f7ce469..3af8e607c8a9 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1354,7 +1354,7 @@ void ocelot_port_set_maxlen(struct ocelot *ocelot, int port, size_t sdu) pause_stop); /* Tail dropping watermarks */ - atop_tot = (ocelot->shared_queue_sz - 9 * maxlen) / + atop_tot = (ocelot->packet_buffer_size - 9 * maxlen) / OCELOT_BUFFER_CELL_SZ; atop = (9 * maxlen) / OCELOT_BUFFER_CELL_SZ; ocelot_write_rix(ocelot, ocelot->ops->wm_enc(atop), SYS_ATOP, port); @@ -1467,6 +1467,25 @@ static void ocelot_cpu_port_init(struct ocelot *ocelot) ANA_PORT_VLAN_CFG, cpu); } +static void ocelot_detect_features(struct ocelot *ocelot) +{ + int mmgt, eq_ctrl; + + /* For Ocelot, Felix, Seville, Serval etc, SYS:MMGT:MMGT:FREECNT holds + * the number of 240-byte free memory words (aka 4-cell chunks) and not + * 192 bytes as the documentation incorrectly says. + */ + mmgt = ocelot_read(ocelot, SYS_MMGT); + ocelot->packet_buffer_size = 240 * SYS_MMGT_FREECNT(mmgt); + + eq_ctrl = ocelot_read(ocelot, QSYS_EQ_CTRL); + ocelot->num_frame_refs = QSYS_MMGT_EQ_CTRL_FP_FREE_CNT(eq_ctrl); + + dev_info(ocelot->dev, + "Detected %d bytes of packet buffer and %d frame references\n", + ocelot->packet_buffer_size, ocelot->num_frame_refs); +} + int ocelot_init(struct ocelot *ocelot) { char queue_name[32]; @@ -1509,6 +1528,7 @@ int ocelot_init(struct ocelot *ocelot) INIT_LIST_HEAD(&ocelot->multicast); INIT_LIST_HEAD(&ocelot->pgids); + ocelot_detect_features(ocelot); ocelot_mact_init(ocelot); ocelot_vlan_init(ocelot); ocelot_vcap_init(ocelot); diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 9cf2bc5f4289..7135ad18affe 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -517,7 +517,6 @@ static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops) ocelot->map = ocelot_regmap; ocelot->stats_layout = ocelot_stats_layout; ocelot->num_stats = ARRAY_SIZE(ocelot_stats_layout); - ocelot->shared_queue_sz = 224 * 1024; ocelot->num_mact_rows = 1024; ocelot->ops = ops; diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index f3db75c0172b..c17a372335cd 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -607,7 +607,8 @@ struct ocelot { const struct ocelot_stat_layout *stats_layout; unsigned int num_stats; - int shared_queue_sz; + int packet_buffer_size; + int num_frame_refs; int num_mact_rows; struct net_device *hw_bridge_dev; diff --git a/include/soc/mscc/ocelot_qsys.h b/include/soc/mscc/ocelot_qsys.h index a814bc2017d8..b7b263a19068 100644 --- a/include/soc/mscc/ocelot_qsys.h +++ b/include/soc/mscc/ocelot_qsys.h @@ -77,6 +77,9 @@ #define QSYS_RES_STAT_MAXUSE(x) ((x) & GENMASK(11, 0)) #define QSYS_RES_STAT_MAXUSE_M GENMASK(11, 0) +#define QSYS_MMGT_EQ_CTRL_FP_FREE_CNT(x) ((x) & GENMASK(15, 0)) +#define QSYS_MMGT_EQ_CTRL_FP_FREE_CNT_M GENMASK(15, 0) + #define QSYS_EVENTS_CORE_EV_FDC(x) (((x) << 2) & GENMASK(4, 2)) #define QSYS_EVENTS_CORE_EV_FDC_M GENMASK(4, 2) #define QSYS_EVENTS_CORE_EV_FDC_X(x) (((x) & GENMASK(4, 2)) >> 2) -- cgit v1.2.3 From 703b762190e643bf46a048ebe99504b14d71449c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jan 2021 04:11:12 +0200 Subject: net: mscc: ocelot: add ops for decoding watermark threshold and occupancy We'll need to read back the watermark thresholds and occupancy from hardware (for devlink-sb integration), not only to write them as we did so far in ocelot_port_set_maxlen. So introduce 2 new functions in struct ocelot_ops, similar to wm_enc, and implement them for the 3 supported mscc_ocelot switches. Remove the INUSE and MAXUSE unpacking helpers for the QSYS_RES_STAT register, because that doesn't scale with the number of switches that mscc_ocelot supports now. They have different bit widths for the watermarks, and we need function pointers to abstract that difference away. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 18 ++++++++++++++++++ drivers/net/dsa/ocelot/seville_vsc9953.c | 18 ++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_vsc7514.c | 16 ++++++++++++++++ include/soc/mscc/ocelot.h | 2 ++ include/soc/mscc/ocelot_qsys.h | 6 ------ 5 files changed, 54 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index b68df85e01a1..21dacb85976e 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1006,9 +1006,27 @@ static u16 vsc9959_wm_enc(u16 value) return value; } +static u16 vsc9959_wm_dec(u16 wm) +{ + WARN_ON(wm & ~GENMASK(8, 0)); + + if (wm & BIT(8)) + return (wm & GENMASK(7, 0)) * 16; + + return wm; +} + +static void vsc9959_wm_stat(u32 val, u32 *inuse, u32 *maxuse) +{ + *inuse = (val & GENMASK(23, 12)) >> 12; + *maxuse = val & GENMASK(11, 0); +} + static const struct ocelot_ops vsc9959_ops = { .reset = vsc9959_reset, .wm_enc = vsc9959_wm_enc, + .wm_dec = vsc9959_wm_dec, + .wm_stat = vsc9959_wm_stat, .port_to_netdev = felix_port_to_netdev, .netdev_to_port = felix_netdev_to_port, }; diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index b72813da6d9f..8dad0c894eca 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1057,9 +1057,27 @@ static u16 vsc9953_wm_enc(u16 value) return value; } +static u16 vsc9953_wm_dec(u16 wm) +{ + WARN_ON(wm & ~GENMASK(9, 0)); + + if (wm & BIT(9)) + return (wm & GENMASK(8, 0)) * 16; + + return wm; +} + +static void vsc9953_wm_stat(u32 val, u32 *inuse, u32 *maxuse) +{ + *inuse = (val & GENMASK(25, 13)) >> 13; + *maxuse = val & GENMASK(12, 0); +} + static const struct ocelot_ops vsc9953_ops = { .reset = vsc9953_reset, .wm_enc = vsc9953_wm_enc, + .wm_dec = vsc9953_wm_dec, + .wm_stat = vsc9953_wm_stat, .port_to_netdev = felix_port_to_netdev, .netdev_to_port = felix_netdev_to_port, }; diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 7135ad18affe..ecd474476cc6 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -763,9 +763,25 @@ static u16 ocelot_wm_enc(u16 value) return value; } +static u16 ocelot_wm_dec(u16 wm) +{ + if (wm & BIT(8)) + return (wm & GENMASK(7, 0)) * 16; + + return wm; +} + +static void ocelot_wm_stat(u32 val, u32 *inuse, u32 *maxuse) +{ + *inuse = (val & GENMASK(23, 12)) >> 12; + *maxuse = val & GENMASK(11, 0); +} + static const struct ocelot_ops ocelot_ops = { .reset = ocelot_reset, .wm_enc = ocelot_wm_enc, + .wm_dec = ocelot_wm_dec, + .wm_stat = ocelot_wm_stat, .port_to_netdev = ocelot_port_to_netdev, .netdev_to_port = ocelot_netdev_to_port, }; diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index c17a372335cd..e548b0f51d0c 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -563,6 +563,8 @@ struct ocelot_ops { int (*netdev_to_port)(struct net_device *dev); int (*reset)(struct ocelot *ocelot); u16 (*wm_enc)(u16 value); + u16 (*wm_dec)(u16 value); + void (*wm_stat)(u32 val, u32 *inuse, u32 *maxuse); }; struct ocelot_vcap_block { diff --git a/include/soc/mscc/ocelot_qsys.h b/include/soc/mscc/ocelot_qsys.h index b7b263a19068..9731895be643 100644 --- a/include/soc/mscc/ocelot_qsys.h +++ b/include/soc/mscc/ocelot_qsys.h @@ -71,12 +71,6 @@ #define QSYS_RES_STAT_GSZ 0x8 -#define QSYS_RES_STAT_INUSE(x) (((x) << 12) & GENMASK(23, 12)) -#define QSYS_RES_STAT_INUSE_M GENMASK(23, 12) -#define QSYS_RES_STAT_INUSE_X(x) (((x) & GENMASK(23, 12)) >> 12) -#define QSYS_RES_STAT_MAXUSE(x) ((x) & GENMASK(11, 0)) -#define QSYS_RES_STAT_MAXUSE_M GENMASK(11, 0) - #define QSYS_MMGT_EQ_CTRL_FP_FREE_CNT(x) ((x) & GENMASK(15, 0)) #define QSYS_MMGT_EQ_CTRL_FP_FREE_CNT_M GENMASK(15, 0) -- cgit v1.2.3 From 2a6ef763037238a5aa6a6505fc6693ee77c1a59b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jan 2021 04:11:13 +0200 Subject: net: dsa: add ops for devlink-sb Switches that care about QoS might have hardware support for reserving buffer pools for individual ports or traffic classes, and configuring their sizes and thresholds. Through devlink-sb (shared buffers), this is all configurable, as well as their occupancy being viewable. Add the plumbing in DSA for these operations. Individual drivers still need to call devlink_sb_register() with the shared buffers they want to expose. A helper was not created in DSA for this purpose (unlike, say, dsa_devlink_params_register), since in my opinion it does not bring any benefit over plainly calling devlink_sb_register() directly. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 34 ++++++++++++ net/dsa/dsa2.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 192 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index fa537afcab53..2f5435d3d1db 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -699,6 +699,40 @@ struct dsa_switch_ops { int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); + int (*devlink_sb_pool_get)(struct dsa_switch *ds, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info); + int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack); + int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold); + int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 threshold, + struct netlink_ext_ack *extack); + int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold); + int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack); + int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, + unsigned int sb_index); + int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, + unsigned int sb_index); + int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max); + int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f98eb521c2ec..cc13549120e5 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -463,8 +463,165 @@ static int dsa_devlink_info_get(struct devlink *dl, return -EOPNOTSUPP; } +static int dsa_devlink_sb_pool_get(struct devlink *dl, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index, + pool_info); +} + +static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_pool_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size, + threshold_type, extack); +} + +static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_port_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index, + pool_index, p_threshold); +} + +static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 threshold, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_port_pool_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index, + pool_index, threshold, extack); +} + +static int +dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_tc_pool_bind_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index, + tc_index, pool_type, + p_pool_index, p_threshold); +} + +static int +dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_tc_pool_bind_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index, + tc_index, pool_type, + pool_index, threshold, + extack); +} + +static int dsa_devlink_sb_occ_snapshot(struct devlink *dl, + unsigned int sb_index) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_occ_snapshot) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_snapshot(ds, sb_index); +} + +static int dsa_devlink_sb_occ_max_clear(struct devlink *dl, + unsigned int sb_index) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_occ_max_clear) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_max_clear(ds, sb_index); +} + +static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, + u16 pool_index, u32 *p_cur, + u32 *p_max) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_occ_port_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index, + pool_index, p_cur, p_max); +} + +static int +dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_occ_tc_port_bind_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port, + sb_index, tc_index, + pool_type, p_cur, + p_max); +} + static const struct devlink_ops dsa_devlink_ops = { - .info_get = dsa_devlink_info_get, + .info_get = dsa_devlink_info_get, + .sb_pool_get = dsa_devlink_sb_pool_get, + .sb_pool_set = dsa_devlink_sb_pool_set, + .sb_port_pool_get = dsa_devlink_sb_port_pool_get, + .sb_port_pool_set = dsa_devlink_sb_port_pool_set, + .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get, + .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set, + .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot, + .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear, + .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get, + .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get, }; static int dsa_switch_setup(struct dsa_switch *ds) -- cgit v1.2.3 From 70d39a6e62d31a4a7372a712ccc6f8063bbb1550 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jan 2021 04:11:16 +0200 Subject: net: mscc: ocelot: export NUM_TC constant from felix to common switch lib We should be moving anything that isn't DSA-specific or SoC-specific out of the felix DSA driver, and into the common mscc_ocelot switch library. The number of traffic classes is one of the aspects that is common between all ocelot switches, so it belongs in the library. This patch also makes seville use 8 TX queues, and therefore enables prioritization via the QOS_CLASS field in the NPI injection header. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 2 +- drivers/net/dsa/ocelot/felix.h | 1 - drivers/net/dsa/ocelot/felix_vsc9959.c | 4 ++-- drivers/net/dsa/ocelot/seville_vsc9953.c | 1 + include/soc/mscc/ocelot.h | 1 + 5 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 178be0d736bd..0c6a7de03331 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -299,7 +299,7 @@ static void felix_port_qos_map_init(struct ocelot *ocelot, int port) ANA_PORT_QOS_CFG, port); - for (i = 0; i < FELIX_NUM_TC * 2; i++) { + for (i = 0; i < OCELOT_NUM_TC * 2; i++) { ocelot_rmw_ix(ocelot, (ANA_PORT_PCP_DEI_MAP_DP_PCP_DEI_VAL & i) | ANA_PORT_PCP_DEI_MAP_QOS_PCP_DEI_VAL(i), diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 5434fe278d2c..994835cb9307 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -5,7 +5,6 @@ #define _MSCC_FELIX_H #define ocelot_to_felix(o) container_of((o), struct felix, ocelot) -#define FELIX_NUM_TC 8 /* Platform-specific information */ struct felix_info { diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 21dacb85976e..f9711e69b8d5 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1376,7 +1376,7 @@ static const struct felix_info felix_info_vsc9959 = { .vcap = vsc9959_vcap_props, .num_mact_rows = 2048, .num_ports = 6, - .num_tx_queues = FELIX_NUM_TC, + .num_tx_queues = OCELOT_NUM_TC, .switch_pci_bar = 4, .imdio_pci_bar = 0, .ptp_caps = &vsc9959_ptp_caps, @@ -1435,7 +1435,7 @@ static int felix_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, felix); ocelot = &felix->ocelot; ocelot->dev = &pdev->dev; - ocelot->num_flooding_pgids = FELIX_NUM_TC; + ocelot->num_flooding_pgids = OCELOT_NUM_TC; felix->info = &felix_info_vsc9959; felix->switch_base = pci_resource_start(pdev, felix->info->switch_pci_bar); diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 8dad0c894eca..5e9bfdea50be 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1201,6 +1201,7 @@ static const struct felix_info seville_info_vsc9953 = { .vcap = vsc9953_vcap_props, .num_mact_rows = 2048, .num_ports = 10, + .num_tx_queues = OCELOT_NUM_TC, .mdio_bus_alloc = vsc9953_mdio_bus_alloc, .mdio_bus_free = vsc9953_mdio_bus_free, .phylink_validate = vsc9953_phylink_validate, diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index e548b0f51d0c..1dc0c6d0671a 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -98,6 +98,7 @@ #define IFH_REW_OP_TWO_STEP_PTP 0x3 #define IFH_REW_OP_ORIGIN_PTP 0x5 +#define OCELOT_NUM_TC 8 #define OCELOT_TAG_LEN 16 #define OCELOT_SHORT_PREFIX_LEN 4 #define OCELOT_LONG_PREFIX_LEN 16 -- cgit v1.2.3 From 6c30384eb1dec96b678ff9c01c15134b1a0e81f4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jan 2021 04:11:18 +0200 Subject: net: mscc: ocelot: register devlink ports Add devlink integration into the mscc_ocelot switchdev driver. All physical ports (i.e. the unused ones as well) except the CPU port module at ocelot->num_phys_ports are registered with devlink, and that requires keeping the devlink_port structure outside struct ocelot_port_private, since the latter has a 1:1 mapping with a struct net_device (which does not exist for unused ports). Since we use devlink_port_type_eth_set to link the devlink port to the net_device, we can as well remove the .ndo_get_phys_port_name and .ndo_get_port_parent_id implementations, since devlink takes care of retrieving the port name and number automatically, once .ndo_get_devlink_port is implemented. Note that the felix DSA driver is already integrated with devlink by default, since that is a thing that the DSA core takes care of. This is the reason why these devlink stubs were put in ocelot_net.c and not in the common library. It is also the reason why ocelot::devlink is a pointer and not a full structure embedded inside struct ocelot: because the mscc_ocelot driver allocates that by itself (as the container of struct ocelot, in fact), but in the case of felix, it is DSA who allocates the devlink, and felix just propagates the pointer towards struct ocelot. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.h | 6 ++ drivers/net/ethernet/mscc/ocelot_net.c | 66 +++++++++------- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 118 +++++++++++++++++++++++++---- include/soc/mscc/ocelot.h | 2 + 4 files changed, 148 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index 519335676c24..e8621dbc14f7 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -121,9 +121,15 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target, struct phy_device *phy); +int ocelot_devlink_init(struct ocelot *ocelot); +void ocelot_devlink_teardown(struct ocelot *ocelot); +int ocelot_port_devlink_init(struct ocelot *ocelot, int port, + enum devlink_port_flavour flavour); +void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port); extern struct notifier_block ocelot_netdevice_nb; extern struct notifier_block ocelot_switchdev_nb; extern struct notifier_block ocelot_switchdev_blocking_nb; +extern const struct devlink_ops ocelot_devlink_ops; #endif diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 4fb9095be3ea..4485faefc2b1 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -8,6 +8,43 @@ #include "ocelot.h" #include "ocelot_vcap.h" +const struct devlink_ops ocelot_devlink_ops = { +}; + +int ocelot_port_devlink_init(struct ocelot *ocelot, int port, + enum devlink_port_flavour flavour) +{ + struct devlink_port *dlp = &ocelot->devlink_ports[port]; + int id_len = sizeof(ocelot->base_mac); + struct devlink *dl = ocelot->devlink; + struct devlink_port_attrs attrs = {}; + + memcpy(attrs.switch_id.id, &ocelot->base_mac, id_len); + attrs.switch_id.id_len = id_len; + attrs.phys.port_number = port; + attrs.flavour = flavour; + + devlink_port_attrs_set(dlp, &attrs); + + return devlink_port_register(dl, dlp, port); +} + +void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port) +{ + struct devlink_port *dlp = &ocelot->devlink_ports[port]; + + devlink_port_unregister(dlp); +} + +static struct devlink_port *ocelot_get_devlink_port(struct net_device *dev) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot *ocelot = priv->port.ocelot; + int port = priv->chip_port; + + return &ocelot->devlink_ports[port]; +} + int ocelot_setup_tc_cls_flower(struct ocelot_port_private *priv, struct flow_cls_offload *f, bool ingress) @@ -525,20 +562,6 @@ static void ocelot_set_rx_mode(struct net_device *dev) __dev_mc_sync(dev, ocelot_mc_sync, ocelot_mc_unsync); } -static int ocelot_port_get_phys_port_name(struct net_device *dev, - char *buf, size_t len) -{ - struct ocelot_port_private *priv = netdev_priv(dev); - int port = priv->chip_port; - int ret; - - ret = snprintf(buf, len, "p%d", port); - if (ret >= len) - return -EINVAL; - - return 0; -} - static int ocelot_port_set_mac_address(struct net_device *dev, void *p) { struct ocelot_port_private *priv = netdev_priv(dev); @@ -689,18 +712,6 @@ static int ocelot_set_features(struct net_device *dev, return 0; } -static int ocelot_get_port_parent_id(struct net_device *dev, - struct netdev_phys_item_id *ppid) -{ - struct ocelot_port_private *priv = netdev_priv(dev); - struct ocelot *ocelot = priv->port.ocelot; - - ppid->id_len = sizeof(ocelot->base_mac); - memcpy(&ppid->id, &ocelot->base_mac, ppid->id_len); - - return 0; -} - static int ocelot_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ocelot_port_private *priv = netdev_priv(dev); @@ -727,7 +738,6 @@ static const struct net_device_ops ocelot_port_netdev_ops = { .ndo_stop = ocelot_port_stop, .ndo_start_xmit = ocelot_port_xmit, .ndo_set_rx_mode = ocelot_set_rx_mode, - .ndo_get_phys_port_name = ocelot_port_get_phys_port_name, .ndo_set_mac_address = ocelot_port_set_mac_address, .ndo_get_stats64 = ocelot_get_stats64, .ndo_fdb_add = ocelot_port_fdb_add, @@ -736,9 +746,9 @@ static const struct net_device_ops ocelot_port_netdev_ops = { .ndo_vlan_rx_add_vid = ocelot_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ocelot_vlan_rx_kill_vid, .ndo_set_features = ocelot_set_features, - .ndo_get_port_parent_id = ocelot_get_port_parent_id, .ndo_setup_tc = ocelot_setup_tc, .ndo_do_ioctl = ocelot_ioctl, + .ndo_get_devlink_port = ocelot_get_devlink_port, }; struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port) diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index ecd474476cc6..28617023cd85 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -1051,6 +1051,14 @@ static struct ptp_clock_info ocelot_ptp_clock_info = { .enable = ocelot_ptp_enable, }; +static void mscc_ocelot_teardown_devlink_ports(struct ocelot *ocelot) +{ + int port; + + for (port = 0; port < ocelot->num_phys_ports; port++) + ocelot_port_devlink_teardown(ocelot, port); +} + static void mscc_ocelot_release_ports(struct ocelot *ocelot) { int port; @@ -1078,28 +1086,44 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, { struct ocelot *ocelot = platform_get_drvdata(pdev); struct device_node *portnp; - int err; + bool *registered_ports; + int port, err; + u32 reg; ocelot->ports = devm_kcalloc(ocelot->dev, ocelot->num_phys_ports, sizeof(struct ocelot_port *), GFP_KERNEL); if (!ocelot->ports) return -ENOMEM; + ocelot->devlink_ports = devm_kcalloc(ocelot->dev, + ocelot->num_phys_ports, + sizeof(*ocelot->devlink_ports), + GFP_KERNEL); + if (!ocelot->devlink_ports) + return -ENOMEM; + + registered_ports = kcalloc(ocelot->num_phys_ports, sizeof(bool), + GFP_KERNEL); + if (!registered_ports) + return -ENOMEM; + for_each_available_child_of_node(ports, portnp) { struct ocelot_port_private *priv; struct ocelot_port *ocelot_port; struct device_node *phy_node; + struct devlink_port *dlp; phy_interface_t phy_mode; struct phy_device *phy; struct regmap *target; struct resource *res; struct phy *serdes; char res_name[8]; - u32 port; - if (of_property_read_u32(portnp, "reg", &port)) + if (of_property_read_u32(portnp, "reg", ®)) continue; + port = reg; + snprintf(res_name, sizeof(res_name), "port%d", port); res = platform_get_resource_byname(pdev, IORESOURCE_MEM, @@ -1117,15 +1141,26 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, if (!phy) continue; + err = ocelot_port_devlink_init(ocelot, port, + DEVLINK_PORT_FLAVOUR_PHYSICAL); + if (err) { + of_node_put(portnp); + goto out_teardown; + } + err = ocelot_probe_port(ocelot, port, target, phy); if (err) { of_node_put(portnp); - return err; + goto out_teardown; } + registered_ports[port] = true; + ocelot_port = ocelot->ports[port]; priv = container_of(ocelot_port, struct ocelot_port_private, port); + dlp = &ocelot->devlink_ports[port]; + devlink_port_type_eth_set(dlp, priv->dev); of_get_phy_mode(portnp, &phy_mode); @@ -1150,7 +1185,8 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, "invalid phy mode for port%d, (Q)SGMII only\n", port); of_node_put(portnp); - return -EINVAL; + err = -EINVAL; + goto out_teardown; } serdes = devm_of_phy_get(ocelot->dev, portnp, NULL); @@ -1164,13 +1200,46 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, port); of_node_put(portnp); - return err; + goto out_teardown; } priv->serdes = serdes; } + /* Initialize unused devlink ports at the end */ + for (port = 0; port < ocelot->num_phys_ports; port++) { + if (registered_ports[port]) + continue; + + err = ocelot_port_devlink_init(ocelot, port, + DEVLINK_PORT_FLAVOUR_UNUSED); + if (err) { + while (port-- >= 0) { + if (!registered_ports[port]) + continue; + ocelot_port_devlink_teardown(ocelot, port); + } + + goto out_teardown; + } + } + + kfree(registered_ports); + return 0; + +out_teardown: + /* Unregister the network interfaces */ + mscc_ocelot_release_ports(ocelot); + /* Tear down devlink ports for the registered network interfaces */ + for (port = 0; port < ocelot->num_phys_ports; port++) { + if (!registered_ports[port]) + continue; + + ocelot_port_devlink_teardown(ocelot, port); + } + kfree(registered_ports); + return err; } static int mscc_ocelot_probe(struct platform_device *pdev) @@ -1178,6 +1247,7 @@ static int mscc_ocelot_probe(struct platform_device *pdev) struct device_node *np = pdev->dev.of_node; int err, irq_xtr, irq_ptp_rdy; struct device_node *ports; + struct devlink *devlink; struct ocelot *ocelot; struct regmap *hsio; unsigned int i; @@ -1201,10 +1271,12 @@ static int mscc_ocelot_probe(struct platform_device *pdev) if (!np && !pdev->dev.platform_data) return -ENODEV; - ocelot = devm_kzalloc(&pdev->dev, sizeof(*ocelot), GFP_KERNEL); - if (!ocelot) + devlink = devlink_alloc(&ocelot_devlink_ops, sizeof(*ocelot)); + if (!devlink) return -ENOMEM; + ocelot = devlink_priv(devlink); + ocelot->devlink = priv_to_devlink(ocelot); platform_set_drvdata(pdev, ocelot); ocelot->dev = &pdev->dev; @@ -1221,7 +1293,8 @@ static int mscc_ocelot_probe(struct platform_device *pdev) ocelot->targets[io_target[i].id] = NULL; continue; } - return PTR_ERR(target); + err = PTR_ERR(target); + goto out_free_devlink; } ocelot->targets[io_target[i].id] = target; @@ -1230,24 +1303,25 @@ static int mscc_ocelot_probe(struct platform_device *pdev) hsio = syscon_regmap_lookup_by_compatible("mscc,ocelot-hsio"); if (IS_ERR(hsio)) { dev_err(&pdev->dev, "missing hsio syscon\n"); - return PTR_ERR(hsio); + err = PTR_ERR(hsio); + goto out_free_devlink; } ocelot->targets[HSIO] = hsio; err = ocelot_chip_init(ocelot, &ocelot_ops); if (err) - return err; + goto out_free_devlink; irq_xtr = platform_get_irq_byname(pdev, "xtr"); if (irq_xtr < 0) - return -ENODEV; + goto out_free_devlink; err = devm_request_threaded_irq(&pdev->dev, irq_xtr, NULL, ocelot_xtr_irq_handler, IRQF_ONESHOT, "frame extraction", ocelot); if (err) - return err; + goto out_free_devlink; irq_ptp_rdy = platform_get_irq_byname(pdev, "ptp_rdy"); if (irq_ptp_rdy > 0 && ocelot->targets[PTP]) { @@ -1256,7 +1330,7 @@ static int mscc_ocelot_probe(struct platform_device *pdev) IRQF_ONESHOT, "ptp ready", ocelot); if (err) - return err; + goto out_free_devlink; /* Both the PTP interrupt and the PTP bank are available */ ocelot->ptp = 1; @@ -1265,7 +1339,8 @@ static int mscc_ocelot_probe(struct platform_device *pdev) ports = of_get_child_by_name(np, "ethernet-ports"); if (!ports) { dev_err(ocelot->dev, "no ethernet-ports child node found\n"); - return -ENODEV; + err = -ENODEV; + goto out_free_devlink; } ocelot->num_phys_ports = of_get_child_count(ports); @@ -1280,10 +1355,14 @@ static int mscc_ocelot_probe(struct platform_device *pdev) if (err) goto out_put_ports; - err = mscc_ocelot_init_ports(pdev, ports); + err = devlink_register(devlink, ocelot->dev); if (err) goto out_ocelot_deinit; + err = mscc_ocelot_init_ports(pdev, ports); + if (err) + goto out_ocelot_devlink_unregister; + if (ocelot->ptp) { err = ocelot_init_timestamp(ocelot, &ocelot_ptp_clock_info); if (err) { @@ -1303,10 +1382,14 @@ static int mscc_ocelot_probe(struct platform_device *pdev) return 0; +out_ocelot_devlink_unregister: + devlink_unregister(devlink); out_ocelot_deinit: ocelot_deinit(ocelot); out_put_ports: of_node_put(ports); +out_free_devlink: + devlink_free(devlink); return err; } @@ -1316,10 +1399,13 @@ static int mscc_ocelot_remove(struct platform_device *pdev) ocelot_deinit_timestamp(ocelot); mscc_ocelot_release_ports(ocelot); + mscc_ocelot_teardown_devlink_ports(ocelot); + devlink_unregister(ocelot->devlink); ocelot_deinit(ocelot); unregister_switchdev_blocking_notifier(&ocelot_switchdev_blocking_nb); unregister_switchdev_notifier(&ocelot_switchdev_nb); unregister_netdevice_notifier(&ocelot_netdevice_nb); + devlink_free(ocelot->devlink); return 0; } diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 1dc0c6d0671a..fc7dc6679739 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -602,6 +602,8 @@ struct ocelot_port { struct ocelot { struct device *dev; + struct devlink *devlink; + struct devlink_port *devlink_ports; const struct ocelot_ops *ops; struct regmap *targets[TARGET_MAX]; -- cgit v1.2.3 From f59fd9cab7305266f4148776c3b66329551a2a3a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 15 Jan 2021 04:11:20 +0200 Subject: net: mscc: ocelot: configure watermarks using devlink-sb Using devlink-sb, we can configure 12/16 (the important 75%) of the switch's controlling watermarks for congestion drops, and we can monitor 50% of the watermark occupancies (we can monitor the reservation watermarks, but not the sharing watermarks, which are exposed as pool sizes). The following definitions can be made: SB_BUF=0 # The devlink-sb for frame buffers SB_REF=1 # The devlink-sb for frame references POOL_ING=0 # The pool for ingress traffic. Both devlink-sb instances # have one of these. POOL_EGR=1 # The pool for egress traffic. Both devlink-sb instances # have one of these. Editing the hardware watermarks is done in the following way: BUF_xxxx_I is accessed when sb=$SB_BUF and pool=$POOL_ING REF_xxxx_I is accessed when sb=$SB_REF and pool=$POOL_ING BUF_xxxx_E is accessed when sb=$SB_BUF and pool=$POOL_EGR REF_xxxx_E is accessed when sb=$SB_REF and pool=$POOL_EGR Configuring the sharing watermarks for COL_SHR(dp=0) is done implicitly by modifying the corresponding pool size. By default, the pool size has maximum size, so this can be skipped. devlink sb pool set pci/0000:00:00.5 sb $SB_BUF pool $POOL_ING \ size 129840 thtype static Since by default there is no buffer reservation, the above command has maxed out BUF_COL_SHR_I(dp=0). Configuring the per-port reservation watermark (P_RSRV) is done in the following way: devlink sb port pool set pci/0000:00:00.5/0 sb $SB_BUF \ pool $POOL_ING th 1000 The above command sets BUF_P_RSRV_I(port 0) to 1000 bytes. After this command, the sharing watermarks are internally reconfigured with 1000 bytes less, i.e. from 129840 bytes to 128840 bytes. Configuring the per-port-tc reservation watermarks (Q_RSRV) is done in the following way: for tc in {0..7}; do devlink sb tc bind set pci/0000:00:00.5/0 sb 0 tc $tc \ type ingress pool $POOL_ING \ th 3000 done The above command sets BUF_Q_RSRV_I(port 0, tc 0..7) to 3000 bytes. The sharing watermarks are again reconfigured with 24000 bytes less. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 118 ++++++++ drivers/net/ethernet/mscc/ocelot.c | 5 - drivers/net/ethernet/mscc/ocelot.h | 1 - drivers/net/ethernet/mscc/ocelot_devlink.c | 453 ++++++++++++++++++++++++++++- drivers/net/ethernet/mscc/ocelot_net.c | 140 +++++++++ drivers/net/ethernet/mscc/ocelot_vsc7514.c | 8 + include/soc/mscc/ocelot.h | 47 +++ 7 files changed, 761 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 0c6a7de03331..767cbdccdb3e 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -427,6 +427,7 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) ocelot->ops = felix->info->ops; ocelot->inj_prefix = OCELOT_TAG_PREFIX_SHORT; ocelot->xtr_prefix = OCELOT_TAG_PREFIX_SHORT; + ocelot->devlink = felix->ds->devlink; port_phy_modes = kcalloc(num_phys_ports, sizeof(phy_interface_t), GFP_KERNEL); @@ -588,6 +589,10 @@ static int felix_setup(struct dsa_switch *ds) felix_port_qos_map_init(ocelot, port); } + err = ocelot_devlink_sb_register(ocelot); + if (err) + return err; + /* Include the CPU port module in the forwarding mask for unknown * unicast - the hardware default value for ANA_FLOODING_FLD_UNICAST * excludes BIT(ocelot->num_phys_ports), and so does ocelot_init, since @@ -609,6 +614,7 @@ static void felix_teardown(struct dsa_switch *ds) struct felix *felix = ocelot_to_felix(ocelot); int port; + ocelot_devlink_sb_unregister(ocelot); ocelot_deinit_timestamp(ocelot); ocelot_deinit(ocelot); @@ -750,6 +756,108 @@ static int felix_port_setup_tc(struct dsa_switch *ds, int port, return -EOPNOTSUPP; } +static int felix_sb_pool_get(struct dsa_switch *ds, unsigned int sb_index, + u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_pool_get(ocelot, sb_index, pool_index, pool_info); +} + +static int felix_sb_pool_set(struct dsa_switch *ds, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_pool_set(ocelot, sb_index, pool_index, size, + threshold_type, extack); +} + +static int felix_sb_port_pool_get(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_port_pool_get(ocelot, port, sb_index, pool_index, + p_threshold); +} + +static int felix_sb_port_pool_set(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 threshold, struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_port_pool_set(ocelot, port, sb_index, pool_index, + threshold, extack); +} + +static int felix_sb_tc_pool_bind_get(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_tc_pool_bind_get(ocelot, port, sb_index, tc_index, + pool_type, p_pool_index, + p_threshold); +} + +static int felix_sb_tc_pool_bind_set(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_tc_pool_bind_set(ocelot, port, sb_index, tc_index, + pool_type, pool_index, threshold, + extack); +} + +static int felix_sb_occ_snapshot(struct dsa_switch *ds, + unsigned int sb_index) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_occ_snapshot(ocelot, sb_index); +} + +static int felix_sb_occ_max_clear(struct dsa_switch *ds, + unsigned int sb_index) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_occ_max_clear(ocelot, sb_index); +} + +static int felix_sb_occ_port_pool_get(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_occ_port_pool_get(ocelot, port, sb_index, pool_index, + p_cur, p_max); +} + +static int felix_sb_occ_tc_port_bind_get(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_sb_occ_tc_port_bind_get(ocelot, port, sb_index, tc_index, + pool_type, p_cur, p_max); +} + const struct dsa_switch_ops felix_switch_ops = { .get_tag_protocol = felix_get_tag_protocol, .setup = felix_setup, @@ -788,6 +896,16 @@ const struct dsa_switch_ops felix_switch_ops = { .cls_flower_del = felix_cls_flower_del, .cls_flower_stats = felix_cls_flower_stats, .port_setup_tc = felix_port_setup_tc, + .devlink_sb_pool_get = felix_sb_pool_get, + .devlink_sb_pool_set = felix_sb_pool_set, + .devlink_sb_port_pool_get = felix_sb_port_pool_get, + .devlink_sb_port_pool_set = felix_sb_port_pool_set, + .devlink_sb_tc_pool_bind_get = felix_sb_tc_pool_bind_get, + .devlink_sb_tc_pool_bind_set = felix_sb_tc_pool_bind_set, + .devlink_sb_occ_snapshot = felix_sb_occ_snapshot, + .devlink_sb_occ_max_clear = felix_sb_occ_max_clear, + .devlink_sb_occ_port_pool_get = felix_sb_occ_port_pool_get, + .devlink_sb_occ_tc_port_bind_get= felix_sb_occ_tc_port_bind_get, }; struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 474ea5e59f89..a560d6be2a44 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1480,10 +1480,6 @@ static void ocelot_detect_features(struct ocelot *ocelot) eq_ctrl = ocelot_read(ocelot, QSYS_EQ_CTRL); ocelot->num_frame_refs = QSYS_MMGT_EQ_CTRL_FP_FREE_CNT(eq_ctrl); - - dev_info(ocelot->dev, - "Detected %d bytes of packet buffer and %d frame references\n", - ocelot->packet_buffer_size, ocelot->num_frame_refs); } int ocelot_init(struct ocelot *ocelot) @@ -1624,7 +1620,6 @@ int ocelot_init(struct ocelot *ocelot) INIT_DELAYED_WORK(&ocelot->stats_work, ocelot_check_stats_work); queue_delayed_work(ocelot->stats_queue, &ocelot->stats_work, OCELOT_STATS_CHECK_DELAY); - ocelot_watermark_init(ocelot); return 0; } diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index 48faa4dc893c..e8621dbc14f7 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -126,7 +126,6 @@ void ocelot_devlink_teardown(struct ocelot *ocelot); int ocelot_port_devlink_init(struct ocelot *ocelot, int port, enum devlink_port_flavour flavour); void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port); -void ocelot_watermark_init(struct ocelot *ocelot); extern struct notifier_block ocelot_netdevice_nb; extern struct notifier_block ocelot_switchdev_nb; diff --git a/drivers/net/ethernet/mscc/ocelot_devlink.c b/drivers/net/ethernet/mscc/ocelot_devlink.c index c58315b62841..edafbd37d12c 100644 --- a/drivers/net/ethernet/mscc/ocelot_devlink.c +++ b/drivers/net/ethernet/mscc/ocelot_devlink.c @@ -232,6 +232,14 @@ static void ocelot_wm_write(struct ocelot *ocelot, int index, u32 val) ocelot_write_gix(ocelot, wm, QSYS_RES_CFG, index); } +static void ocelot_wm_status(struct ocelot *ocelot, int index, u32 *inuse, + u32 *maxuse) +{ + int res_stat = ocelot_read_gix(ocelot, QSYS_RES_STAT, index); + + return ocelot->ops->wm_stat(res_stat, inuse, maxuse); +} + /* The hardware comes out of reset with strange defaults: the sum of all * reservations for frame memory is larger than the total buffer size. * One has to wonder how can the reservation watermarks still guarantee @@ -348,10 +356,14 @@ static void ocelot_setup_sharing_watermarks(struct ocelot *ocelot) ocelot_get_buf_rsrv(ocelot, &buf_rsrv_i, &buf_rsrv_e); ocelot_get_ref_rsrv(ocelot, &ref_rsrv_i, &ref_rsrv_e); - buf_shr_i = ocelot->packet_buffer_size - buf_rsrv_i; - buf_shr_e = ocelot->packet_buffer_size - buf_rsrv_e; - ref_shr_i = ocelot->num_frame_refs - ref_rsrv_i; - ref_shr_e = ocelot->num_frame_refs - ref_rsrv_e; + buf_shr_i = ocelot->pool_size[OCELOT_SB_BUF][OCELOT_SB_POOL_ING] - + buf_rsrv_i; + buf_shr_e = ocelot->pool_size[OCELOT_SB_BUF][OCELOT_SB_POOL_EGR] - + buf_rsrv_e; + ref_shr_i = ocelot->pool_size[OCELOT_SB_REF][OCELOT_SB_POOL_ING] - + ref_rsrv_i; + ref_shr_e = ocelot->pool_size[OCELOT_SB_REF][OCELOT_SB_POOL_EGR] - + ref_rsrv_e; buf_shr_i /= OCELOT_BUFFER_CELL_SZ; buf_shr_e /= OCELOT_BUFFER_CELL_SZ; @@ -366,6 +378,40 @@ static void ocelot_setup_sharing_watermarks(struct ocelot *ocelot) ocelot_wm_write(ocelot, REF_COL_SHR_I(1), 0); } +/* Ensure that all reservations can be enforced */ +static int ocelot_watermark_validate(struct ocelot *ocelot, + struct netlink_ext_ack *extack) +{ + u32 buf_rsrv_i, buf_rsrv_e; + u32 ref_rsrv_i, ref_rsrv_e; + + ocelot_get_buf_rsrv(ocelot, &buf_rsrv_i, &buf_rsrv_e); + ocelot_get_ref_rsrv(ocelot, &ref_rsrv_i, &ref_rsrv_e); + + if (buf_rsrv_i > ocelot->pool_size[OCELOT_SB_BUF][OCELOT_SB_POOL_ING]) { + NL_SET_ERR_MSG_MOD(extack, + "Ingress frame reservations exceed pool size"); + return -ERANGE; + } + if (buf_rsrv_e > ocelot->pool_size[OCELOT_SB_BUF][OCELOT_SB_POOL_EGR]) { + NL_SET_ERR_MSG_MOD(extack, + "Egress frame reservations exceed pool size"); + return -ERANGE; + } + if (ref_rsrv_i > ocelot->pool_size[OCELOT_SB_REF][OCELOT_SB_POOL_ING]) { + NL_SET_ERR_MSG_MOD(extack, + "Ingress reference reservations exceed pool size"); + return -ERANGE; + } + if (ref_rsrv_e > ocelot->pool_size[OCELOT_SB_REF][OCELOT_SB_POOL_EGR]) { + NL_SET_ERR_MSG_MOD(extack, + "Egress reference reservations exceed pool size"); + return -ERANGE; + } + + return 0; +} + /* The hardware works like this: * * Frame forwarding decision taken @@ -427,7 +473,7 @@ static void ocelot_setup_sharing_watermarks(struct ocelot *ocelot) * reservations by default, let sharing use all resources) and disables the * unused watermarks. */ -void ocelot_watermark_init(struct ocelot *ocelot) +static void ocelot_watermark_init(struct ocelot *ocelot) { int all_tcs = GENMASK(OCELOT_NUM_TC - 1, 0); int port; @@ -440,3 +486,400 @@ void ocelot_watermark_init(struct ocelot *ocelot) ocelot_disable_tc_sharing_watermarks(ocelot); ocelot_setup_sharing_watermarks(ocelot); } + +/* Pool size and type are fixed up at runtime. Keeping this structure to + * look up the cell size multipliers. + */ +static const struct devlink_sb_pool_info ocelot_sb_pool[] = { + [OCELOT_SB_BUF] = { + .cell_size = OCELOT_BUFFER_CELL_SZ, + .threshold_type = DEVLINK_SB_THRESHOLD_TYPE_STATIC, + }, + [OCELOT_SB_REF] = { + .cell_size = 1, + .threshold_type = DEVLINK_SB_THRESHOLD_TYPE_STATIC, + }, +}; + +/* Returns the pool size configured through ocelot_sb_pool_set */ +int ocelot_sb_pool_get(struct ocelot *ocelot, unsigned int sb_index, + u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + if (sb_index >= OCELOT_SB_NUM) + return -ENODEV; + if (pool_index >= OCELOT_SB_POOL_NUM) + return -ENODEV; + + *pool_info = ocelot_sb_pool[sb_index]; + pool_info->size = ocelot->pool_size[sb_index][pool_index]; + if (pool_index) + pool_info->pool_type = DEVLINK_SB_POOL_TYPE_INGRESS; + else + pool_info->pool_type = DEVLINK_SB_POOL_TYPE_EGRESS; + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_pool_get); + +/* The pool size received here configures the total amount of resources used on + * ingress (or on egress, depending upon the pool index). The pool size, minus + * the values for the port and port-tc reservations, is written into the + * COL_SHR(dp=0) sharing watermark. + */ +int ocelot_sb_pool_set(struct ocelot *ocelot, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) +{ + u32 old_pool_size; + int err; + + if (sb_index >= OCELOT_SB_NUM) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid sb, use 0 for buffers and 1 for frame references"); + return -ENODEV; + } + if (pool_index >= OCELOT_SB_POOL_NUM) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid pool, use 0 for ingress and 1 for egress"); + return -ENODEV; + } + if (threshold_type != DEVLINK_SB_THRESHOLD_TYPE_STATIC) { + NL_SET_ERR_MSG_MOD(extack, + "Only static threshold supported"); + return -EOPNOTSUPP; + } + + old_pool_size = ocelot->pool_size[sb_index][pool_index]; + ocelot->pool_size[sb_index][pool_index] = size; + + err = ocelot_watermark_validate(ocelot, extack); + if (err) { + ocelot->pool_size[sb_index][pool_index] = old_pool_size; + return err; + } + + ocelot_setup_sharing_watermarks(ocelot); + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_pool_set); + +/* This retrieves the configuration made with ocelot_sb_port_pool_set */ +int ocelot_sb_port_pool_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + int wm_index; + + switch (sb_index) { + case OCELOT_SB_BUF: + if (pool_index == OCELOT_SB_POOL_ING) + wm_index = BUF_P_RSRV_I(port); + else + wm_index = BUF_P_RSRV_E(port); + break; + case OCELOT_SB_REF: + if (pool_index == OCELOT_SB_POOL_ING) + wm_index = REF_P_RSRV_I(port); + else + wm_index = REF_P_RSRV_E(port); + break; + default: + return -ENODEV; + } + + *p_threshold = ocelot_wm_read(ocelot, wm_index); + *p_threshold *= ocelot_sb_pool[sb_index].cell_size; + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_port_pool_get); + +/* This configures the P_RSRV per-port reserved resource watermark */ +int ocelot_sb_port_pool_set(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 pool_index, + u32 threshold, struct netlink_ext_ack *extack) +{ + int wm_index, err; + u32 old_thr; + + switch (sb_index) { + case OCELOT_SB_BUF: + if (pool_index == OCELOT_SB_POOL_ING) + wm_index = BUF_P_RSRV_I(port); + else + wm_index = BUF_P_RSRV_E(port); + break; + case OCELOT_SB_REF: + if (pool_index == OCELOT_SB_POOL_ING) + wm_index = REF_P_RSRV_I(port); + else + wm_index = REF_P_RSRV_E(port); + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Invalid shared buffer"); + return -ENODEV; + } + + threshold /= ocelot_sb_pool[sb_index].cell_size; + + old_thr = ocelot_wm_read(ocelot, wm_index); + ocelot_wm_write(ocelot, wm_index, threshold); + + err = ocelot_watermark_validate(ocelot, extack); + if (err) { + ocelot_wm_write(ocelot, wm_index, old_thr); + return err; + } + + ocelot_setup_sharing_watermarks(ocelot); + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_port_pool_set); + +/* This retrieves the configuration done by ocelot_sb_tc_pool_bind_set */ +int ocelot_sb_tc_pool_bind_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + int wm_index; + + switch (sb_index) { + case OCELOT_SB_BUF: + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + wm_index = BUF_Q_RSRV_I(port, tc_index); + else + wm_index = BUF_Q_RSRV_E(port, tc_index); + break; + case OCELOT_SB_REF: + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + wm_index = REF_Q_RSRV_I(port, tc_index); + else + wm_index = REF_Q_RSRV_E(port, tc_index); + break; + default: + return -ENODEV; + } + + *p_threshold = ocelot_wm_read(ocelot, wm_index); + *p_threshold *= ocelot_sb_pool[sb_index].cell_size; + + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + *p_pool_index = 0; + else + *p_pool_index = 1; + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_tc_pool_bind_get); + +/* This configures the Q_RSRV per-port-tc reserved resource watermark */ +int ocelot_sb_tc_pool_bind_set(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) +{ + int wm_index, err; + u32 old_thr; + + /* Paranoid check? */ + if (pool_index == OCELOT_SB_POOL_ING && + pool_type != DEVLINK_SB_POOL_TYPE_INGRESS) + return -EINVAL; + if (pool_index == OCELOT_SB_POOL_EGR && + pool_type != DEVLINK_SB_POOL_TYPE_EGRESS) + return -EINVAL; + + switch (sb_index) { + case OCELOT_SB_BUF: + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + wm_index = BUF_Q_RSRV_I(port, tc_index); + else + wm_index = BUF_Q_RSRV_E(port, tc_index); + break; + case OCELOT_SB_REF: + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + wm_index = REF_Q_RSRV_I(port, tc_index); + else + wm_index = REF_Q_RSRV_E(port, tc_index); + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Invalid shared buffer"); + return -ENODEV; + } + + threshold /= ocelot_sb_pool[sb_index].cell_size; + + old_thr = ocelot_wm_read(ocelot, wm_index); + ocelot_wm_write(ocelot, wm_index, threshold); + err = ocelot_watermark_validate(ocelot, extack); + if (err) { + ocelot_wm_write(ocelot, wm_index, old_thr); + return err; + } + + ocelot_setup_sharing_watermarks(ocelot); + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_tc_pool_bind_set); + +/* The hardware does not support atomic snapshots, we'll read out the + * occupancy registers individually and have this as just a stub. + */ +int ocelot_sb_occ_snapshot(struct ocelot *ocelot, unsigned int sb_index) +{ + return 0; +} +EXPORT_SYMBOL(ocelot_sb_occ_snapshot); + +/* The watermark occupancy registers are cleared upon read, + * so let's read them. + */ +int ocelot_sb_occ_max_clear(struct ocelot *ocelot, unsigned int sb_index) +{ + u32 inuse, maxuse; + int port, prio; + + switch (sb_index) { + case OCELOT_SB_BUF: + for (port = 0; port <= ocelot->num_phys_ports; port++) { + for (prio = 0; prio < OCELOT_NUM_TC; prio++) { + ocelot_wm_status(ocelot, BUF_Q_RSRV_I(port, prio), + &inuse, &maxuse); + ocelot_wm_status(ocelot, BUF_Q_RSRV_E(port, prio), + &inuse, &maxuse); + } + ocelot_wm_status(ocelot, BUF_P_RSRV_I(port), + &inuse, &maxuse); + ocelot_wm_status(ocelot, BUF_P_RSRV_E(port), + &inuse, &maxuse); + } + break; + case OCELOT_SB_REF: + for (port = 0; port <= ocelot->num_phys_ports; port++) { + for (prio = 0; prio < OCELOT_NUM_TC; prio++) { + ocelot_wm_status(ocelot, REF_Q_RSRV_I(port, prio), + &inuse, &maxuse); + ocelot_wm_status(ocelot, REF_Q_RSRV_E(port, prio), + &inuse, &maxuse); + } + ocelot_wm_status(ocelot, REF_P_RSRV_I(port), + &inuse, &maxuse); + ocelot_wm_status(ocelot, REF_P_RSRV_E(port), + &inuse, &maxuse); + } + break; + default: + return -ENODEV; + } + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_occ_max_clear); + +/* This retrieves the watermark occupancy for per-port P_RSRV watermarks */ +int ocelot_sb_occ_port_pool_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max) +{ + int wm_index; + + switch (sb_index) { + case OCELOT_SB_BUF: + if (pool_index == OCELOT_SB_POOL_ING) + wm_index = BUF_P_RSRV_I(port); + else + wm_index = BUF_P_RSRV_E(port); + break; + case OCELOT_SB_REF: + if (pool_index == OCELOT_SB_POOL_ING) + wm_index = REF_P_RSRV_I(port); + else + wm_index = REF_P_RSRV_E(port); + break; + default: + return -ENODEV; + } + + ocelot_wm_status(ocelot, wm_index, p_cur, p_max); + *p_cur *= ocelot_sb_pool[sb_index].cell_size; + *p_max *= ocelot_sb_pool[sb_index].cell_size; + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_occ_port_pool_get); + +/* This retrieves the watermark occupancy for per-port-tc Q_RSRV watermarks */ +int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + int wm_index; + + switch (sb_index) { + case OCELOT_SB_BUF: + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + wm_index = BUF_Q_RSRV_I(port, tc_index); + else + wm_index = BUF_Q_RSRV_E(port, tc_index); + break; + case OCELOT_SB_REF: + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS) + wm_index = REF_Q_RSRV_I(port, tc_index); + else + wm_index = REF_Q_RSRV_E(port, tc_index); + break; + default: + return -ENODEV; + } + + ocelot_wm_status(ocelot, wm_index, p_cur, p_max); + *p_cur *= ocelot_sb_pool[sb_index].cell_size; + *p_max *= ocelot_sb_pool[sb_index].cell_size; + + return 0; +} +EXPORT_SYMBOL(ocelot_sb_occ_tc_port_bind_get); + +int ocelot_devlink_sb_register(struct ocelot *ocelot) +{ + int err; + + err = devlink_sb_register(ocelot->devlink, OCELOT_SB_BUF, + ocelot->packet_buffer_size, 1, 1, + OCELOT_NUM_TC, OCELOT_NUM_TC); + if (err) + return err; + + err = devlink_sb_register(ocelot->devlink, OCELOT_SB_REF, + ocelot->num_frame_refs, 1, 1, + OCELOT_NUM_TC, OCELOT_NUM_TC); + if (err) { + devlink_sb_unregister(ocelot->devlink, OCELOT_SB_BUF); + return err; + } + + ocelot->pool_size[OCELOT_SB_BUF][OCELOT_SB_POOL_ING] = ocelot->packet_buffer_size; + ocelot->pool_size[OCELOT_SB_BUF][OCELOT_SB_POOL_EGR] = ocelot->packet_buffer_size; + ocelot->pool_size[OCELOT_SB_REF][OCELOT_SB_POOL_ING] = ocelot->num_frame_refs; + ocelot->pool_size[OCELOT_SB_REF][OCELOT_SB_POOL_EGR] = ocelot->num_frame_refs; + + ocelot_watermark_init(ocelot); + + return 0; +} +EXPORT_SYMBOL(ocelot_devlink_sb_register); + +void ocelot_devlink_sb_unregister(struct ocelot *ocelot) +{ + devlink_sb_unregister(ocelot->devlink, OCELOT_SB_BUF); + devlink_sb_unregister(ocelot->devlink, OCELOT_SB_REF); +} +EXPORT_SYMBOL(ocelot_devlink_sb_unregister); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 4485faefc2b1..a520fd485912 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1,14 +1,154 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Microsemi Ocelot Switch driver + * + * This contains glue logic between the switchdev driver operations and the + * mscc_ocelot_switch_lib. * * Copyright (c) 2017, 2019 Microsemi Corporation + * Copyright 2020-2021 NXP Semiconductors */ #include #include "ocelot.h" #include "ocelot_vcap.h" +static struct ocelot *devlink_port_to_ocelot(struct devlink_port *dlp) +{ + return devlink_priv(dlp->devlink); +} + +static int devlink_port_to_port(struct devlink_port *dlp) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + + return dlp - ocelot->devlink_ports; +} + +static int ocelot_devlink_sb_pool_get(struct devlink *dl, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct ocelot *ocelot = devlink_priv(dl); + + return ocelot_sb_pool_get(ocelot, sb_index, pool_index, pool_info); +} + +static int ocelot_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = devlink_priv(dl); + + return ocelot_sb_pool_set(ocelot, sb_index, pool_index, size, + threshold_type, extack); +} + +static int ocelot_devlink_sb_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + int port = devlink_port_to_port(dlp); + + return ocelot_sb_port_pool_get(ocelot, port, sb_index, pool_index, + p_threshold); +} + +static int ocelot_devlink_sb_port_pool_set(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 threshold, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + int port = devlink_port_to_port(dlp); + + return ocelot_sb_port_pool_set(ocelot, port, sb_index, pool_index, + threshold, extack); +} + +static int +ocelot_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + int port = devlink_port_to_port(dlp); + + return ocelot_sb_tc_pool_bind_get(ocelot, port, sb_index, tc_index, + pool_type, p_pool_index, + p_threshold); +} + +static int +ocelot_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + int port = devlink_port_to_port(dlp); + + return ocelot_sb_tc_pool_bind_set(ocelot, port, sb_index, tc_index, + pool_type, pool_index, threshold, + extack); +} + +static int ocelot_devlink_sb_occ_snapshot(struct devlink *dl, + unsigned int sb_index) +{ + struct ocelot *ocelot = devlink_priv(dl); + + return ocelot_sb_occ_snapshot(ocelot, sb_index); +} + +static int ocelot_devlink_sb_occ_max_clear(struct devlink *dl, + unsigned int sb_index) +{ + struct ocelot *ocelot = devlink_priv(dl); + + return ocelot_sb_occ_max_clear(ocelot, sb_index); +} + +static int ocelot_devlink_sb_occ_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, + u16 pool_index, u32 *p_cur, + u32 *p_max) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + int port = devlink_port_to_port(dlp); + + return ocelot_sb_occ_port_pool_get(ocelot, port, sb_index, pool_index, + p_cur, p_max); +} + +static int +ocelot_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct ocelot *ocelot = devlink_port_to_ocelot(dlp); + int port = devlink_port_to_port(dlp); + + return ocelot_sb_occ_tc_port_bind_get(ocelot, port, sb_index, + tc_index, pool_type, + p_cur, p_max); +} + const struct devlink_ops ocelot_devlink_ops = { + .sb_pool_get = ocelot_devlink_sb_pool_get, + .sb_pool_set = ocelot_devlink_sb_pool_set, + .sb_port_pool_get = ocelot_devlink_sb_port_pool_get, + .sb_port_pool_set = ocelot_devlink_sb_port_pool_set, + .sb_tc_pool_bind_get = ocelot_devlink_sb_tc_pool_bind_get, + .sb_tc_pool_bind_set = ocelot_devlink_sb_tc_pool_bind_set, + .sb_occ_snapshot = ocelot_devlink_sb_occ_snapshot, + .sb_occ_max_clear = ocelot_devlink_sb_occ_max_clear, + .sb_occ_port_pool_get = ocelot_devlink_sb_occ_port_pool_get, + .sb_occ_tc_port_bind_get = ocelot_devlink_sb_occ_tc_port_bind_get, }; int ocelot_port_devlink_init(struct ocelot *ocelot, int port, diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 28617023cd85..30a38df08a21 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -1363,6 +1363,10 @@ static int mscc_ocelot_probe(struct platform_device *pdev) if (err) goto out_ocelot_devlink_unregister; + err = ocelot_devlink_sb_register(ocelot); + if (err) + goto out_ocelot_release_ports; + if (ocelot->ptp) { err = ocelot_init_timestamp(ocelot, &ocelot_ptp_clock_info); if (err) { @@ -1382,6 +1386,9 @@ static int mscc_ocelot_probe(struct platform_device *pdev) return 0; +out_ocelot_release_ports: + mscc_ocelot_release_ports(ocelot); + mscc_ocelot_teardown_devlink_ports(ocelot); out_ocelot_devlink_unregister: devlink_unregister(devlink); out_ocelot_deinit: @@ -1398,6 +1405,7 @@ static int mscc_ocelot_remove(struct platform_device *pdev) struct ocelot *ocelot = platform_get_drvdata(pdev); ocelot_deinit_timestamp(ocelot); + ocelot_devlink_sb_unregister(ocelot); mscc_ocelot_release_ports(ocelot); mscc_ocelot_teardown_devlink_ports(ocelot); devlink_unregister(ocelot->devlink); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index fc7dc6679739..cdc33fa05660 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -579,6 +579,18 @@ struct ocelot_vlan { u16 vid; }; +enum ocelot_sb { + OCELOT_SB_BUF, + OCELOT_SB_REF, + OCELOT_SB_NUM, +}; + +enum ocelot_sb_pool { + OCELOT_SB_POOL_ING, + OCELOT_SB_POOL_EGR, + OCELOT_SB_POOL_NUM, +}; + struct ocelot_port { struct ocelot *ocelot; @@ -612,6 +624,7 @@ struct ocelot { const struct ocelot_stat_layout *stats_layout; unsigned int num_stats; + u32 pool_size[OCELOT_SB_NUM][OCELOT_SB_POOL_NUM]; int packet_buffer_size; int num_frame_refs; int num_mact_rows; @@ -783,4 +796,38 @@ int ocelot_port_mdb_add(struct ocelot *ocelot, int port, int ocelot_port_mdb_del(struct ocelot *ocelot, int port, const struct switchdev_obj_port_mdb *mdb); +int ocelot_devlink_sb_register(struct ocelot *ocelot); +void ocelot_devlink_sb_unregister(struct ocelot *ocelot); +int ocelot_sb_pool_get(struct ocelot *ocelot, unsigned int sb_index, + u16 pool_index, + struct devlink_sb_pool_info *pool_info); +int ocelot_sb_pool_set(struct ocelot *ocelot, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack); +int ocelot_sb_port_pool_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold); +int ocelot_sb_port_pool_set(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 pool_index, + u32 threshold, struct netlink_ext_ack *extack); +int ocelot_sb_tc_pool_bind_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold); +int ocelot_sb_tc_pool_bind_set(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack); +int ocelot_sb_occ_snapshot(struct ocelot *ocelot, unsigned int sb_index); +int ocelot_sb_occ_max_clear(struct ocelot *ocelot, unsigned int sb_index); +int ocelot_sb_occ_port_pool_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max); +int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max); + #endif -- cgit v1.2.3 From 9ab7e76aefc97a9aa664accb59d6e8dc5e52514a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sat, 9 Jan 2021 23:00:21 -0800 Subject: GTP: add support for flow based tunneling API Following patch add support for flow based tunneling API to send and recv GTP tunnel packet over tunnel metadata API. This would allow this device integration with OVS or eBPF using flow based tunneling APIs. Signed-off-by: Pravin B Shelar Link: https://lore.kernel.org/r/20210110070021.26822-1-pbshelar@fb.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 527 +++++++++++++++++++++++++++---------- include/uapi/linux/gtp.h | 12 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/if_tunnel.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 5 files changed, 398 insertions(+), 144 deletions(-) (limited to 'include') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4c04e271f184..851364314ecc 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -73,6 +74,9 @@ struct gtp_dev { unsigned int hash_size; struct hlist_head *tid_hash; struct hlist_head *addr_hash; + /* Used by LWT tunnel. */ + bool collect_md; + struct socket *collect_md_sock; }; static unsigned int gtp_net_id __read_mostly; @@ -179,33 +183,121 @@ static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx, return false; } -static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, - unsigned int hdrlen, unsigned int role) +static int gtp_set_tun_dst(struct gtp_dev *gtp, struct sk_buff *skb, + unsigned int hdrlen, u8 gtp_version, + __be64 tid, u8 flags) { - if (!gtp_check_ms(skb, pctx, hdrlen, role)) { - netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); - return 1; + struct metadata_dst *tun_dst; + int opts_len = 0; + + if (unlikely(flags & GTP1_F_MASK)) + opts_len = sizeof(struct gtpu_metadata); + + tun_dst = udp_tun_rx_dst(skb, gtp->sk1u->sk_family, TUNNEL_KEY, tid, opts_len); + if (!tun_dst) { + netdev_dbg(gtp->dev, "Failed to allocate tun_dst"); + goto err; } + netdev_dbg(gtp->dev, "attaching metadata_dst to skb, gtp ver %d hdrlen %d\n", + gtp_version, hdrlen); + if (unlikely(opts_len)) { + struct gtpu_metadata *opts; + struct gtp1_header *gtp1; + + opts = ip_tunnel_info_opts(&tun_dst->u.tun_info); + gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); + opts->ver = GTP_METADATA_V1; + opts->flags = gtp1->flags; + opts->type = gtp1->type; + netdev_dbg(gtp->dev, "recved control pkt: flag %x type: %d\n", + opts->flags, opts->type); + tun_dst->u.tun_info.key.tun_flags |= TUNNEL_GTPU_OPT; + tun_dst->u.tun_info.options_len = opts_len; + skb->protocol = htons(0xffff); /* Unknown */ + } /* Get rid of the GTP + UDP headers. */ if (iptunnel_pull_header(skb, hdrlen, skb->protocol, - !net_eq(sock_net(pctx->sk), dev_net(pctx->dev)))) - return -1; + !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)))) { + gtp->dev->stats.rx_length_errors++; + goto err; + } + + skb_dst_set(skb, &tun_dst->dst); + return 0; +err: + return -1; +} + +static int gtp_rx(struct gtp_dev *gtp, struct sk_buff *skb, + unsigned int hdrlen, u8 gtp_version, unsigned int role, + __be64 tid, u8 flags, u8 type) +{ + if (ip_tunnel_collect_metadata() || gtp->collect_md) { + int err; + + err = gtp_set_tun_dst(gtp, skb, hdrlen, gtp_version, tid, flags); + if (err) + goto err; + } else { + struct pdp_ctx *pctx; + + if (flags & GTP1_F_MASK) + hdrlen += 4; - netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n"); + if (type != GTP_TPDU) + return 1; + + if (gtp_version == GTP_V0) + pctx = gtp0_pdp_find(gtp, be64_to_cpu(tid)); + else + pctx = gtp1_pdp_find(gtp, be64_to_cpu(tid)); + if (!pctx) { + netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); + return 1; + } + + if (!gtp_check_ms(skb, pctx, hdrlen, role)) { + netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); + return 1; + } + /* Get rid of the GTP + UDP headers. */ + if (iptunnel_pull_header(skb, hdrlen, skb->protocol, + !net_eq(sock_net(pctx->sk), dev_net(gtp->dev)))) { + gtp->dev->stats.rx_length_errors++; + goto err; + } + } + netdev_dbg(gtp->dev, "forwarding packet from GGSN to uplink\n"); /* Now that the UDP and the GTP header have been removed, set up the * new network header. This is required by the upper layer to * calculate the transport header. */ skb_reset_network_header(skb); + if (pskb_may_pull(skb, sizeof(struct iphdr))) { + struct iphdr *iph; + + iph = ip_hdr(skb); + if (iph->version == 4) { + netdev_dbg(gtp->dev, "inner pkt: ipv4"); + skb->protocol = htons(ETH_P_IP); + } else if (iph->version == 6) { + netdev_dbg(gtp->dev, "inner pkt: ipv6"); + skb->protocol = htons(ETH_P_IPV6); + } else { + netdev_dbg(gtp->dev, "inner pkt error: Unknown type"); + } + } - skb->dev = pctx->dev; - - dev_sw_netstats_rx_add(pctx->dev, skb->len); - + skb->dev = gtp->dev; + dev_sw_netstats_rx_add(gtp->dev, skb->len); netif_rx(skb); return 0; + +err: + gtp->dev->stats.rx_dropped++; + return -1; } /* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */ @@ -214,7 +306,6 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp0_header); struct gtp0_header *gtp0; - struct pdp_ctx *pctx; if (!pskb_may_pull(skb, hdrlen)) return -1; @@ -224,16 +315,7 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) if ((gtp0->flags >> 5) != GTP_V0) return 1; - if (gtp0->type != GTP_TPDU) - return 1; - - pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid)); - if (!pctx) { - netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); - return 1; - } - - return gtp_rx(pctx, skb, hdrlen, gtp->role); + return gtp_rx(gtp, skb, hdrlen, GTP_V0, gtp->role, gtp0->tid, gtp0->flags, gtp0->type); } static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) @@ -241,41 +323,30 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp1_header); struct gtp1_header *gtp1; - struct pdp_ctx *pctx; if (!pskb_may_pull(skb, hdrlen)) return -1; gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); + netdev_dbg(gtp->dev, "GTPv1 recv: flags %x\n", gtp1->flags); if ((gtp1->flags >> 5) != GTP_V1) return 1; - if (gtp1->type != GTP_TPDU) - return 1; - /* From 29.060: "This field shall be present if and only if any one or * more of the S, PN and E flags are set.". * * If any of the bit is set, then the remaining ones also have to be * set. */ - if (gtp1->flags & GTP1_F_MASK) - hdrlen += 4; - /* Make sure the header is larger enough, including extensions. */ if (!pskb_may_pull(skb, hdrlen)) return -1; gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); - pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid)); - if (!pctx) { - netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); - return 1; - } - - return gtp_rx(pctx, skb, hdrlen, gtp->role); + return gtp_rx(gtp, skb, hdrlen, GTP_V1, gtp->role, + key32_to_tunnel_id(gtp1->tid), gtp1->flags, gtp1->type); } static void __gtp_encap_destroy(struct sock *sk) @@ -315,6 +386,11 @@ static void gtp_encap_disable(struct gtp_dev *gtp) { gtp_encap_disable_sock(gtp->sk0); gtp_encap_disable_sock(gtp->sk1u); + if (gtp->collect_md_sock) { + udp_tunnel_sock_release(gtp->collect_md_sock); + gtp->collect_md_sock = NULL; + netdev_dbg(gtp->dev, "GTP socket released.\n"); + } } /* UDP encapsulation receive handler. See net/ipv4/udp.c. @@ -329,7 +405,8 @@ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!gtp) return 1; - netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk); + netdev_dbg(gtp->dev, "encap_recv sk=%p type %d\n", + sk, udp_sk(sk)->encap_type); switch (udp_sk(sk)->encap_type) { case UDP_ENCAP_GTP0: @@ -383,12 +460,13 @@ static void gtp_dev_uninit(struct net_device *dev) static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, const struct sock *sk, - __be32 daddr) + __be32 daddr, + __be32 saddr) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; - fl4->saddr = inet_sk(sk)->inet_saddr; + fl4->saddr = saddr; fl4->flowi4_tos = RT_CONN_FLAGS(sk); fl4->flowi4_proto = sk->sk_protocol; @@ -412,7 +490,7 @@ static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) gtp0->tid = cpu_to_be64(pctx->u.v0.tid); } -static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) +static inline void gtp1_push_header(struct sk_buff *skb, __be32 tid) { int payload_len = skb->len; struct gtp1_header *gtp1; @@ -428,46 +506,63 @@ static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) gtp1->flags = 0x30; /* v1, GTP-non-prime. */ gtp1->type = GTP_TPDU; gtp1->length = htons(payload_len); - gtp1->tid = htonl(pctx->u.v1.o_tei); + gtp1->tid = tid; /* TODO: Suppport for extension header, sequence number and N-PDU. * Update the length field if any of them is available. */ } -struct gtp_pktinfo { - struct sock *sk; - struct iphdr *iph; - struct flowi4 fl4; - struct rtable *rt; - struct pdp_ctx *pctx; - struct net_device *dev; - __be16 gtph_port; -}; - -static void gtp_push_header(struct sk_buff *skb, struct gtp_pktinfo *pktinfo) +static inline int gtp1_push_control_header(struct sk_buff *skb, + __be32 tid, + struct gtpu_metadata *opts, + struct net_device *dev) { - switch (pktinfo->pctx->gtp_version) { - case GTP_V0: - pktinfo->gtph_port = htons(GTP0_PORT); - gtp0_push_header(skb, pktinfo->pctx); - break; - case GTP_V1: - pktinfo->gtph_port = htons(GTP1U_PORT); - gtp1_push_header(skb, pktinfo->pctx); - break; + struct gtp1_header *gtp1c; + int payload_len; + + if (opts->ver != GTP_METADATA_V1) + return -ENOENT; + + if (opts->type == 0xFE) { + /* for end marker ignore skb data. */ + netdev_dbg(dev, "xmit pkt with null data"); + pskb_trim(skb, 0); } + if (skb_cow_head(skb, sizeof(*gtp1c)) < 0) + return -ENOMEM; + + payload_len = skb->len; + + gtp1c = skb_push(skb, sizeof(*gtp1c)); + + gtp1c->flags = opts->flags; + gtp1c->type = opts->type; + gtp1c->length = htons(payload_len); + gtp1c->tid = tid; + netdev_dbg(dev, "xmit control pkt: ver %d flags %x type %x pkt len %d tid %x", + opts->ver, opts->flags, opts->type, skb->len, tid); + return 0; } +struct gtp_pktinfo { + struct sock *sk; + __u8 tos; + struct flowi4 fl4; + struct rtable *rt; + struct net_device *dev; + __be16 gtph_port; +}; + static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo, - struct sock *sk, struct iphdr *iph, - struct pdp_ctx *pctx, struct rtable *rt, + struct sock *sk, + __u8 tos, + struct rtable *rt, struct flowi4 *fl4, struct net_device *dev) { pktinfo->sk = sk; - pktinfo->iph = iph; - pktinfo->pctx = pctx; + pktinfo->tos = tos; pktinfo->rt = rt; pktinfo->fl4 = *fl4; pktinfo->dev = dev; @@ -477,40 +572,99 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo) { struct gtp_dev *gtp = netdev_priv(dev); + struct gtpu_metadata *opts = NULL; + struct sock *sk = NULL; struct pdp_ctx *pctx; struct rtable *rt; struct flowi4 fl4; - struct iphdr *iph; - __be16 df; + u8 gtp_version; + __be16 df = 0; + __be32 tun_id; + __be32 daddr; + __be32 saddr; + __u8 tos; int mtu; - /* Read the IP destination address and resolve the PDP context. - * Prepend PDP header with TEI/TID from PDP ctx. - */ - iph = ip_hdr(skb); - if (gtp->role == GTP_ROLE_SGSN) - pctx = ipv4_pdp_find(gtp, iph->saddr); - else - pctx = ipv4_pdp_find(gtp, iph->daddr); + if (gtp->collect_md) { + /* LWT GTP1U encap */ + struct ip_tunnel_info *info = NULL; - if (!pctx) { - netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", - &iph->daddr); - return -ENOENT; + info = skb_tunnel_info(skb); + if (!info) { + netdev_dbg(dev, "missing tunnel info"); + return -ENOENT; + } + if (info->key.tp_dst && ntohs(info->key.tp_dst) != GTP1U_PORT) { + netdev_dbg(dev, "unexpected GTP dst port: %d", ntohs(info->key.tp_dst)); + return -EOPNOTSUPP; + } + pctx = NULL; + gtp_version = GTP_V1; + tun_id = tunnel_id_to_key32(info->key.tun_id); + daddr = info->key.u.ipv4.dst; + saddr = info->key.u.ipv4.src; + sk = gtp->sk1u; + if (!sk) { + netdev_dbg(dev, "missing tunnel sock"); + return -EOPNOTSUPP; + } + tos = info->key.tos; + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + + if (info->options_len != 0) { + if (info->key.tun_flags & TUNNEL_GTPU_OPT) { + opts = ip_tunnel_info_opts(info); + } else { + netdev_dbg(dev, "missing tunnel metadata for control pkt"); + return -EOPNOTSUPP; + } + } + netdev_dbg(dev, "flow-based GTP1U encap: tunnel id %d\n", + be32_to_cpu(tun_id)); + } else { + struct iphdr *iph; + + if (ntohs(skb->protocol) != ETH_P_IP) + return -EOPNOTSUPP; + + iph = ip_hdr(skb); + + /* Read the IP destination address and resolve the PDP context. + * Prepend PDP header with TEI/TID from PDP ctx. + */ + if (gtp->role == GTP_ROLE_SGSN) + pctx = ipv4_pdp_find(gtp, iph->saddr); + else + pctx = ipv4_pdp_find(gtp, iph->daddr); + + if (!pctx) { + netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", + &iph->daddr); + return -ENOENT; + } + sk = pctx->sk; + netdev_dbg(dev, "found PDP context %p\n", pctx); + + gtp_version = pctx->gtp_version; + tun_id = htonl(pctx->u.v1.o_tei); + daddr = pctx->peer_addr_ip4.s_addr; + saddr = inet_sk(sk)->inet_saddr; + tos = iph->tos; + df = iph->frag_off; + netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n", + &iph->saddr, &iph->daddr); } - netdev_dbg(dev, "found PDP context %p\n", pctx); - rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer_addr_ip4.s_addr); + rt = ip4_route_output_gtp(&fl4, sk, daddr, saddr); if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to SSGN %pI4\n", - &pctx->peer_addr_ip4.s_addr); + netdev_dbg(dev, "no route to SSGN %pI4\n", &daddr); dev->stats.tx_carrier_errors++; goto err; } if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to SSGN %pI4\n", - &pctx->peer_addr_ip4.s_addr); + netdev_dbg(dev, "circular route to SSGN %pI4\n", &daddr); dev->stats.collisions++; goto err_rt; } @@ -518,11 +672,10 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* This is similar to tnl_update_pmtu(). */ - df = iph->frag_off; if (df) { mtu = dst_mtu(&rt->dst) - dev->hard_header_len - sizeof(struct iphdr) - sizeof(struct udphdr); - switch (pctx->gtp_version) { + switch (gtp_version) { case GTP_V0: mtu -= sizeof(struct gtp0_header); break; @@ -536,17 +689,38 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, false); - if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && - mtu < ntohs(iph->tot_len)) { - netdev_dbg(dev, "packet too big, fragmentation needed\n"); + if (!skb_is_gso(skb) && (df & htons(IP_DF)) && mtu < skb->len) { + netdev_dbg(dev, "packet too big, fragmentation needed"); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto err_rt; } - gtp_set_pktinfo_ipv4(pktinfo, pctx->sk, iph, pctx, rt, &fl4, dev); - gtp_push_header(skb, pktinfo); + gtp_set_pktinfo_ipv4(pktinfo, sk, tos, rt, &fl4, dev); + + if (unlikely(opts)) { + int err; + + pktinfo->gtph_port = htons(GTP1U_PORT); + err = gtp1_push_control_header(skb, tun_id, opts, dev); + if (err) { + netdev_info(dev, "cntr pkt error %d", err); + goto err_rt; + } + return 0; + } + + switch (gtp_version) { + case GTP_V0: + pktinfo->gtph_port = htons(GTP0_PORT); + gtp0_push_header(skb, pctx); + break; + case GTP_V1: + pktinfo->gtph_port = htons(GTP1U_PORT); + gtp1_push_header(skb, tun_id); + break; + } return 0; err_rt: @@ -557,7 +731,6 @@ err: static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) { - unsigned int proto = ntohs(skb->protocol); struct gtp_pktinfo pktinfo; int err; @@ -569,32 +742,22 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) /* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */ rcu_read_lock(); - switch (proto) { - case ETH_P_IP: - err = gtp_build_skb_ip4(skb, dev, &pktinfo); - break; - default: - err = -EOPNOTSUPP; - break; - } + err = gtp_build_skb_ip4(skb, dev, &pktinfo); rcu_read_unlock(); if (err < 0) goto tx_err; - switch (proto) { - case ETH_P_IP: - netdev_dbg(pktinfo.dev, "gtp -> IP src: %pI4 dst: %pI4\n", - &pktinfo.iph->saddr, &pktinfo.iph->daddr); - udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb, - pktinfo.fl4.saddr, pktinfo.fl4.daddr, - pktinfo.iph->tos, - ip4_dst_hoplimit(&pktinfo.rt->dst), - 0, - pktinfo.gtph_port, pktinfo.gtph_port, - true, false); - break; - } + udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb, + pktinfo.fl4.saddr, + pktinfo.fl4.daddr, + pktinfo.tos, + ip4_dst_hoplimit(&pktinfo.rt->dst), + 0, + pktinfo.gtph_port, + pktinfo.gtph_port, + true, + false); return NETDEV_TX_OK; tx_err: @@ -610,6 +773,19 @@ static const struct net_device_ops gtp_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, }; +static struct gtp_dev *gtp_find_flow_based_dev(struct net *net) +{ + struct gtp_net *gn = net_generic(net, gtp_net_id); + struct gtp_dev *gtp; + + list_for_each_entry(gtp, &gn->gtp_dev_list, list) { + if (gtp->collect_md) + return gtp; + } + + return NULL; +} + static void gtp_link_setup(struct net_device *dev) { dev->netdev_ops = >p_netdev_ops; @@ -634,7 +810,7 @@ static void gtp_link_setup(struct net_device *dev) } static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize); -static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]); +static int gtp_encap_enable(struct gtp_dev *gtp, struct net_device *dev, struct nlattr *data[]); static void gtp_destructor(struct net_device *dev) { @@ -652,11 +828,24 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, struct gtp_net *gn; int hashsize, err; - if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1]) + if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1] && + !data[IFLA_GTP_COLLECT_METADATA]) return -EINVAL; gtp = netdev_priv(dev); + if (data[IFLA_GTP_COLLECT_METADATA]) { + if (data[IFLA_GTP_FD0]) { + netdev_dbg(dev, "LWT device does not support setting v0 socket"); + return -EINVAL; + } + if (gtp_find_flow_based_dev(src_net)) { + netdev_dbg(dev, "LWT device already exist"); + return -EBUSY; + } + gtp->collect_md = true; + } + if (!data[IFLA_GTP_PDP_HASHSIZE]) { hashsize = 1024; } else { @@ -669,7 +858,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - err = gtp_encap_enable(gtp, data); + err = gtp_encap_enable(gtp, dev, data); if (err < 0) goto out_hashtable; @@ -683,7 +872,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, list_add_rcu(>p->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; - netdev_dbg(dev, "registered new GTP interface\n"); + netdev_dbg(dev, "registered new GTP interface %s\n", dev->name); return 0; @@ -714,6 +903,7 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_FD1] = { .type = NLA_U32 }, [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, [IFLA_GTP_ROLE] = { .type = NLA_U32 }, + [IFLA_GTP_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], @@ -737,6 +927,9 @@ static int gtp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_GTP_PDP_HASHSIZE, gtp->hash_size)) goto nla_put_failure; + if (gtp->collect_md && nla_put_flag(skb, IFLA_GTP_COLLECT_METADATA)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -782,35 +975,24 @@ err1: return -ENOMEM; } -static struct sock *gtp_encap_enable_socket(int fd, int type, - struct gtp_dev *gtp) +static int __gtp_encap_enable_socket(struct socket *sock, int type, + struct gtp_dev *gtp) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; - struct socket *sock; struct sock *sk; - int err; - - pr_debug("enable gtp on %d, %d\n", fd, type); - - sock = sockfd_lookup(fd, &err); - if (!sock) { - pr_debug("gtp socket fd=%d not found\n", fd); - return NULL; - } sk = sock->sk; if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { - pr_debug("socket fd=%d not UDP\n", fd); - sk = ERR_PTR(-EINVAL); - goto out_sock; + pr_debug("socket not UDP\n"); + return -EINVAL; } lock_sock(sk); if (sk->sk_user_data) { - sk = ERR_PTR(-EBUSY); - goto out_rel_sock; + release_sock(sock->sk); + return -EBUSY; } sock_hold(sk); @@ -821,15 +1003,58 @@ static struct sock *gtp_encap_enable_socket(int fd, int type, tuncfg.encap_destroy = gtp_encap_destroy; setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg); - -out_rel_sock: release_sock(sock->sk); -out_sock: + return 0; +} + +static struct sock *gtp_encap_enable_socket(int fd, int type, + struct gtp_dev *gtp) +{ + struct socket *sock; + int err; + + pr_debug("enable gtp on %d, %d\n", fd, type); + + sock = sockfd_lookup(fd, &err); + if (!sock) { + pr_debug("gtp socket fd=%d not found\n", fd); + return NULL; + } + err = __gtp_encap_enable_socket(sock, type, gtp); sockfd_put(sock); - return sk; + if (err) + return ERR_PTR(err); + + return sock->sk; } -static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) +static struct socket *gtp_create_gtp_socket(struct gtp_dev *gtp, struct net_device *dev) +{ + struct udp_port_cfg udp_conf; + struct socket *sock; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + udp_conf.local_udp_port = htons(GTP1U_PORT); + + err = udp_sock_create(dev_net(dev), &udp_conf, &sock); + if (err < 0) { + pr_debug("create gtp sock failed: %d\n", err); + return ERR_PTR(err); + } + err = __gtp_encap_enable_socket(sock, UDP_ENCAP_GTP1U, gtp); + if (err) { + pr_debug("enable gtp sock encap failed: %d\n", err); + udp_tunnel_sock_release(sock); + return ERR_PTR(err); + } + pr_debug("create gtp sock done\n"); + return sock; +} + +static int gtp_encap_enable(struct gtp_dev *gtp, struct net_device *dev, struct nlattr *data[]) { struct sock *sk1u = NULL; struct sock *sk0 = NULL; @@ -853,11 +1078,25 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) } } + if (data[IFLA_GTP_COLLECT_METADATA]) { + struct socket *sock; + + if (!sk1u) { + sock = gtp_create_gtp_socket(gtp, dev); + if (IS_ERR(sock)) + return PTR_ERR(sock); + + gtp->collect_md_sock = sock; + sk1u = sock->sk; + } else { + gtp->collect_md_sock = NULL; + } + } + if (data[IFLA_GTP_ROLE]) { role = nla_get_u32(data[IFLA_GTP_ROLE]); if (role > GTP_ROLE_SGSN) { - gtp_encap_disable_sock(sk0); - gtp_encap_disable_sock(sk1u); + gtp_encap_disable(gtp); return -EINVAL; } } @@ -1416,7 +1655,7 @@ static int __init gtp_init(void) if (err < 0) goto unreg_genl_family; - pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", + pr_info("GTP module loaded (pdp ctx size %zd bytes) with tnl-md support\n", sizeof(struct pdp_ctx)); return 0; diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 79f9191bbb24..62aff78b7c56 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ +#include + #define GTP_GENL_MCGRP_NAME "gtp" enum gtp_genl_cmds { @@ -34,4 +36,14 @@ enum gtp_attrs { }; #define GTPA_MAX (__GTPA_MAX + 1) +enum { + GTP_METADATA_V1 +}; + +struct gtpu_metadata { + __u8 ver; + __u8 flags; + __u8 type; +}; + #endif /* _UAPI_LINUX_GTP_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 82708c6db432..2bd0d8bbcdb2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -809,6 +809,7 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7d9105533c7b..802da679fab1 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,6 +176,7 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) +#define TUNNEL_GTPU_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index d208b2af697f..28d649bda686 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -617,6 +617,7 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.2.3 From d349f997686887906b1183b5be96933c5452362a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 16 Jan 2021 16:56:57 -0800 Subject: net_sched: fix RTNL deadlock again caused by request_module() tcf_action_init_1() loads tc action modules automatically with request_module() after parsing the tc action names, and it drops RTNL lock and re-holds it before and after request_module(). This causes a lot of troubles, as discovered by syzbot, because we can be in the middle of batch initializations when we create an array of tc actions. One of the problem is deadlock: CPU 0 CPU 1 rtnl_lock(); for (...) { tcf_action_init_1(); -> rtnl_unlock(); -> request_module(); rtnl_lock(); for (...) { tcf_action_init_1(); -> tcf_idr_check_alloc(); // Insert one action into idr, // but it is not committed until // tcf_idr_insert_many(), then drop // the RTNL lock in the _next_ // iteration -> rtnl_unlock(); -> rtnl_lock(); -> a_o->init(); -> tcf_idr_check_alloc(); // Now waiting for the same index // to be committed -> request_module(); -> rtnl_lock() // Now waiting for RTNL lock } rtnl_unlock(); } rtnl_unlock(); This is not easy to solve, we can move the request_module() before this loop and pre-load all the modules we need for this netlink message and then do the rest initializations. So the loop breaks down to two now: for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; a_o = tc_action_load_ops(name, tb[i]...); ops[i - 1] = a_o; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(ops[i - 1]...); } Although this looks serious, it only has been reported by syzbot, so it seems hard to trigger this by humans. And given the size of this patch, I'd suggest to make it to net-next and not to backport to stable. This patch has been tested by syzbot and tested with tdc.py by me. Fixes: 0fedc63fadf0 ("net_sched: commit action insertions together") Reported-and-tested-by: syzbot+82752bc5331601cf4899@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+b3b63b6bff456bd95294@syzkaller.appspotmail.com Reported-by: syzbot+ba67b12b1ca729912834@syzkaller.appspotmail.com Cc: Jiri Pirko Signed-off-by: Cong Wang Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20210117005657.14810-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 5 ++- net/sched/act_api.c | 104 ++++++++++++++++++++++++++++++++------------------ net/sched/cls_api.c | 11 +++++- 3 files changed, 79 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 55dab604861f..761c0e331915 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -186,10 +186,13 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack); +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - bool rtnl_held, + struct tc_action_ops *ops, bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2e85b636b27b..4dd235ce9a07 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -928,19 +928,13 @@ static void tcf_idr_insert_many(struct tc_action *actions[]) } } -struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, - struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, - bool rtnl_held, - struct netlink_ext_ack *extack) +struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, + bool rtnl_held, + struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; - u8 hw_stats = TCA_ACT_HW_STATS_ANY; - struct tc_action *a; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; - struct tc_cookie *cookie = NULL; char act_name[IFNAMSIZ]; - struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err; @@ -948,33 +942,21 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) - goto err_out; + return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); - goto err_out; + return ERR_PTR(err); } if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); - goto err_out; - } - if (tb[TCA_ACT_COOKIE]) { - cookie = nla_memdup_cookie(tb); - if (!cookie) { - NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); - err = -ENOMEM; - goto err_out; - } + return ERR_PTR(err); } - hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); - if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); - err = -EINVAL; - goto err_out; + return ERR_PTR(-EINVAL); } } @@ -996,24 +978,56 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, * indicate this using -EAGAIN. */ if (a_o != NULL) { - err = -EAGAIN; - goto err_mod; + module_put(a_o->owner); + return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); - err = -ENOENT; - goto err_free; + return ERR_PTR(-ENOENT); } + return a_o; +} + +struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, + struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, + struct tc_action_ops *a_o, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 flags = { 0, 0 }; + u8 hw_stats = TCA_ACT_HW_STATS_ANY; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct tc_cookie *cookie = NULL; + struct tc_action *a; + int err; + /* backward compatibility for policer */ - if (name == NULL) + if (name == NULL) { + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, + tcf_action_policy, extack); + if (err < 0) + return ERR_PTR(err); + if (tb[TCA_ACT_COOKIE]) { + cookie = nla_memdup_cookie(tb); + if (!cookie) { + NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); + err = -ENOMEM; + goto err_out; + } + } + hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); + if (tb[TCA_ACT_FLAGS]) + flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); - else + } else { err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, tp, flags.value, extack); + } if (err < 0) - goto err_mod; + goto err_out; if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); @@ -1030,14 +1044,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, return a; -err_mod: - module_put(a_o->owner); -err_free: +err_out: if (cookie) { kfree(cookie->data); kfree(cookie); } -err_out: return ERR_PTR(err); } @@ -1048,6 +1059,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { + struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t sz = 0; @@ -1059,9 +1071,20 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, if (err < 0) return err; + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + if (IS_ERR(a_o)) { + err = PTR_ERR(a_o); + goto err_mod; + } + ops[i - 1] = a_o; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - rtnl_held, extack); + ops[i - 1], rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1081,6 +1104,11 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, err: tcf_action_destroy(actions, bind); +err_mod: + for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { + if (ops[i]) + module_put(ops[i]->owner); + } return err; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 37b77bd30974..a67c66a512a4 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3043,12 +3043,19 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, size_t attr_size = 0; if (exts->police && tb[exts->police]) { + struct tc_action_ops *a_o; + + a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + if (IS_ERR(a_o)) + return PTR_ERR(a_o); act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, rtnl_held, + TCA_ACT_BIND, a_o, rtnl_held, extack); - if (IS_ERR(act)) + if (IS_ERR(act)) { + module_put(a_o->owner); return PTR_ERR(act); + } act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; -- cgit v1.2.3 From 719a402cf60311b1cdff3f6320abaecdcc5e46b7 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 17 Jan 2021 16:59:42 +0200 Subject: net: netdevice: Add operation ndo_sk_get_lower_dev ndo_sk_get_lower_dev returns the lower netdev that corresponds to a given socket. Additionally, we implement a helper netdev_sk_get_lowest_dev() to get the lowest one in chain. Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5b949076ed23..02dcef4d66e2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1398,6 +1398,8 @@ struct net_device_ops { struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, struct sk_buff *skb, bool all_slaves); + struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev, + struct sock *sk); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2858,6 +2860,8 @@ int init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves); +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index bae35c1ae192..6b90520a01b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8105,6 +8105,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev, } EXPORT_SYMBOL(netdev_get_xmit_slave); +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. + */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From 007feb87fb15933b5de7135e6bdf57c219b3fbec Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 17 Jan 2021 16:59:44 +0200 Subject: net/bonding: Implement ndo_sk_get_lower_dev Add ndo_sk_get_lower_dev() implementation for bond interfaces. Support only for the cases where the socket's and SKBs' hash yields identical value for the whole connection lifetime. Here we restrict it to L3+4 sockets only, with xmit_hash_policy==LAYER34 and bond modes xor/802.3ad. Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 93 +++++++++++++++++++++++++++++++++++++++++ include/net/bonding.h | 2 + 2 files changed, 95 insertions(+) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 759ad22b7279..09524f99c753 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -301,6 +301,19 @@ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, return dev_queue_xmit(skb); } +bool bond_sk_check(struct bonding *bond) +{ + switch (BOND_MODE(bond)) { + case BOND_MODE_8023AD: + case BOND_MODE_XOR: + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34) + return true; + fallthrough; + default: + return false; + } +} + /*---------------------------------- VLAN -----------------------------------*/ /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid, @@ -4555,6 +4568,85 @@ static struct net_device *bond_xmit_get_slave(struct net_device *master_dev, return NULL; } +static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow) +{ + switch (sk->sk_family) { +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + if (sk->sk_ipv6only || + ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) { + flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + flow->addrs.v6addrs.src = inet6_sk(sk)->saddr; + flow->addrs.v6addrs.dst = sk->sk_v6_daddr; + break; + } + fallthrough; +#endif + default: /* AF_INET */ + flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr; + flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr; + break; + } + + flow->ports.src = inet_sk(sk)->inet_sport; + flow->ports.dst = inet_sk(sk)->inet_dport; +} + +/** + * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields + * @sk: socket to use for headers + * + * This function will extract the necessary field from the socket and use + * them to generate a hash based on the LAYER34 xmit_policy. + * Assumes that sk is a TCP or UDP socket. + */ +static u32 bond_sk_hash_l34(struct sock *sk) +{ + struct flow_keys flow; + u32 hash; + + bond_sk_to_flow(sk, &flow); + + /* L4 */ + memcpy(&hash, &flow.ports.ports, sizeof(hash)); + /* L3 */ + return bond_ip_hash(hash, &flow); +} + +static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond, + struct sock *sk) +{ + struct bond_up_slave *slaves; + struct slave *slave; + unsigned int count; + u32 hash; + + slaves = rcu_dereference(bond->usable_slaves); + count = slaves ? READ_ONCE(slaves->count) : 0; + if (unlikely(!count)) + return NULL; + + hash = bond_sk_hash_l34(sk); + slave = slaves->arr[hash % count]; + + return slave->dev; +} + +static struct net_device *bond_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + struct bonding *bond = netdev_priv(dev); + struct net_device *lower = NULL; + + rcu_read_lock(); + if (bond_sk_check(bond)) + lower = __bond_sk_get_lower_dev(bond, sk); + rcu_read_unlock(); + + return lower; +} + static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); @@ -4691,6 +4783,7 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_fix_features = bond_fix_features, .ndo_features_check = passthru_features_check, .ndo_get_xmit_slave = bond_xmit_get_slave, + .ndo_sk_get_lower_dev = bond_sk_get_lower_dev, }; static const struct device_type bond_type = { diff --git a/include/net/bonding.h b/include/net/bonding.h index adc3da776970..21497193c4a4 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -265,6 +265,8 @@ struct bond_vlan_tag { unsigned short vlan_id; }; +bool bond_sk_check(struct bonding *bond); + /** * Returns NULL if the net_device does not belong to any of the bond's slaves * -- cgit v1.2.3 From 89df6a8104706f94800ed527ad73d07465ea4d12 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 17 Jan 2021 16:59:46 +0200 Subject: net/bonding: Implement TLS TX device offload Implement TLS TX device offload for bonding interfaces. This allows kTLS sockets running on a bond to benefit from the device offload on capable lower devices. To allow a simple and fast maintenance of the TLS context in SW and lower devices, we bind the TLS socket to a specific lower dev. To achieve a behavior similar to SW kTLS, we support only balance-xor and 802.3ad modes, with xmit_hash_policy=layer3+4. This is enforced in bond_sk_check(), done in a previous patch. For the above configuration, the SW implementation keeps picking the same exact lower dev for all the socket's SKBs. The device offload behaves similarly, making the decision once at the connection creation. Per socket, the TLS module should work directly with the lowest netdev in chain, to call the tls_dev_ops operations. As the bond interface is being bypassed by the TLS module, interacting directly against the lower devs, there is no way for the bond interface to disable its device offload capabilities, as long as the mode/policy config allows it. Hence, the feature flag is not directly controllable, but just reflects the current offload status based on the logic under bond_sk_check(). Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 29 +++++++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 27 +++++++++++++++++++++++++-- include/net/bonding.h | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 09524f99c753..539c6bc218df 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -83,6 +83,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_TLS_DEVICE) +#include +#endif #include "bonding_priv.h" @@ -1225,6 +1228,13 @@ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t mask; struct slave *slave; +#if IS_ENABLED(CONFIG_TLS_DEVICE) + if (bond_sk_check(bond)) + features |= BOND_TLS_FEATURES; + else + features &= ~BOND_TLS_FEATURES; +#endif + mask = features; features &= ~NETIF_F_ONE_FOR_ALL; @@ -4647,6 +4657,16 @@ static struct net_device *bond_sk_get_lower_dev(struct net_device *dev, return lower; } +#if IS_ENABLED(CONFIG_TLS_DEVICE) +static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb, + struct net_device *dev) +{ + if (likely(bond_get_slave_by_dev(bond, tls_get_ctx(skb->sk)->netdev))) + return bond_dev_queue_xmit(bond, skb, tls_get_ctx(skb->sk)->netdev); + return bond_tx_drop(dev, skb); +} +#endif + static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); @@ -4655,6 +4675,11 @@ static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev !bond_slave_override(bond, skb)) return NETDEV_TX_OK; +#if IS_ENABLED(CONFIG_TLS_DEVICE) + if (skb->sk && tls_is_sk_tx_device_offloaded(skb->sk)) + return bond_tls_device_xmit(bond, skb, dev); +#endif + switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return bond_xmit_roundrobin(skb, dev); @@ -4855,6 +4880,10 @@ void bond_setup(struct net_device *bond_dev) if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_dev->features |= BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ +#if IS_ENABLED(CONFIG_TLS_DEVICE) + if (bond_sk_check(bond)) + bond_dev->features |= BOND_TLS_FEATURES; +#endif } /* Destroy a bonding device. diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 7f0ad97926de..8fcbf7f9c7b2 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -758,6 +758,19 @@ static bool bond_set_xfrm_features(struct bonding *bond) return true; } +static bool bond_set_tls_features(struct bonding *bond) +{ + if (!IS_ENABLED(CONFIG_TLS_DEVICE)) + return false; + + if (bond_sk_check(bond)) + bond->dev->wanted_features |= BOND_TLS_FEATURES; + else + bond->dev->wanted_features &= ~BOND_TLS_FEATURES; + + return true; +} + static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { @@ -784,9 +797,15 @@ static int bond_option_mode_set(struct bonding *bond, bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; bond->params.mode = newval->value; - if (bond->dev->reg_state == NETREG_REGISTERED) - if (bond_set_xfrm_features(bond)) + if (bond->dev->reg_state == NETREG_REGISTERED) { + bool update = false; + + update |= bond_set_xfrm_features(bond); + update |= bond_set_tls_features(bond); + + if (update) netdev_update_features(bond->dev); + } return 0; } @@ -1220,6 +1239,10 @@ static int bond_option_xmit_hash_policy_set(struct bonding *bond, newval->string, newval->value); bond->params.xmit_policy = newval->value; + if (bond->dev->reg_state == NETREG_REGISTERED) + if (bond_set_tls_features(bond)) + netdev_update_features(bond->dev); + return 0; } diff --git a/include/net/bonding.h b/include/net/bonding.h index 21497193c4a4..97fbec02df2d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -89,6 +89,8 @@ #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) +#define BOND_TLS_FEATURES (NETIF_F_HW_TLS_TX) + #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; -- cgit v1.2.3 From dc5809f9e2b674a489723bd8d0131c97e565ca8d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 17 Jan 2021 16:59:47 +0200 Subject: net/bonding: Declare TLS RX device offload support Following the description in previous patch (for TX): As the bond interface is being bypassed by the TLS module, interacting directly against the lower devs, there is no way for the bond interface to disable its device offload capabilities, as long as the mode/policy config allows it. Hence, the feature flag is not directly controllable, but just reflects the offload status based on the logic under bond_sk_check(). Here we just declare RX device offload support, and expose it via the NETIF_F_HW_TLS_RX flag. Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Signed-off-by: Jakub Kicinski --- include/net/bonding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 97fbec02df2d..019e998d944a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -89,7 +89,7 @@ #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) -#define BOND_TLS_FEATURES (NETIF_F_HW_TLS_TX) +#define BOND_TLS_FEATURES (NETIF_F_HW_TLS_TX | NETIF_F_HW_TLS_RX) #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; -- cgit v1.2.3 From ab0da5a57188b80d16d3222236c4e184953a5bb4 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 30 Dec 2020 15:01:20 +0200 Subject: net/mlx5: Expose ifc bits for query modify header Expose ifc bits for query_modify_header_context_in to be used by DEVX. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 442c0160caab..cf692fc17f41 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5904,6 +5904,18 @@ struct mlx5_ifc_dealloc_modify_header_context_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_query_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0xa0]; +}; + struct mlx5_ifc_query_dct_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; -- cgit v1.2.3 From 7eab14de73a8028f770e703962c5437a2b0dda82 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 16 Jan 2021 16:13:22 +0000 Subject: mdio, phy: fix -Wshadow warnings triggered by nested container_of() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit container_of() macro hides a local variable '__mptr' inside. This becomes a problem when several container_of() are nested in each other within single line or plain macros. As C preprocessor doesn't support generating random variable names, the sole solution is to avoid defining macros that consist only of container_of() calls, or they will self-shadow '__mptr' each time: In file included from ./include/linux/bitmap.h:10, from drivers/net/phy/phy_device.c:12: drivers/net/phy/phy_device.c: In function ‘phy_device_release’: ./include/linux/kernel.h:693:8: warning: declaration of ‘__mptr’ shadows a previous local [-Wshadow] 693 | void *__mptr = (void *)(ptr); \ | ^~~~~~ ./include/linux/phy.h:647:26: note: in expansion of macro ‘container_of’ 647 | #define to_phy_device(d) container_of(to_mdio_device(d), \ | ^~~~~~~~~~~~ ./include/linux/mdio.h:52:27: note: in expansion of macro ‘container_of’ 52 | #define to_mdio_device(d) container_of(d, struct mdio_device, dev) | ^~~~~~~~~~~~ ./include/linux/phy.h:647:39: note: in expansion of macro ‘to_mdio_device’ 647 | #define to_phy_device(d) container_of(to_mdio_device(d), \ | ^~~~~~~~~~~~~~ drivers/net/phy/phy_device.c:217:8: note: in expansion of macro ‘to_phy_device’ 217 | kfree(to_phy_device(dev)); | ^~~~~~~~~~~~~ ./include/linux/kernel.h:693:8: note: shadowed declaration is here 693 | void *__mptr = (void *)(ptr); \ | ^~~~~~ ./include/linux/phy.h:647:26: note: in expansion of macro ‘container_of’ 647 | #define to_phy_device(d) container_of(to_mdio_device(d), \ | ^~~~~~~~~~~~ drivers/net/phy/phy_device.c:217:8: note: in expansion of macro ‘to_phy_device’ 217 | kfree(to_phy_device(dev)); | ^~~~~~~~~~~~~ As they are declared in header files, these warnings are highly repetitive and very annoying (along with the one from linux/pci.h). Convert the related macros from linux/{mdio,phy}.h to static inlines to avoid self-shadowing and potentially improve bug-catching. No functional changes implied. Signed-off-by: Alexander Lobakin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210116161246.67075-1-alobakin@pm.me Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 23 ++++++++++++++++++----- include/linux/phy.h | 7 +++++-- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index dbd69b3d170b..ffb787d5ebde 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -49,7 +49,11 @@ struct mdio_device { unsigned int reset_assert_delay; unsigned int reset_deassert_delay; }; -#define to_mdio_device(d) container_of(d, struct mdio_device, dev) + +static inline struct mdio_device *to_mdio_device(const struct device *dev) +{ + return container_of(dev, struct mdio_device, dev); +} /* struct mdio_driver_common: Common to all MDIO drivers */ struct mdio_driver_common { @@ -57,8 +61,12 @@ struct mdio_driver_common { int flags; }; #define MDIO_DEVICE_FLAG_PHY 1 -#define to_mdio_common_driver(d) \ - container_of(d, struct mdio_driver_common, driver) + +static inline struct mdio_driver_common * +to_mdio_common_driver(const struct device_driver *driver) +{ + return container_of(driver, struct mdio_driver_common, driver); +} /* struct mdio_driver: Generic MDIO driver */ struct mdio_driver { @@ -73,8 +81,13 @@ struct mdio_driver { /* Clears up any memory if needed */ void (*remove)(struct mdio_device *mdiodev); }; -#define to_mdio_driver(d) \ - container_of(to_mdio_common_driver(d), struct mdio_driver, mdiodrv) + +static inline struct mdio_driver * +to_mdio_driver(const struct device_driver *driver) +{ + return container_of(to_mdio_common_driver(driver), struct mdio_driver, + mdiodrv); +} /* device driver data */ static inline void mdiodev_set_drvdata(struct mdio_device *mdio, void *data) diff --git a/include/linux/phy.h b/include/linux/phy.h index 24fcc6456a9e..bc323fbdd21e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -648,8 +648,11 @@ struct phy_device { const struct macsec_ops *macsec_ops; #endif }; -#define to_phy_device(d) container_of(to_mdio_device(d), \ - struct phy_device, mdio) + +static inline struct phy_device *to_phy_device(const struct device *dev) +{ + return container_of(to_mdio_device(dev), struct phy_device, mdio); +} /** * struct phy_tdr_config - Configuration of a TDR raw test -- cgit v1.2.3 From fa82117010430aff2ce86400f7328f55a31b48a6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 16 Jan 2021 14:13:37 +0800 Subject: net: add inline function skb_csum_is_sctp This patch is to define a inline function skb_csum_is_sctp(), and also replace all places where it checks if it's a SCTP CSUM skb. This function would be used later in many networking drivers in the following patches. Suggested-by: Alexander Duyck Signed-off-by: Xin Long Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 2 +- include/linux/skbuff.h | 5 +++++ net/core/dev.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index ac4cd5d82e69..162a1ff1e9d2 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -979,7 +979,7 @@ static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb) stats->vlan_inserted++; } - if (skb->csum_not_inet) + if (skb_csum_is_sctp(skb)) stats->crc32_csum++; else stats->csum++; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9568cf60c2a..46f901adf1a8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4621,6 +4621,11 @@ static inline void skb_reset_redirect(struct sk_buff *skb) #endif } +static inline bool skb_csum_is_sctp(struct sk_buff *skb) +{ + return skb->csum_not_inet; +} + static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { diff --git a/net/core/dev.c b/net/core/dev.c index 6b90520a01b1..0332f2e8f7da 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3617,7 +3617,7 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { - if (unlikely(skb->csum_not_inet)) + if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); -- cgit v1.2.3 From 7b8fc0103bb51d1d3e1fb5fd67958612e709f883 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 18 Jan 2021 20:09:27 -0500 Subject: bonding: add a vlan+srcmac tx hashing option This comes from an end-user request, where they're running multiple VMs on hosts with bonded interfaces connected to some interest switch topologies, where 802.3ad isn't an option. They're currently running a proprietary solution that effectively achieves load-balancing of VMs and bandwidth utilization improvements with a similar form of transmission algorithm. Basically, each VM has it's own vlan, so it always sends its traffic out the same interface, unless that interface fails. Traffic gets split between the interfaces, maintaining a consistent path, with failover still available if an interface goes down. Unlike bond_eth_hash(), this hash function is using the full source MAC address instead of just the last byte, as there are so few components to the hash, and in the no-vlan case, we would be returning just the last byte of the source MAC as the hash value. It's entirely possible to have two NICs in a bond with the same last byte of their MAC, but not the same MAC, so this adjustment should guarantee distinct hashes in all cases. This has been rudimetarily tested to provide similar results to the proprietary solution it is aiming to replace. A patch for iproute2 is also posted, to properly support the new mode there as well. Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Cc: Thomas Davis Signed-off-by: Jarod Wilson Link: https://lore.kernel.org/r/20210119010927.1191922-1-jarod@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/bonding.rst | 13 +++++++++++++ drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++++++++++++-- drivers/net/bonding/bond_options.c | 13 +++++++------ include/linux/netdevice.h | 1 + include/uapi/linux/if_bonding.h | 1 + 5 files changed, 54 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index adc314639085..5f690f0ad0e4 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -951,6 +951,19 @@ xmit_hash_policy packets will be distributed according to the encapsulated flows. + vlan+srcmac + + This policy uses a very rudimentary vlan ID and source mac + hash to load-balance traffic per-vlan, with failover + should one leg fail. The intended use case is for a bond + shared by multiple virtual machines, all configured to + use their own vlan, to give lacp-like functionality + without requiring lacp-capable switching hardware. + + The formula for the hash is simply + + hash = (vlan ID) XOR (source MAC vendor) XOR (source MAC dev) + The default value is layer2. This option was added in bonding version 2.6.3. In earlier versions of bonding, this parameter does not exist, and the layer2 policy is the only policy. The diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 539c6bc218df..74cbbb22470b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -167,7 +167,7 @@ module_param(xmit_hash_policy, charp, 0); MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; " "0 for layer 2 (default), 1 for layer 3+4, " "2 for layer 2+3, 3 for encap layer 2+3, " - "4 for encap layer 3+4"); + "4 for encap layer 3+4, 5 for vlan+srcmac"); module_param(arp_interval, int, 0); MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds"); module_param_array(arp_ip_target, charp, NULL, 0); @@ -1457,6 +1457,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond, return NETDEV_LAG_HASH_E23; case BOND_XMIT_POLICY_ENCAP34: return NETDEV_LAG_HASH_E34; + case BOND_XMIT_POLICY_VLAN_SRCMAC: + return NETDEV_LAG_HASH_VLAN_SRCMAC; default: return NETDEV_LAG_HASH_UNKNOWN; } @@ -3519,6 +3521,27 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, return true; } +static u32 bond_vlan_srcmac_hash(struct sk_buff *skb) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + u32 srcmac_vendor = 0, srcmac_dev = 0; + u16 vlan; + int i; + + for (i = 0; i < 3; i++) + srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i]; + + for (i = 3; i < ETH_ALEN; i++) + srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i]; + + if (!skb_vlan_tag_present(skb)) + return srcmac_vendor ^ srcmac_dev; + + vlan = skb_vlan_tag_get(skb); + + return vlan ^ srcmac_vendor ^ srcmac_dev; +} + /* Extract the appropriate headers based on bond's xmit policy */ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, struct flow_keys *fk) @@ -3526,10 +3549,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34; int noff, proto = -1; - if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) { + switch (bond->params.xmit_policy) { + case BOND_XMIT_POLICY_ENCAP23: + case BOND_XMIT_POLICY_ENCAP34: memset(fk, 0, sizeof(*fk)); return __skb_flow_dissect(NULL, skb, &flow_keys_bonding, fk, NULL, 0, 0, 0, 0); + default: + break; } fk->ports.ports = 0; @@ -3591,6 +3618,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) skb->l4_hash) return skb->hash; + if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC) + return bond_vlan_srcmac_hash(skb); + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 || !bond_flow_dissect(bond, skb, &flow)) return bond_eth_hash(skb); diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 8fcbf7f9c7b2..77d7c38bd435 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -96,12 +96,13 @@ static const struct bond_opt_value bond_pps_tbl[] = { }; static const struct bond_opt_value bond_xmit_hashtype_tbl[] = { - { "layer2", BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT}, - { "layer3+4", BOND_XMIT_POLICY_LAYER34, 0}, - { "layer2+3", BOND_XMIT_POLICY_LAYER23, 0}, - { "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0}, - { "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0}, - { NULL, -1, 0}, + { "layer2", BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT}, + { "layer3+4", BOND_XMIT_POLICY_LAYER34, 0}, + { "layer2+3", BOND_XMIT_POLICY_LAYER23, 0}, + { "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0}, + { "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0}, + { "vlan+srcmac", BOND_XMIT_POLICY_VLAN_SRCMAC, 0}, + { NULL, -1, 0}, }; static const struct bond_opt_value bond_arp_validate_tbl[] = { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02dcef4d66e2..ef517254367d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2617,6 +2617,7 @@ enum netdev_lag_hash { NETDEV_LAG_HASH_L23, NETDEV_LAG_HASH_E23, NETDEV_LAG_HASH_E34, + NETDEV_LAG_HASH_VLAN_SRCMAC, NETDEV_LAG_HASH_UNKNOWN, }; diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index 45f3750aa861..e8eb4ad03cf1 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -94,6 +94,7 @@ #define BOND_XMIT_POLICY_LAYER23 2 /* layer 2+3 (IP ^ MAC) */ #define BOND_XMIT_POLICY_ENCAP23 3 /* encapsulated layer 2+3 */ #define BOND_XMIT_POLICY_ENCAP34 4 /* encapsulated layer 3+4 */ +#define BOND_XMIT_POLICY_VLAN_SRCMAC 5 /* vlan + source MAC */ /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */ #define LACP_STATE_LACP_ACTIVITY 0x1 -- cgit v1.2.3 From 97a0e1ea7b41c2db762c1258632f6ccc22719510 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 12 Jan 2021 19:26:12 +0100 Subject: net, xdp: Introduce __xdp_build_skb_from_frame utility routine Introduce __xdp_build_skb_from_frame utility routine to build the skb from xdp_frame. Rely on __xdp_build_skb_from_frame in cpumap code. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/4f9f4c6b3dd3933770c617eb6689dbc0c6e25863.1610475660.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 3 +++ kernel/bpf/cpumap.c | 46 ++-------------------------------------------- net/core/xdp.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 0cf3976ce77c..689206dee6de 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -164,6 +164,9 @@ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 747313698178..5d1469de6921 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, - struct sk_buff *skb) -{ - unsigned int hard_start_headroom; - unsigned int frame_size; - void *pkt_data_start; - - /* Part of headroom was reserved to xdpf */ - hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - - /* Memory size backing xdp_frame data already have reserved - * room for build_skb to place skb_shared_info in tailroom. - */ - frame_size = xdpf->frame_sz; - - pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb_around(skb, pkt_data_start, frame_size); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, hard_start_headroom); - __skb_put(skb, xdpf->len); - if (xdpf->metasize) - skb_metadata_set(skb, xdpf->metasize); - - /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdpf->dev_rx); - - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); - - /* Allow SKB to reuse area used by xdp_frame */ - xdp_scrub_frame(xdpf); - - return skb; -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(xdpf, skb); + skb = __xdp_build_skb_from_frame(xdpf, skb, + xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; diff --git a/net/core/xdp.c b/net/core/xdp.c index 3a8c9ab4ecbe..aeb09ed0704c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -513,3 +513,47 @@ void xdp_warn(const char *msg, const char *func, const int line) WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; EXPORT_SYMBOL_GPL(xdp_warn); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + unsigned int headroom, frame_size; + void *hard_start; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. + */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); -- cgit v1.2.3 From 89f479f0eccfc879e7bc0a69f44ed4a4639dfc32 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 12 Jan 2021 19:26:13 +0100 Subject: net, xdp: Introduce xdp_build_skb_from_frame utility routine Introduce xdp_build_skb_from_frame utility routine to build the skb from xdp_frame. Respect to __xdp_build_skb_from_frame, xdp_build_skb_from_frame will allocate the skb object. Rely on xdp_build_skb_from_frame in veth driver. Introduce missing xdp metadata support in veth_xdp_rcv_one routine. Add missing metadata support in veth_xdp_rcv_one(). Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Reviewed-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/94ade9e853162ae1947941965193190da97457bc.1610475660.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/veth.c | 18 +++--------------- include/net/xdp.h | 2 ++ net/core/xdp.c | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 99caae7d1641..6e03b619c93c 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -567,16 +567,10 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - void *hard_start = frame->data - frame->headroom; - int len = frame->len, delta = 0; struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; - unsigned int headroom; struct sk_buff *skb; - /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ - hard_start -= sizeof(struct xdp_frame); - rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { @@ -590,8 +584,8 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, switch (act) { case XDP_PASS: - delta = frame->data - xdp.data; - len = xdp.data_end - xdp.data; + if (xdp_update_frame_from_buff(&xdp, frame)) + goto err_xdp; break; case XDP_TX: orig_frame = *frame; @@ -629,18 +623,12 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, } rcu_read_unlock(); - headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); + skb = xdp_build_skb_from_frame(frame, rq->dev); if (!skb) { xdp_return_frame(frame); stats->rx_drops++; - goto err; } - xdp_release_frame(frame); - xdp_scrub_frame(frame); - skb->protocol = eth_type_trans(skb, rq->dev); -err: return skb; err_xdp: rcu_read_unlock(); diff --git a/include/net/xdp.h b/include/net/xdp.h index 689206dee6de..c4bfdc9a8b79 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -167,6 +167,8 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/net/core/xdp.c b/net/core/xdp.c index aeb09ed0704c..0d2630a35c3e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -557,3 +557,18 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, return skb; } EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); + +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + + return __xdp_build_skb_from_frame(xdpf, skb, dev); +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); -- cgit v1.2.3 From 6939f4ef16d48f2093f337162cfc041d0e30ed25 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 19 Jan 2021 12:22:36 +0000 Subject: trace: bpf: Allow bpf to attach to bare tracepoints Some subsystems only have bare tracepoints (a tracepoint with no associated trace event) to avoid the problem of trace events being an ABI that can't be changed. >From bpf presepective, bare tracepoints are what it calls RAW_TRACEPOINT(). Since bpf assumed there's 1:1 mapping, it relied on hooking to DEFINE_EVENT() macro to create bpf mapping of the tracepoints. Since bare tracepoints use DECLARE_TRACE() to create the tracepoint, bpf had no knowledge about their existence. By teaching bpf_probe.h to parse DECLARE_TRACE() in a similar fashion to DEFINE_EVENT(), bpf can find and attach to the new raw tracepoints. Enabling that comes with the contract that changes to raw tracepoints don't constitute a regression if they break existing bpf programs. We need the ability to continue to morph and modify these raw tracepoints without worrying about any ABI. Update Documentation/bpf/bpf_design_QA.rst to document this contract. Signed-off-by: Qais Yousef Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210119122237.2426878-2-qais.yousef@arm.com --- Documentation/bpf/bpf_design_QA.rst | 6 ++++++ include/trace/bpf_probe.h | 12 ++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 2df7b067ab93..0e15f9b05c9d 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -208,6 +208,12 @@ data structures and compile with kernel internal headers. Both of these kernel internals are subject to change and can break with newer kernels such that the program needs to be adapted accordingly. +Q: Are tracepoints part of the stable ABI? +------------------------------------------ +A: NO. Tracepoints are tied to internal implementation details hence they are +subject to change and can break with newer kernels. BPF programs need to change +accordingly when this happens. + Q: How much stack space a BPF program uses? ------------------------------------------- A: Currently all program types are limited to 512 bytes of stack diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index cd74bffed5c6..a23be89119aa 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -55,8 +55,7 @@ /* tracepoints with more than 12 arguments will hit build error */ #define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#define __BPF_DECLARE_TRACE(call, proto, args) \ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ @@ -64,6 +63,10 @@ __bpf_trace_##call(void *__data, proto) \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ } +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the @@ -111,6 +114,11 @@ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) +#undef DECLARE_TRACE +#define DECLARE_TRACE(call, proto, args) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ + __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) #undef DEFINE_EVENT_WRITABLE -- cgit v1.2.3 From 9cacf81f8161111db25f98e78a7a0e32ae142b3f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 Jan 2021 08:34:59 -0800 Subject: bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE. We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom call in do_tcp_getsockopt using the on-stack data. This removes 3% overhead for locking/unlocking the socket. Without this patch: 3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt | --3.30%--__cgroup_bpf_run_filter_getsockopt | --0.81%--__kmalloc With the patch applied: 0.52% 0.12% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt_kern Note, exporting uapi/tcp.h requires removing netinet/tcp.h from test_progs.h because those headers have confliciting definitions. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com --- include/linux/bpf-cgroup.h | 27 +- include/linux/indirect_call_wrapper.h | 6 + include/net/sock.h | 2 + include/net/tcp.h | 1 + kernel/bpf/cgroup.c | 46 +++ net/ipv4/tcp.c | 14 + net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/socket.c | 3 + tools/include/uapi/linux/tcp.h | 357 +++++++++++++++++++++ .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 1 + .../selftests/bpf/prog_tests/cls_redirect.c | 1 + .../selftests/bpf/prog_tests/sockmap_basic.c | 1 + .../testing/selftests/bpf/prog_tests/sockopt_sk.c | 28 ++ tools/testing/selftests/bpf/progs/sockopt_sk.c | 23 +- tools/testing/selftests/bpf/test_progs.h | 1 - 16 files changed, 506 insertions(+), 7 deletions(-) create mode 100644 tools/include/uapi/linux/tcp.h (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 72e69a0e1e8c..bcb2915e6124 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int __user *optlen, int max_optlen, int retval); +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, ({ \ int __ret = retval; \ if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ - optname, optval, \ - optlen, max_optlen, \ - retval); \ + if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ + !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ + tcp_bpf_bypass_getsockopt, \ + level, optname)) \ + __ret = __cgroup_bpf_run_filter_getsockopt( \ + sock, level, optname, optval, optlen, \ + max_optlen, retval); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ + sock, level, optname, optval, optlen, retval); \ __ret; \ }) @@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 54c02c84906a..cfcfef37b2f1 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -60,4 +60,10 @@ #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) #endif +#if IS_ENABLED(CONFIG_INET) +#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) +#endif + #endif diff --git a/include/net/sock.h b/include/net/sock.h index 129d200bccb4..7644ea64a376 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1174,6 +1174,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + bool (*bpf_bypass_getsockopt)(int level, + int optname); void (*release_cb)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 78d13c88720f..4bb42fb19711 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); +bool tcp_bpf_bypass_getsockopt(int level, int optname); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 96555a8a2c54..416e7738981b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1486,6 +1486,52 @@ out: sockopt_free_buf(&ctx); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when reval != 0. This is + * done as an optimization to avoid extra copy, assuming + * kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications. + */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 856ae516ac18..26aa923cf522 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, err)) goto zerocopy_rcv_sk_err; @@ -4133,6 +4135,18 @@ zerocopy_rcv_out: return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. + */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 777306b5bc22..62b6fd385a47 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2793,6 +2793,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e1509b02cb3..8539715ff035 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/socket.c b/net/socket.c index 33e8b6c4e1d3..7f0617ab5437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __sys_setsockopt(fd, level, optname, optval, optlen); } +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, + int optname)); + /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. diff --git a/tools/include/uapi/linux/tcp.h b/tools/include/uapi/linux/tcp.h new file mode 100644 index 000000000000..13ceeb395eb8 --- /dev/null +++ b/tools/include/uapi/linux/tcp.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol. + * + * Version: @(#)tcp.h 1.0.2 04/28/93 + * + * Author: Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _UAPI_LINUX_TCP_H +#define _UAPI_LINUX_TCP_H + +#include +#include +#include + +struct tcphdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +}; + +/* + * The union cast uses a gcc extension to avoid aliasing problems + * (union is compatible to any of its members) + * This means this part of the code is -fstrict-aliasing safe now. + */ +union tcp_word_hdr { + struct tcphdr hdr; + __be32 words[5]; +}; + +#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) + +enum { + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) +}; + +/* + * TCP general constants + */ +#define TCP_MSS_DEFAULT 536U /* IPv4 (RFC1122, RFC2581) */ +#define TCP_MSS_DESIRED 1220U /* IPv6 (tunneled), EDNS0 (RFC3226) */ + +/* TCP socket options */ +#define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */ +#define TCP_MAXSEG 2 /* Limit MSS */ +#define TCP_CORK 3 /* Never send partially complete segments */ +#define TCP_KEEPIDLE 4 /* Start keeplives after this period */ +#define TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define TCP_INFO 11 /* Information about this connection. */ +#define TCP_QUICKACK 12 /* Block/reenable quick acks */ +#define TCP_CONGESTION 13 /* Congestion control algorithm */ +#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ +#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 +#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define TCP_TIMESTAMP 24 +#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ +#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ +#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ +#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ +#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 +#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ + +#define TCP_CM_INQ TCP_INQ + +#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ + + +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +struct tcp_repair_window { + __u32 snd_wl1; + __u32 snd_wnd; + __u32 max_window; + + __u32 rcv_wnd; + __u32 rcv_wup; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; + +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + +/* for TCP_INFO socket option */ +#define TCPI_OPT_TIMESTAMPS 1 +#define TCPI_OPT_SACK 2 +#define TCPI_OPT_WSCALE 4 +#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ +#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ +#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ +enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. + */ + TCP_CA_Open = 0, +#define TCPF_CA_Open (1< +#include #include #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 9781d85cb223..e075d03ab630 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -7,6 +7,7 @@ #include #include +#include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 85f73261fab0..b8b48cac2ac3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include +#include #include "test_progs.h" #include "test_skmsg_load_helpers.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index b25c9c45c148..d5b44b135c00 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -2,6 +2,12 @@ #include #include "cgroup_helpers.h" +#include + +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef static int getsetsockopt(void) @@ -11,6 +17,7 @@ static int getsetsockopt(void) char u8[4]; __u32 u32; char cc[16]; /* TCP_CA_NAME_MAX */ + struct tcp_zerocopy_receive zc; } buf = {}; socklen_t optlen; char *big_buf = NULL; @@ -154,6 +161,27 @@ static int getsetsockopt(void) goto err; } + /* TCP_ZEROCOPY_RECEIVE triggers */ + memset(&buf, 0, sizeof(buf)); + optlen = sizeof(buf.zc); + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (err) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + + memset(&buf, 0, sizeof(buf)); + buf.zc.address = 12345; /* rejected by BPF */ + optlen = sizeof(buf.zc); + errno = 0; + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (errno != EPERM) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 712df7b49cb1..d3597f81e6e9 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -#include +#include #include +#include #include char _license[] SEC("license") = "GPL"; @@ -12,6 +12,10 @@ __u32 _version SEC("version") = 1; #define PAGE_SIZE 4096 #endif +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef struct sockopt_sk { @@ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. + * It has a custom implementation for performance + * reasons. + */ + + if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) + return 0; /* EPERM, bounds check */ + + if (((struct tcp_zerocopy_receive *)optval)->address != 0) + return 0; /* EPERM, unexpected data */ + + return 1; + } + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { if (optval + 1 > optval_end) return 0; /* EPERM, bounds check */ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e49e2fdde942..f7c2fd89d01a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -16,7 +16,6 @@ typedef __u16 __sum16; #include #include #include -#include #include #include #include -- cgit v1.2.3 From 20f2505fb436cfa674cf1f46aaa624f44d3d1d03 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 Jan 2021 08:35:00 -0800 Subject: bpf: Try to avoid kzalloc in cgroup/{s,g}etsockopt When we attach a bpf program to cgroup/getsockopt any other getsockopt() syscall starts incurring kzalloc/kfree cost. Let add a small buffer on the stack and use it for small (majority) {s,g}etsockopt values. The buffer is small enough to fit into the cache line and cover the majority of simple options (most of them are 4 byte ints). It seems natural to do the same for setsockopt, but it's a bit more involved when the BPF program modifies the data (where we have to kmalloc). The assumption is that for the majority of setsockopt calls (which are doing pure BPF options or apply policy) this will bring some benefit as well. Without this patch (we remove about 1% __kmalloc): 3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt | --3.30%--__cgroup_bpf_run_filter_getsockopt | --0.81%--__kmalloc Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210115163501.805133-3-sdf@google.com --- include/linux/filter.h | 5 +++++ kernel/bpf/cgroup.c | 52 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fdce5407214..5b3137d7b690 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1298,6 +1298,11 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +#define BPF_SOCKOPT_KERN_BUF_SIZE 32 +struct bpf_sockopt_buf { + u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; +}; + struct bpf_sockopt_kern { struct sock *sk; u8 *optval; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 416e7738981b..ba8a1199d0ba 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1350,7 +1370,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1410,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1444,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1425,7 +1463,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1483,7 +1521,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } -- cgit v1.2.3 From a9ed15dae0755a0368735e0556a462d8519bdb05 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 Jan 2021 08:35:01 -0800 Subject: bpf: Split cgroup_bpf_enabled per attach type When we attach any cgroup hook, the rest (even if unused/unattached) start to contribute small overhead. In particular, the one we want to avoid is __cgroup_bpf_run_filter_skb which does two redirections to get to the cgroup and pushes/pulls skb. Let's split cgroup_bpf_enabled to be per-attach to make sure only used attach types trigger. I've dropped some existing high-level cgroup_bpf_enabled in some places because BPF_PROG_CGROUP_XXX_RUN macros usually have another cgroup_bpf_enabled check. I also had to copy-paste BPF_CGROUP_RUN_SA_PROG_LOCK for GETPEERNAME/GETSOCKNAME because type for cgroup_bpf_enabled[type] has to be constant and known at compile time. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210115163501.805133-4-sdf@google.com --- include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++------------------ kernel/bpf/cgroup.c | 14 ++++++-------- net/ipv4/af_inet.c | 9 +++++---- net/ipv4/udp.c | 7 +++---- net/ipv6/af_inet6.c | 9 +++++---- net/ipv6/udp.c | 7 +++---- 6 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index bcb2915e6124..0748fd87969e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,8 +23,8 @@ struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF -extern struct static_key_false cgroup_bpf_enabled_key; -#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -189,7 +189,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ BPF_CGROUP_INET_INGRESS); \ \ @@ -199,7 +199,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ @@ -211,7 +211,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ @@ -232,7 +232,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ NULL); \ __ret; \ @@ -241,7 +241,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ t_ctx); \ @@ -256,8 +256,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) -#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ - sk->sk_prot->pre_connect) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ + ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) @@ -301,7 +303,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ @@ -311,7 +313,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ @@ -324,7 +326,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ access, \ BPF_CGROUP_DEVICE); \ @@ -336,7 +338,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ BPF_CGROUP_SYSCTL); \ @@ -347,7 +349,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -358,7 +360,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -367,7 +369,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -382,7 +384,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -444,7 +446,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled (0) +#define cgroup_bpf_enabled(type) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba8a1199d0ba..da649f20d6b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1360,8 +1360,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1457,8 +1456,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b94fa8eb831b..6ba2930ff49b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -777,18 +777,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69ea76578abb..c67e483fce41 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1124,7 +1124,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1858,9 +1858,8 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8e9c3e9ea36e..b9c654836b72 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -527,18 +527,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b9f3dfdd2383..a02ac875a923 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -409,9 +409,8 @@ try_again: } *addr_len = sizeof(*sin6); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -1462,7 +1461,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) -- cgit v1.2.3 From 7baf2429a1a965369b0ce44efb6315cdd515aa9c Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 19 Jan 2021 16:31:50 +0800 Subject: net/sched: cls_flower add CT_FLAGS_INVALID flag support This patch add the TCA_FLOWER_KEY_CT_FLAGS_INVALID flag to match the ct_state with invalid for conntrack. Signed-off-by: wenxu Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/1611045110-682-1-git-send-email-wenxu@ucloud.cn Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++-- include/net/sch_generic.h | 1 + include/uapi/linux/pkt_cls.h | 1 + net/core/dev.c | 2 ++ net/core/flow_dissector.c | 13 +++++++++---- net/sched/act_ct.c | 1 + net/sched/cls_flower.c | 4 +++- 7 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46f901adf1a8..186dad231e30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1353,8 +1353,8 @@ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - u16 *ctinfo_map, - size_t mapsize); + u16 *ctinfo_map, size_t mapsize, + bool post_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 639e465a108f..e7bee99aebce 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -388,6 +388,7 @@ struct qdisc_skb_cb { #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; u16 mru; + bool post_ct; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee95f42fb0ec..709668e264b0 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -591,6 +591,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ + TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index 00f970ba0248..d9ce02e95992 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3878,6 +3878,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4960,6 +4961,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, qdisc_skb_cb(skb)->pkt_len = skb->len; qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2d70ded389ae..c565c7a17091 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -237,9 +237,8 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - u16 *ctinfo_map, - size_t mapsize) + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@ -251,13 +250,19 @@ skb_flow_dissect_ct(const struct sk_buff *skb, return; ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); + if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + return; + } + if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 83a5c6722a06..b3442078aabc 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1030,6 +1030,7 @@ out_push: out: tcf_action_update_bstats(&c->common, skb); + qdisc_skb_cb(skb)->post_ct = true; if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; return retval; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 84f932532db7..4a9297a89c77 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -302,6 +302,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); + bool post_ct = qdisc_skb_cb(skb)->post_ct; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -318,7 +319,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, - ARRAY_SIZE(fl_ct_info_to_flower_map)); + ARRAY_SIZE(fl_ct_info_to_flower_map), + post_ct); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); -- cgit v1.2.3 From 07be2fed5ee7b3a01e0b21c15814b590af9c1527 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:13 +0100 Subject: net/fq_impl: bulk-free packets from a flow on overmemory This is similar to what sch_fq_codel does. It also amortizes the worst case cost of a follow-up patch that changes the selection of the biggest flow for dropping packets Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 55 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index e73d74d2fabf..06d2a79233c9 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -11,17 +11,25 @@ /* functions that are embedded into includer */ + +static void +__fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, + unsigned int bytes, unsigned int truesize) +{ + struct fq_tin *tin = flow->tin; + + tin->backlog_bytes -= bytes; + tin->backlog_packets -= packets; + flow->backlog -= bytes; + fq->backlog -= packets; + fq->memory_usage -= truesize; +} + static void fq_adjust_removal(struct fq *fq, struct fq_flow *flow, struct sk_buff *skb) { - struct fq_tin *tin = flow->tin; - - tin->backlog_bytes -= skb->len; - tin->backlog_packets--; - flow->backlog -= skb->len; - fq->backlog--; - fq->memory_usage -= skb->truesize; + __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) @@ -59,6 +67,34 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, return skb; } +static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, + fq_skb_free_t free_func) +{ + unsigned int packets = 0, bytes = 0, truesize = 0; + struct fq_tin *tin = flow->tin; + struct sk_buff *skb; + int pending; + + lockdep_assert_held(&fq->lock); + + pending = min_t(int, 32, skb_queue_len(&flow->queue) / 2); + do { + skb = __skb_dequeue(&flow->queue); + if (!skb) + break; + + packets++; + bytes += skb->len; + truesize += skb->truesize; + free_func(fq, tin, flow, skb); + } while (packets < pending); + + __fq_adjust_removal(fq, flow, packets, bytes, truesize); + fq_rejigger_backlog(fq, flow); + + return packets; +} + static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) @@ -190,12 +226,9 @@ static void fq_tin_enqueue(struct fq *fq, if (!flow) return; - skb = fq_flow_dequeue(fq, flow); - if (!skb) + if (!fq_flow_drop(fq, flow, free_func)) return; - free_func(fq, flow->tin, flow, skb); - flow->tin->overlimit++; fq->overlimit++; if (oom) { -- cgit v1.2.3 From bf9009bf21b53501f2abb2f59f9314d85bde5fc9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:14 +0100 Subject: net/fq_impl: drop get_default_func, move default flow to fq_tin Simplifies the code and prepares for a rework of scanning for flows on overmemory drop. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-2-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/fq.h | 1 + include/net/fq_impl.h | 11 +++++------ net/mac80211/ieee80211_i.h | 1 - net/mac80211/tx.c | 22 ++++------------------ 4 files changed, 10 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/fq.h b/include/net/fq.h index e39f3f8d5f8a..5df100b77099 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -47,6 +47,7 @@ struct fq_flow { struct fq_tin { struct list_head new_flows; struct list_head old_flows; + struct fq_flow default_flow; u32 backlog_bytes; u32 backlog_packets; u32 overlimit; diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 06d2a79233c9..dd374c7f0fe5 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -151,8 +151,7 @@ static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, - struct sk_buff *skb, - fq_flow_get_default_t get_default_func) + struct sk_buff *skb) { struct fq_flow *flow; @@ -160,7 +159,7 @@ static struct fq_flow *fq_flow_classify(struct fq *fq, flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { - flow = get_default_func(fq, tin, idx, skb); + flow = &tin->default_flow; tin->collisions++; fq->collisions++; } @@ -192,15 +191,14 @@ static void fq_recalc_backlog(struct fq *fq, static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, - fq_skb_free_t free_func, - fq_flow_get_default_t get_default_func) + fq_skb_free_t free_func) { struct fq_flow *flow; bool oom; lockdep_assert_held(&fq->lock); - flow = fq_flow_classify(fq, tin, idx, skb, get_default_func); + flow = fq_flow_classify(fq, tin, idx, skb); flow->tin = tin; flow->backlog += skb->len; @@ -331,6 +329,7 @@ static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); + fq_flow_init(&tin->default_flow); } static int fq_init(struct fq *fq, int flows_cnt) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8bf9c0e974d6..c0f6168fdeed 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -848,7 +848,6 @@ enum txq_info_flags { */ struct txq_info { struct fq_tin tin; - struct fq_flow def_flow; struct codel_vars def_cvars; struct codel_stats cstats; struct sk_buff_head frags; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ebb3228ce971..935a9ce65749 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1309,7 +1309,7 @@ static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, fq = &local->fq; if (cvars == &txqi->def_cvars) - flow = &txqi->def_flow; + flow = &txqi->tin.default_flow; else flow = &fq->flows[cvars - local->cvars]; @@ -1352,7 +1352,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, cparams = &local->cparams; } - if (flow == &txqi->def_flow) + if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else cvars = &local->cvars[flow - fq->flows]; @@ -1379,17 +1379,6 @@ static void fq_skb_free_func(struct fq *fq, ieee80211_free_txskb(&local->hw, skb); } -static struct fq_flow *fq_flow_get_default_func(struct fq *fq, - struct fq_tin *tin, - int idx, - struct sk_buff *skb) -{ - struct txq_info *txqi; - - txqi = container_of(tin, struct txq_info, tin); - return &txqi->def_flow; -} - static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct txq_info *txqi, struct sk_buff *skb) @@ -1402,8 +1391,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, spin_lock_bh(&fq->lock); fq_tin_enqueue(fq, tin, flow_idx, skb, - fq_skb_free_func, - fq_flow_get_default_func); + fq_skb_free_func); spin_unlock_bh(&fq->lock); } @@ -1446,7 +1434,6 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct txq_info *txqi, int tid) { fq_tin_init(&txqi->tin); - fq_flow_init(&txqi->def_flow); codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); @@ -3283,8 +3270,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, */ tin = &txqi->tin; - flow = fq_flow_classify(fq, tin, flow_idx, skb, - fq_flow_get_default_func); + flow = fq_flow_classify(fq, tin, flow_idx, skb); head = skb_peek_tail(&flow->queue); if (!head || skb_is_gso(head)) goto out; -- cgit v1.2.3 From d7b649291782430904e17cde2ebfc90f76021ca5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:15 +0100 Subject: net/fq_impl: do not maintain a backlog-sorted list of flows A sorted flow list is only needed to drop packets in the biggest flow when hitting the overmemory condition. By scanning flows only when needed, we can avoid paying the cost of maintaining the list under normal conditions In order to avoid scanning lots of empty flows and touching too many cold cache lines, a bitmap of flows with backlog is maintained Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-3-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/fq.h | 10 ++--- include/net/fq_impl.h | 113 ++++++++++++++++++++++++++++++-------------------- net/mac80211/tx.c | 2 - 3 files changed, 71 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/net/fq.h b/include/net/fq.h index 5df100b77099..2eccbbd2b559 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -19,8 +19,6 @@ struct fq_tin; * @flowchain: can be linked to fq_tin's new_flows or old_flows. Used for DRR++ * (deficit round robin) based round robin queuing similar to the one * found in net/sched/sch_fq_codel.c - * @backlogchain: can be linked to other fq_flow and fq. Used to keep track of - * fat flows and efficient head-dropping if packet limit is reached * @queue: sk_buff queue to hold packets * @backlog: number of bytes pending in the queue. The number of packets can be * found in @queue.qlen @@ -29,7 +27,6 @@ struct fq_tin; struct fq_flow { struct fq_tin *tin; struct list_head flowchain; - struct list_head backlogchain; struct sk_buff_head queue; u32 backlog; int deficit; @@ -47,6 +44,7 @@ struct fq_flow { struct fq_tin { struct list_head new_flows; struct list_head old_flows; + struct list_head tin_list; struct fq_flow default_flow; u32 backlog_bytes; u32 backlog_packets; @@ -60,14 +58,14 @@ struct fq_tin { /** * struct fq - main container for fair queuing purposes * - * @backlogs: linked to fq_flows. Used to maintain fat flows for efficient - * head-dropping when @backlog reaches @limit * @limit: max number of packets that can be queued across all flows * @backlog: number of packets queued across all flows */ struct fq { struct fq_flow *flows; - struct list_head backlogs; + unsigned long *flows_bitmap; + + struct list_head tin_backlog; spinlock_t lock; u32 flows_cnt; u32 limit; diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index dd374c7f0fe5..a5f67a2c0c73 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -17,12 +17,24 @@ __fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, unsigned int bytes, unsigned int truesize) { struct fq_tin *tin = flow->tin; + int idx; tin->backlog_bytes -= bytes; tin->backlog_packets -= packets; flow->backlog -= bytes; fq->backlog -= packets; fq->memory_usage -= truesize; + + if (flow->backlog) + return; + + if (flow == &tin->default_flow) { + list_del_init(&tin->tin_list); + return; + } + + idx = flow - fq->flows; + __clear_bit(idx, fq->flows_bitmap); } static void fq_adjust_removal(struct fq *fq, @@ -32,24 +44,6 @@ static void fq_adjust_removal(struct fq *fq, __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } -static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) -{ - struct fq_flow *i; - - if (flow->backlog == 0) { - list_del_init(&flow->backlogchain); - } else { - i = flow; - - list_for_each_entry_continue(i, &fq->backlogs, backlogchain) - if (i->backlog < flow->backlog) - break; - - list_move_tail(&flow->backlogchain, - &i->backlogchain); - } -} - static struct sk_buff *fq_flow_dequeue(struct fq *fq, struct fq_flow *flow) { @@ -62,7 +56,6 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, return NULL; fq_adjust_removal(fq, flow, skb); - fq_rejigger_backlog(fq, flow); return skb; } @@ -90,7 +83,6 @@ static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, } while (packets < pending); __fq_adjust_removal(fq, flow, packets, bytes, truesize); - fq_rejigger_backlog(fq, flow); return packets; } @@ -170,22 +162,36 @@ static struct fq_flow *fq_flow_classify(struct fq *fq, return flow; } -static void fq_recalc_backlog(struct fq *fq, - struct fq_tin *tin, - struct fq_flow *flow) +static struct fq_flow *fq_find_fattest_flow(struct fq *fq) { - struct fq_flow *i; + struct fq_tin *tin; + struct fq_flow *flow = NULL; + u32 len = 0; + int i; - if (list_empty(&flow->backlogchain)) - list_add_tail(&flow->backlogchain, &fq->backlogs); + for_each_set_bit(i, fq->flows_bitmap, fq->flows_cnt) { + struct fq_flow *cur = &fq->flows[i]; + unsigned int cur_len; - i = flow; - list_for_each_entry_continue_reverse(i, &fq->backlogs, - backlogchain) - if (i->backlog > flow->backlog) - break; + cur_len = cur->backlog; + if (cur_len <= len) + continue; + + flow = cur; + len = cur_len; + } + + list_for_each_entry(tin, &fq->tin_backlog, tin_list) { + unsigned int cur_len = tin->default_flow.backlog; - list_move(&flow->backlogchain, &i->backlogchain); + if (cur_len <= len) + continue; + + flow = &tin->default_flow; + len = cur_len; + } + + return flow; } static void fq_tin_enqueue(struct fq *fq, @@ -200,6 +206,13 @@ static void fq_tin_enqueue(struct fq *fq, flow = fq_flow_classify(fq, tin, idx, skb); + if (!flow->backlog) { + if (flow != &tin->default_flow) + __set_bit(idx, fq->flows_bitmap); + else if (list_empty(&tin->tin_list)) + list_add(&tin->tin_list, &fq->tin_backlog); + } + flow->tin = tin; flow->backlog += skb->len; tin->backlog_bytes += skb->len; @@ -207,8 +220,6 @@ static void fq_tin_enqueue(struct fq *fq, fq->memory_usage += skb->truesize; fq->backlog++; - fq_recalc_backlog(fq, tin, flow); - if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; list_add_tail(&flow->flowchain, @@ -218,9 +229,7 @@ static void fq_tin_enqueue(struct fq *fq, __skb_queue_tail(&flow->queue, skb); oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { - flow = list_first_entry_or_null(&fq->backlogs, - struct fq_flow, - backlogchain); + flow = fq_find_fattest_flow(fq); if (!flow) return; @@ -255,8 +264,6 @@ static void fq_flow_filter(struct fq *fq, fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } - - fq_rejigger_backlog(fq, flow); } static void fq_tin_filter(struct fq *fq, @@ -279,16 +286,18 @@ static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { + struct fq_tin *tin = flow->tin; struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) - free_func(fq, flow->tin, flow, skb); + free_func(fq, tin, flow, skb); - if (!list_empty(&flow->flowchain)) + if (!list_empty(&flow->flowchain)) { list_del_init(&flow->flowchain); - - if (!list_empty(&flow->backlogchain)) - list_del_init(&flow->backlogchain); + if (list_empty(&tin->new_flows) && + list_empty(&tin->old_flows)) + list_del_init(&tin->tin_list); + } flow->tin = NULL; @@ -314,6 +323,7 @@ static void fq_tin_reset(struct fq *fq, fq_flow_reset(fq, flow, free_func); } + WARN_ON_ONCE(!list_empty(&tin->tin_list)); WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } @@ -321,7 +331,6 @@ static void fq_tin_reset(struct fq *fq, static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); - INIT_LIST_HEAD(&flow->backlogchain); __skb_queue_head_init(&flow->queue); } @@ -329,6 +338,7 @@ static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); + INIT_LIST_HEAD(&tin->tin_list); fq_flow_init(&tin->default_flow); } @@ -337,8 +347,8 @@ static int fq_init(struct fq *fq, int flows_cnt) int i; memset(fq, 0, sizeof(fq[0])); - INIT_LIST_HEAD(&fq->backlogs); spin_lock_init(&fq->lock); + INIT_LIST_HEAD(&fq->tin_backlog); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; @@ -348,6 +358,14 @@ static int fq_init(struct fq *fq, int flows_cnt) if (!fq->flows) return -ENOMEM; + fq->flows_bitmap = kcalloc(BITS_TO_LONGS(fq->flows_cnt), sizeof(long), + GFP_KERNEL); + if (!fq->flows_bitmap) { + kvfree(fq->flows); + fq->flows = NULL; + return -ENOMEM; + } + for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); @@ -364,6 +382,9 @@ static void fq_reset(struct fq *fq, kvfree(fq->flows); fq->flows = NULL; + + kfree(fq->flows_bitmap); + fq->flows_bitmap = NULL; } #endif diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 935a9ce65749..d981647c2863 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3337,8 +3337,6 @@ out_recalc: if (head->len != orig_len) { flow->backlog += head->len - orig_len; tin->backlog_bytes += head->len - orig_len; - - fq_recalc_backlog(fq, tin, flow); } out: spin_unlock_bh(&fq->lock); -- cgit v1.2.3 From 80a915ec4427f0083829f7e6518ee9f21521ee1e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:18 +0100 Subject: mac80211: add rx decapsulation offload support This allows drivers to pass 802.3 frames to mac80211, with some restrictions: - the skb must be passed with a valid sta - fast-rx needs to be active for the sta - monitor mode needs to be disabled mac80211 will tell the driver when it is safe to enable rx decap offload for a particular station. In order to implement support, a driver must: - call ieee80211_hw_set(hw, SUPPORTS_RX_DECAP_OFFLOAD) - implement ops->sta_set_decap_offload - mark 802.3 frames with RX_FLAG_8023 If it doesn't want to enable offload for some vif types, it can mask out IEEE80211_OFFLOAD_DECAP_ENABLED in vif->offload_flags from within the .add_interface or .update_vif_offload driver ops Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-6-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 +++ net/mac80211/debugfs.c | 1 + net/mac80211/debugfs_sta.c | 1 + net/mac80211/driver-ops.h | 16 +++ net/mac80211/iface.c | 17 +++- net/mac80211/rx.c | 243 +++++++++++++++++++++++++++++---------------- net/mac80211/sta_info.h | 2 + net/mac80211/trace.h | 18 +++- 8 files changed, 225 insertions(+), 89 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2bdbf62f4ecd..176fe0d9f67f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1296,6 +1296,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the "0-length PSDU" field included there. The value for it is * in &struct ieee80211_rx_status. Note that if this value isn't * known the frame shouldn't be reported. + * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by + * hardware or driver) */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -1328,6 +1330,7 @@ enum mac80211_rx_flags { RX_FLAG_RADIOTAP_HE_MU = BIT(27), RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), + RX_FLAG_8023 = BIT(30), }; /** @@ -1649,11 +1652,15 @@ enum ieee80211_vif_flags { * The driver supports sending frames passed as 802.3 frames by mac80211. * It must also support sending 802.11 packets for the same interface. * @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload + * @IEEE80211_OFFLOAD_DECAP_ENABLED: rx encapsulation offload is enabled + * The driver supports passing received 802.11 frames as 802.3 frames to + * mac80211. */ enum ieee80211_offload_flags { IEEE80211_OFFLOAD_ENCAP_ENABLED = BIT(0), IEEE80211_OFFLOAD_ENCAP_4ADDR = BIT(1), + IEEE80211_OFFLOAD_DECAP_ENABLED = BIT(2), }; /** @@ -2389,6 +2396,9 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation * offload * + * @IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD: Hardware supports rx decapsulation + * offload + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2442,6 +2452,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, + IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -3881,6 +3892,8 @@ enum ieee80211_reconfig_type { * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode * @set_sar_specs: Update the SAR (TX power) settings. + * @sta_set_decap_offload: Called to notify the driver when a station is allowed + * to use rx decapsulation offload */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4198,6 +4211,9 @@ struct ieee80211_ops { struct ieee80211_sta *sta, bool enabled); int (*set_sar_specs)(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); + void (*sta_set_decap_offload)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, bool enabled); }; /** diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 9e723d943421..08a6f6644dc4 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -405,6 +405,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), + FLAG(SUPPORTS_RX_DECAP_OFFLOAD), #undef FLAG }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index eb4bb79d936a..5a27c61a7b38 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -79,6 +79,7 @@ static const char * const sta_flag_names[] = { FLAG(MPSP_RECIPIENT), FLAG(PS_DELIVER), FLAG(USES_ENCRYPTION), + FLAG(DECAP_OFFLOAD), #undef FLAG }; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index bcdfd19a596b..604ca59937f0 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1413,4 +1413,20 @@ static inline void drv_sta_set_4addr(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + bool enabled) +{ + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_sta_set_decap_offload(local, sdata, sta, enabled); + if (local->ops->sta_set_decap_offload) + local->ops->sta_set_decap_offload(&local->hw, &sdata->vif, sta, + enabled); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 3b9ec4ef81c3..1b44690823aa 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -765,7 +765,7 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = { .ndo_get_stats64 = ieee80211_get_stats64, }; -static bool ieee80211_iftype_supports_encap_offload(enum nl80211_iftype iftype) +static bool ieee80211_iftype_supports_hdr_offload(enum nl80211_iftype iftype) { switch (iftype) { /* P2P GO and client are mapped to AP/STATION types */ @@ -785,7 +785,7 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat flags = sdata->vif.offload_flags; if (ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) && - ieee80211_iftype_supports_encap_offload(sdata->vif.type)) { + ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED; if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) && @@ -798,10 +798,21 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; } + if (ieee80211_hw_check(&local->hw, SUPPORTS_RX_DECAP_OFFLOAD) && + ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { + flags |= IEEE80211_OFFLOAD_DECAP_ENABLED; + + if (local->monitors) + flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; + } else { + flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; + } + if (sdata->vif.offload_flags == flags) return false; sdata->vif.offload_flags = flags; + ieee80211_check_fast_rx_iface(sdata); return true; } @@ -819,7 +830,7 @@ static void ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata) } if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) || - !ieee80211_iftype_supports_encap_offload(bss->vif.type)) + !ieee80211_iftype_supports_hdr_offload(bss->vif.type)) return; enabled = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 972895e9f22d..c1343c028b76 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4095,7 +4095,9 @@ void ieee80211_check_fast_rx(struct sta_info *sta) .vif_type = sdata->vif.type, .control_port_protocol = sdata->control_port_protocol, }, *old, *new = NULL; + bool set_offload = false; bool assign = false; + bool offload; /* use sparse to check that we don't return without updating */ __acquire(check_fast_rx); @@ -4208,6 +4210,17 @@ void ieee80211_check_fast_rx(struct sta_info *sta) if (assign) new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL); + offload = assign && + (sdata->vif.offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED); + + if (offload) + set_offload = !test_and_set_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); + else + set_offload = test_and_clear_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); + + if (set_offload) + drv_sta_set_decap_offload(local, sdata, &sta->sta, assign); + spin_lock_bh(&sta->lock); old = rcu_dereference_protected(sta->fast_rx, true); rcu_assign_pointer(sta->fast_rx, new); @@ -4254,6 +4267,104 @@ void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->sta_mtx); } +static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, + struct ieee80211_fast_rx *fast_rx, + int orig_len) +{ + struct ieee80211_sta_rx_stats *stats; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + struct sta_info *sta = rx->sta; + struct sk_buff *skb = rx->skb; + void *sa = skb->data + ETH_ALEN; + void *da = skb->data; + + stats = &sta->rx_stats; + if (fast_rx->uses_rss) + stats = this_cpu_ptr(sta->pcpu_rx_stats); + + /* statistics part of ieee80211_rx_h_sta_process() */ + if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { + stats->last_signal = status->signal; + if (!fast_rx->uses_rss) + ewma_signal_add(&sta->rx_stats_avg.signal, + -status->signal); + } + + if (status->chains) { + int i; + + stats->chains = status->chains; + for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { + int signal = status->chain_signal[i]; + + if (!(status->chains & BIT(i))) + continue; + + stats->chain_signal_last[i] = signal; + if (!fast_rx->uses_rss) + ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], + -signal); + } + } + /* end of statistics */ + + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); + + stats->fragments++; + stats->packets++; + + skb->dev = fast_rx->dev; + + dev_sw_netstats_rx_add(fast_rx->dev, skb->len); + + /* The seqno index has the same property as needed + * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS + * for non-QoS-data frames. Here we know it's a data + * frame, so count MSDUs. + */ + u64_stats_update_begin(&stats->syncp); + stats->msdu[rx->seqno_idx]++; + stats->bytes += orig_len; + u64_stats_update_end(&stats->syncp); + + if (fast_rx->internal_forward) { + struct sk_buff *xmit_skb = NULL; + if (is_multicast_ether_addr(da)) { + xmit_skb = skb_copy(skb, GFP_ATOMIC); + } else if (!ether_addr_equal(da, sa) && + sta_info_get(rx->sdata, da)) { + xmit_skb = skb; + skb = NULL; + } + + if (xmit_skb) { + /* + * Send to wireless media and increase priority by 256 + * to keep the received priority instead of + * reclassifying the frame (see cfg80211_classify8021d). + */ + xmit_skb->priority += 256; + xmit_skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(xmit_skb); + skb_reset_mac_header(xmit_skb); + dev_queue_xmit(xmit_skb); + } + + if (!skb) + return; + } + + /* deliver to local stack */ + skb->protocol = eth_type_trans(skb, fast_rx->dev); + memset(skb->cb, 0, sizeof(skb->cb)); + if (rx->list) + list_add_tail(&skb->list, rx->list); + else + netif_receive_skb(skb); + +} + static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct ieee80211_fast_rx *fast_rx) { @@ -4274,9 +4385,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, } addrs __aligned(2); struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; - if (fast_rx->uses_rss) - stats = this_cpu_ptr(sta->pcpu_rx_stats); - /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write * to a common data structure; drivers can implement that per queue * but we don't have that information in mac80211 @@ -4350,32 +4458,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, pskb_trim(skb, skb->len - fast_rx->icv_len)) goto drop; - /* statistics part of ieee80211_rx_h_sta_process() */ - if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { - stats->last_signal = status->signal; - if (!fast_rx->uses_rss) - ewma_signal_add(&sta->rx_stats_avg.signal, - -status->signal); - } - - if (status->chains) { - int i; - - stats->chains = status->chains; - for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { - int signal = status->chain_signal[i]; - - if (!(status->chains & BIT(i))) - continue; - - stats->chain_signal_last[i] = signal; - if (!fast_rx->uses_rss) - ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], - -signal); - } - } - /* end of statistics */ - if (rx->key && !ieee80211_has_protected(hdr->frame_control)) goto drop; @@ -4387,12 +4469,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, return true; } - stats->last_rx = jiffies; - stats->last_rate = sta_stats_encode_rate(status); - - stats->fragments++; - stats->packets++; - /* do the header conversion - first grab the addresses */ ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs); ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs); @@ -4401,58 +4477,14 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* push the addresses in front */ memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs)); - skb->dev = fast_rx->dev; - - dev_sw_netstats_rx_add(fast_rx->dev, skb->len); - - /* The seqno index has the same property as needed - * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS - * for non-QoS-data frames. Here we know it's a data - * frame, so count MSDUs. - */ - u64_stats_update_begin(&stats->syncp); - stats->msdu[rx->seqno_idx]++; - stats->bytes += orig_len; - u64_stats_update_end(&stats->syncp); - - if (fast_rx->internal_forward) { - struct sk_buff *xmit_skb = NULL; - if (is_multicast_ether_addr(addrs.da)) { - xmit_skb = skb_copy(skb, GFP_ATOMIC); - } else if (!ether_addr_equal(addrs.da, addrs.sa) && - sta_info_get(rx->sdata, addrs.da)) { - xmit_skb = skb; - skb = NULL; - } - - if (xmit_skb) { - /* - * Send to wireless media and increase priority by 256 - * to keep the received priority instead of - * reclassifying the frame (see cfg80211_classify8021d). - */ - xmit_skb->priority += 256; - xmit_skb->protocol = htons(ETH_P_802_3); - skb_reset_network_header(xmit_skb); - skb_reset_mac_header(xmit_skb); - dev_queue_xmit(xmit_skb); - } - - if (!skb) - return true; - } - - /* deliver to local stack */ - skb->protocol = eth_type_trans(skb, fast_rx->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->list) - list_add_tail(&skb->list, rx->list); - else - netif_receive_skb(skb); + ieee80211_rx_8023(rx, fast_rx, orig_len); return true; drop: dev_kfree_skb(skb); + if (fast_rx->uses_rss) + stats = this_cpu_ptr(sta->pcpu_rx_stats); + stats->dropped++; return true; } @@ -4506,6 +4538,43 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, return true; } +static void __ieee80211_rx_handle_8023(struct ieee80211_hw *hw, + struct ieee80211_sta *pubsta, + struct sk_buff *skb, + struct list_head *list) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_fast_rx *fast_rx; + struct ieee80211_rx_data rx; + + memset(&rx, 0, sizeof(rx)); + rx.skb = skb; + rx.local = local; + rx.list = list; + + I802_DEBUG_INC(local->dot11ReceivedFragmentCount); + + /* drop frame if too short for header */ + if (skb->len < sizeof(struct ethhdr)) + goto drop; + + if (!pubsta) + goto drop; + + rx.sta = container_of(pubsta, struct sta_info, sta); + rx.sdata = rx.sta->sdata; + + fast_rx = rcu_dereference(rx.sta->fast_rx); + if (!fast_rx) + goto drop; + + ieee80211_rx_8023(&rx, fast_rx, skb->len); + return; + +drop: + dev_kfree_skb(skb); +} + /* * This is the actual Rx frames handler. as it belongs to Rx path it must * be called with rcu_read_lock protection. @@ -4737,13 +4806,17 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, * if it was previously present. * Also, frames with less than 16 bytes are dropped. */ - skb = ieee80211_rx_monitor(local, skb, rate); + if (!(status->flag & RX_FLAG_8023)) + skb = ieee80211_rx_monitor(local, skb, rate); if (skb) { ieee80211_tpt_led_trig_rx(local, ((struct ieee80211_hdr *)skb->data)->frame_control, skb->len); - __ieee80211_rx_handle_packet(hw, pubsta, skb, list); + if (status->flag & RX_FLAG_8023) + __ieee80211_rx_handle_8023(hw, pubsta, skb, list); + else + __ieee80211_rx_handle_packet(hw, pubsta, skb, list); } kcov_remote_stop(); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 7afd07636b81..78b9d0c7cc58 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -71,6 +71,7 @@ * until pending frames are delivered * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, * so drop all packets without a key later. + * @WLAN_STA_DECAP_OFFLOAD: This station uses rx decap offload * * @NUM_WLAN_STA_FLAGS: number of defined flags */ @@ -102,6 +103,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, WLAN_STA_USES_ENCRYPTION, + WLAN_STA_DECAP_OFFLOAD, NUM_WLAN_STA_FLAGS, }; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 601322e16957..8fcc39056402 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2761,7 +2761,7 @@ DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload, TP_ARGS(local, sdata) ); -TRACE_EVENT(drv_sta_set_4addr, +DECLARE_EVENT_CLASS(sta_flag_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled), @@ -2788,6 +2788,22 @@ TRACE_EVENT(drv_sta_set_4addr, ) ); +DEFINE_EVENT(sta_flag_evt, drv_sta_set_4addr, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, bool enabled), + + TP_ARGS(local, sdata, sta, enabled) +); + +DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, bool enabled), + + TP_ARGS(local, sdata, sta, enabled) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From e8e507a8ac90d48053dfdea9d4855495b0204956 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 19 Jan 2021 16:07:47 +0100 Subject: soc: fsl: qe: make cpm_muram_offset take a const void* argument Allow passing const-qualified pointers without requiring a cast in the caller. Signed-off-by: Rasmus Villemoes Acked-by: Li Yang Signed-off-by: Jakub Kicinski --- drivers/soc/fsl/qe/qe_common.c | 2 +- include/soc/fsl/qe/qe.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/soc/fsl/qe/qe_common.c b/drivers/soc/fsl/qe/qe_common.c index 497a7e0fd027..b81cd1b2e4a0 100644 --- a/drivers/soc/fsl/qe/qe_common.c +++ b/drivers/soc/fsl/qe/qe_common.c @@ -223,7 +223,7 @@ void __iomem *cpm_muram_addr(unsigned long offset) } EXPORT_SYMBOL(cpm_muram_addr); -unsigned long cpm_muram_offset(void __iomem *addr) +unsigned long cpm_muram_offset(const void __iomem *addr) { return addr - (void __iomem *)muram_vbase; } diff --git a/include/soc/fsl/qe/qe.h b/include/soc/fsl/qe/qe.h index 3feddfec9f87..8ee3747433c0 100644 --- a/include/soc/fsl/qe/qe.h +++ b/include/soc/fsl/qe/qe.h @@ -102,7 +102,7 @@ s32 cpm_muram_alloc(unsigned long size, unsigned long align); void cpm_muram_free(s32 offset); s32 cpm_muram_alloc_fixed(unsigned long offset, unsigned long size); void __iomem *cpm_muram_addr(unsigned long offset); -unsigned long cpm_muram_offset(void __iomem *addr); +unsigned long cpm_muram_offset(const void __iomem *addr); dma_addr_t cpm_muram_dma(void __iomem *addr); #else static inline s32 cpm_muram_alloc(unsigned long size, @@ -126,7 +126,7 @@ static inline void __iomem *cpm_muram_addr(unsigned long offset) return NULL; } -static inline unsigned long cpm_muram_offset(void __iomem *addr) +static inline unsigned long cpm_muram_offset(const void __iomem *addr) { return -ENOSYS; } -- cgit v1.2.3 From 186b8daffb4ec2dabb8a3d93b329b16152a5a100 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 19 Jan 2021 16:07:49 +0100 Subject: soc: fsl: qe: add cpm_muram_free_addr() helper Add a helper that takes a virtual address rather than the muram offset. This will be used in a couple of places to avoid having to store both the offset and the virtual address, as well as removing NULL checks from the callers. Signed-off-by: Rasmus Villemoes Acked-by: Li Yang Signed-off-by: Jakub Kicinski --- drivers/soc/fsl/qe/qe_common.c | 12 ++++++++++++ include/soc/fsl/qe/qe.h | 5 +++++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/soc/fsl/qe/qe_common.c b/drivers/soc/fsl/qe/qe_common.c index 236d962d45f1..654e9246ce6b 100644 --- a/drivers/soc/fsl/qe/qe_common.c +++ b/drivers/soc/fsl/qe/qe_common.c @@ -238,3 +238,15 @@ dma_addr_t cpm_muram_dma(void __iomem *addr) return muram_pbase + (addr - muram_vbase); } EXPORT_SYMBOL(cpm_muram_dma); + +/* + * As cpm_muram_free, but takes the virtual address rather than the + * muram offset. + */ +void cpm_muram_free_addr(const void __iomem *addr) +{ + if (!addr) + return; + cpm_muram_free(cpm_muram_offset(addr)); +} +EXPORT_SYMBOL(cpm_muram_free_addr); diff --git a/include/soc/fsl/qe/qe.h b/include/soc/fsl/qe/qe.h index 8ee3747433c0..66f1afc393d1 100644 --- a/include/soc/fsl/qe/qe.h +++ b/include/soc/fsl/qe/qe.h @@ -104,6 +104,7 @@ s32 cpm_muram_alloc_fixed(unsigned long offset, unsigned long size); void __iomem *cpm_muram_addr(unsigned long offset); unsigned long cpm_muram_offset(const void __iomem *addr); dma_addr_t cpm_muram_dma(void __iomem *addr); +void cpm_muram_free_addr(const void __iomem *addr); #else static inline s32 cpm_muram_alloc(unsigned long size, unsigned long align) @@ -135,6 +136,9 @@ static inline dma_addr_t cpm_muram_dma(void __iomem *addr) { return 0; } +static inline void cpm_muram_free_addr(const void __iomem *addr) +{ +} #endif /* defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE) */ /* QE PIO */ @@ -239,6 +243,7 @@ static inline int qe_alive_during_sleep(void) #define qe_muram_addr cpm_muram_addr #define qe_muram_offset cpm_muram_offset #define qe_muram_dma cpm_muram_dma +#define qe_muram_free_addr cpm_muram_free_addr #ifdef CONFIG_PPC32 #define qe_iowrite8(val, addr) out_8(addr, val) -- cgit v1.2.3 From 64a99fe596f9cb2af2c23c64352817ff8cf662bb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 19 Jan 2021 16:07:58 +0100 Subject: ethernet: ucc_geth: remove bd_mem_part and all associated code The bd_mem_part member of ucc_geth_info always has the value MEM_PART_SYSTEM, and AFAICT, there has never been any code setting it to any other value. Moreover, muram is a somewhat precious resource, so there's no point using that when normal memory serves just as well. Apart from removing a lot of dead code, this is also motivated by wanting to clean up the "store result from kmalloc() in a u32" mess. Signed-off-by: Rasmus Villemoes Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/ucc_geth.c | 108 ++++++++---------------------- include/soc/fsl/qe/qe.h | 6 -- include/soc/fsl/qe/ucc_fast.h | 1 - 3 files changed, 29 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 2369a5ede680..1e9d2f3f47a3 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -72,7 +72,6 @@ MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 0xffff=all)"); static const struct ucc_geth_info ugeth_primary_info = { .uf_info = { - .bd_mem_part = MEM_PART_SYSTEM, .rtsm = UCC_FAST_SEND_IDLES_BETWEEN_FRAMES, .max_rx_buf_length = 1536, /* adjusted at startup if max-speed 1000 */ @@ -1854,12 +1853,7 @@ static void ucc_geth_free_rx(struct ucc_geth_private *ugeth) kfree(ugeth->rx_skbuff[i]); - if (ugeth->ug_info->uf_info.bd_mem_part == - MEM_PART_SYSTEM) - kfree((void *)ugeth->rx_bd_ring_offset[i]); - else if (ugeth->ug_info->uf_info.bd_mem_part == - MEM_PART_MURAM) - qe_muram_free(ugeth->rx_bd_ring_offset[i]); + kfree((void *)ugeth->rx_bd_ring_offset[i]); ugeth->p_rx_bd_ring[i] = NULL; } } @@ -1897,12 +1891,7 @@ static void ucc_geth_free_tx(struct ucc_geth_private *ugeth) kfree(ugeth->tx_skbuff[i]); if (ugeth->p_tx_bd_ring[i]) { - if (ugeth->ug_info->uf_info.bd_mem_part == - MEM_PART_SYSTEM) - kfree((void *)ugeth->tx_bd_ring_offset[i]); - else if (ugeth->ug_info->uf_info.bd_mem_part == - MEM_PART_MURAM) - qe_muram_free(ugeth->tx_bd_ring_offset[i]); + kfree((void *)ugeth->tx_bd_ring_offset[i]); ugeth->p_tx_bd_ring[i] = NULL; } } @@ -2060,13 +2049,6 @@ static int ucc_struct_init(struct ucc_geth_private *ugeth) ug_info = ugeth->ug_info; uf_info = &ug_info->uf_info; - if (!((uf_info->bd_mem_part == MEM_PART_SYSTEM) || - (uf_info->bd_mem_part == MEM_PART_MURAM))) { - if (netif_msg_probe(ugeth)) - pr_err("Bad memory partition value\n"); - return -EINVAL; - } - /* Rx BD lengths */ for (i = 0; i < ug_info->numQueuesRx; i++) { if ((ug_info->bdRingLenRx[i] < UCC_GETH_RX_BD_RING_SIZE_MIN) || @@ -2186,6 +2168,8 @@ static int ucc_geth_alloc_tx(struct ucc_geth_private *ugeth) /* Allocate Tx bds */ for (j = 0; j < ug_info->numQueuesTx; j++) { + u32 align = UCC_GETH_TX_BD_RING_ALIGNMENT; + /* Allocate in multiple of UCC_GETH_TX_BD_RING_SIZE_MEMORY_ALIGNMENT, according to spec */ @@ -2195,25 +2179,15 @@ static int ucc_geth_alloc_tx(struct ucc_geth_private *ugeth) if ((ug_info->bdRingLenTx[j] * sizeof(struct qe_bd)) % UCC_GETH_TX_BD_RING_SIZE_MEMORY_ALIGNMENT) length += UCC_GETH_TX_BD_RING_SIZE_MEMORY_ALIGNMENT; - if (uf_info->bd_mem_part == MEM_PART_SYSTEM) { - u32 align = UCC_GETH_TX_BD_RING_ALIGNMENT; - - ugeth->tx_bd_ring_offset[j] = - (u32) kmalloc((u32) (length + align), GFP_KERNEL); - - if (ugeth->tx_bd_ring_offset[j] != 0) - ugeth->p_tx_bd_ring[j] = - (u8 __iomem *)((ugeth->tx_bd_ring_offset[j] + - align) & ~(align - 1)); - } else if (uf_info->bd_mem_part == MEM_PART_MURAM) { - ugeth->tx_bd_ring_offset[j] = - qe_muram_alloc(length, - UCC_GETH_TX_BD_RING_ALIGNMENT); - if (!IS_ERR_VALUE(ugeth->tx_bd_ring_offset[j])) - ugeth->p_tx_bd_ring[j] = - (u8 __iomem *) qe_muram_addr(ugeth-> - tx_bd_ring_offset[j]); - } + + ugeth->tx_bd_ring_offset[j] = + (u32) kmalloc((u32) (length + align), GFP_KERNEL); + + if (ugeth->tx_bd_ring_offset[j] != 0) + ugeth->p_tx_bd_ring[j] = + (u8 __iomem *)((ugeth->tx_bd_ring_offset[j] + + align) & ~(align - 1)); + if (!ugeth->p_tx_bd_ring[j]) { if (netif_msg_ifup(ugeth)) pr_err("Can not allocate memory for Tx bd rings\n"); @@ -2271,25 +2245,16 @@ static int ucc_geth_alloc_rx(struct ucc_geth_private *ugeth) /* Allocate Rx bds */ for (j = 0; j < ug_info->numQueuesRx; j++) { + u32 align = UCC_GETH_RX_BD_RING_ALIGNMENT; + length = ug_info->bdRingLenRx[j] * sizeof(struct qe_bd); - if (uf_info->bd_mem_part == MEM_PART_SYSTEM) { - u32 align = UCC_GETH_RX_BD_RING_ALIGNMENT; - - ugeth->rx_bd_ring_offset[j] = - (u32) kmalloc((u32) (length + align), GFP_KERNEL); - if (ugeth->rx_bd_ring_offset[j] != 0) - ugeth->p_rx_bd_ring[j] = - (u8 __iomem *)((ugeth->rx_bd_ring_offset[j] + - align) & ~(align - 1)); - } else if (uf_info->bd_mem_part == MEM_PART_MURAM) { - ugeth->rx_bd_ring_offset[j] = - qe_muram_alloc(length, - UCC_GETH_RX_BD_RING_ALIGNMENT); - if (!IS_ERR_VALUE(ugeth->rx_bd_ring_offset[j])) - ugeth->p_rx_bd_ring[j] = - (u8 __iomem *) qe_muram_addr(ugeth-> - rx_bd_ring_offset[j]); - } + ugeth->rx_bd_ring_offset[j] = + (u32) kmalloc((u32) (length + align), GFP_KERNEL); + if (ugeth->rx_bd_ring_offset[j] != 0) + ugeth->p_rx_bd_ring[j] = + (u8 __iomem *)((ugeth->rx_bd_ring_offset[j] + + align) & ~(align - 1)); + if (!ugeth->p_rx_bd_ring[j]) { if (netif_msg_ifup(ugeth)) pr_err("Can not allocate memory for Rx bd rings\n"); @@ -2554,20 +2519,11 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) endOfRing = ugeth->p_tx_bd_ring[i] + (ug_info->bdRingLenTx[i] - 1) * sizeof(struct qe_bd); - if (ugeth->ug_info->uf_info.bd_mem_part == MEM_PART_SYSTEM) { - out_be32(&ugeth->p_send_q_mem_reg->sqqd[i].bd_ring_base, - (u32) virt_to_phys(ugeth->p_tx_bd_ring[i])); - out_be32(&ugeth->p_send_q_mem_reg->sqqd[i]. - last_bd_completed_address, - (u32) virt_to_phys(endOfRing)); - } else if (ugeth->ug_info->uf_info.bd_mem_part == - MEM_PART_MURAM) { - out_be32(&ugeth->p_send_q_mem_reg->sqqd[i].bd_ring_base, - (u32)qe_muram_dma(ugeth->p_tx_bd_ring[i])); - out_be32(&ugeth->p_send_q_mem_reg->sqqd[i]. - last_bd_completed_address, - (u32)qe_muram_dma(endOfRing)); - } + out_be32(&ugeth->p_send_q_mem_reg->sqqd[i].bd_ring_base, + (u32) virt_to_phys(ugeth->p_tx_bd_ring[i])); + out_be32(&ugeth->p_send_q_mem_reg->sqqd[i]. + last_bd_completed_address, + (u32) virt_to_phys(endOfRing)); } /* schedulerbasepointer */ @@ -2786,14 +2742,8 @@ static int ucc_geth_startup(struct ucc_geth_private *ugeth) /* Setup the table */ /* Assume BD rings are already established */ for (i = 0; i < ug_info->numQueuesRx; i++) { - if (ugeth->ug_info->uf_info.bd_mem_part == MEM_PART_SYSTEM) { - out_be32(&ugeth->p_rx_bd_qs_tbl[i].externalbdbaseptr, - (u32) virt_to_phys(ugeth->p_rx_bd_ring[i])); - } else if (ugeth->ug_info->uf_info.bd_mem_part == - MEM_PART_MURAM) { - out_be32(&ugeth->p_rx_bd_qs_tbl[i].externalbdbaseptr, - (u32)qe_muram_dma(ugeth->p_rx_bd_ring[i])); - } + out_be32(&ugeth->p_rx_bd_qs_tbl[i].externalbdbaseptr, + (u32) virt_to_phys(ugeth->p_rx_bd_ring[i])); /* rest of fields handled by QE */ } diff --git a/include/soc/fsl/qe/qe.h b/include/soc/fsl/qe/qe.h index 66f1afc393d1..4925a1b59dc9 100644 --- a/include/soc/fsl/qe/qe.h +++ b/include/soc/fsl/qe/qe.h @@ -27,12 +27,6 @@ #define QE_NUM_OF_BRGS 16 #define QE_NUM_OF_PORTS 1024 -/* Memory partitions -*/ -#define MEM_PART_SYSTEM 0 -#define MEM_PART_SECONDARY 1 -#define MEM_PART_MURAM 2 - /* Clocks and BRGs */ enum qe_clock { QE_CLK_NONE = 0, diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h index dc4e79468094..9696a5b9b5d1 100644 --- a/include/soc/fsl/qe/ucc_fast.h +++ b/include/soc/fsl/qe/ucc_fast.h @@ -146,7 +146,6 @@ struct ucc_fast_info { resource_size_t regs; int irq; u32 uccm_mask; - int bd_mem_part; int brkpt_support; int grant_support; int tsa; -- cgit v1.2.3 From d9c85e24726587277ce0dcf33b5695acfcc72234 Mon Sep 17 00:00:00 2001 From: Max Chen Date: Wed, 6 Jan 2021 15:50:49 -0800 Subject: cfg80211: Add phyrate conversion support for extended MCS in 60GHz band The current phyrate conversion does not include extended MCS and provides incorrect rates. Add a flag for extended MCS in DMG and add corresponding phyrate table for the correct conversions using base MCS in DMG specs. Signed-off-by: Max Chen Link: https://lore.kernel.org/r/1609977050-7089-2-git-send-email-mxchen@codeaurora.org [reduce data size, make a single WARN] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ net/wireless/util.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0d6f7ec86061..798f8eb15e00 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1460,6 +1460,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_DMG: 60GHz MCS * @RATE_INFO_FLAGS_HE_MCS: HE MCS information * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode + * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1468,6 +1469,7 @@ enum rate_info_flags { RATE_INFO_FLAGS_DMG = BIT(3), RATE_INFO_FLAGS_HE_MCS = BIT(4), RATE_INFO_FLAGS_EDMG = BIT(5), + RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), }; /** diff --git a/net/wireless/util.c b/net/wireless/util.c index c22ada0a36fa..eab928002cd8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1188,6 +1188,25 @@ static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) return __mcs2bitrate[rate->mcs]; } +static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) +{ + static const u32 __mcs2bitrate[] = { + [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ + [7 - 6] = 50050, /* MCS 12.1 */ + [8 - 6] = 53900, + [9 - 6] = 57750, + [10 - 6] = 63900, + [11 - 6] = 75075, + [12 - 6] = 80850, + }; + + /* Extended SC MCS not defined for base MCS below 6 or above 12 */ + if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) + return 0; + + return __mcs2bitrate[rate->mcs - 6]; +} + static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { @@ -1406,6 +1425,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); + if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) + return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) -- cgit v1.2.3 From 2fe8ef106238b274c505c480ecf00d8765abf0d8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 Jan 2021 16:19:42 +0100 Subject: cfg80211: change netdev registration/unregistration semantics We used to not require anything in terms of registering netdevs with cfg80211, using a netdev notifier instead. However, in the next patch reducing RTNL locking, this causes big problems, and the simplest way is to just require drivers to do things better. Change the registration/unregistration semantics to require the drivers to call cfg80211_(un)register_netdevice() when this is happening due to a cfg80211 request, i.e. add_virtual_intf() or del_virtual_intf() (or if it somehow has to happen in any other cfg80211 callback). Otherwise, in other contexts, drivers may continue to use the normal netdev (un)registration functions as usual. Internally, we still use the netdev notifier and track (by the new wdev->registered bool) if the wdev had already been added to cfg80211 or not. Link: https://lore.kernel.org/r/20210122161942.cf2f4b65e4e9.Ida8234e50da13eb675b557bac52a713ad4eddf71@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 4 +- drivers/net/wireless/ath/wil6210/netdev.c | 4 +- .../wireless/broadcom/brcm80211/brcmfmac/core.c | 6 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 4 +- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 2 +- drivers/net/wireless/microchip/wilc1000/mon.c | 4 +- drivers/net/wireless/microchip/wilc1000/netdev.c | 2 +- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 4 +- drivers/net/wireless/quantenna/qtnfmac/core.c | 2 +- include/net/cfg80211.h | 40 +++++-- net/mac80211/iface.c | 9 +- net/wireless/core.c | 119 ++++++++++++--------- 12 files changed, 123 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 9c83e9a4299b..29527e8dcced 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -3648,7 +3648,7 @@ void ath6kl_cfg80211_vif_cleanup(struct ath6kl_vif *vif) kfree(mc_filter); } - unregister_netdevice(vif->ndev); + cfg80211_unregister_netdevice(vif->ndev); ar->num_vif--; } @@ -3821,7 +3821,7 @@ struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name, netdev_set_default_ethtool_ops(ndev, &ath6kl_ethtool_ops); - if (register_netdevice(ndev)) + if (cfg80211_register_netdevice(ndev)) goto err; ar->avail_idx_map &= ~BIT(fw_vif_idx); diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c index 07b4a252a23c..472fe804203d 100644 --- a/drivers/net/wireless/ath/wil6210/netdev.c +++ b/drivers/net/wireless/ath/wil6210/netdev.c @@ -424,7 +424,7 @@ int wil_vif_add(struct wil6210_priv *wil, struct wil6210_vif *vif) if (rc) return rc; } - rc = register_netdevice(ndev); + rc = cfg80211_register_netdevice(ndev); if (rc < 0) { dev_err(&ndev->dev, "Failed to register netdev: %d\n", rc); if (any_active && vif->mid != 0) @@ -511,7 +511,7 @@ void wil_vif_remove(struct wil6210_priv *wil, u8 mid) /* during unregister_netdevice cfg80211_leave may perform operations * such as stop AP, disconnect, so we only clear the VIF afterwards */ - unregister_netdevice(ndev); + cfg80211_unregister_netdevice(ndev); if (any_active && vif->mid != 0) wmi_port_delete(wil, vif->mid); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c index 3dd28f5fef19..6cf308d5934c 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c @@ -657,7 +657,7 @@ int brcmf_net_attach(struct brcmf_if *ifp, bool rtnl_locked) INIT_WORK(&ifp->ndoffload_work, _brcmf_update_ndtable); if (rtnl_locked) - err = register_netdevice(ndev); + err = cfg80211_register_netdevice(ndev); else err = register_netdev(ndev); if (err != 0) { @@ -681,7 +681,7 @@ void brcmf_net_detach(struct net_device *ndev, bool rtnl_locked) { if (ndev->reg_state == NETREG_REGISTERED) { if (rtnl_locked) - unregister_netdevice(ndev); + cfg80211_unregister_netdevice(ndev); else unregister_netdev(ndev); } else { @@ -758,7 +758,7 @@ int brcmf_net_mon_attach(struct brcmf_if *ifp) ndev = ifp->ndev; ndev->netdev_ops = &brcmf_netdev_ops_mon; - err = register_netdevice(ndev); + err = cfg80211_register_netdevice(ndev); if (err) bphy_err(drvr, "Failed to register %s device\n", ndev->name); diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index a6b9dc6700b1..15e1cee7f465 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -3081,7 +3081,7 @@ struct wireless_dev *mwifiex_add_virtual_intf(struct wiphy *wiphy, mutex_init(&priv->async_mutex); /* Register network device */ - if (register_netdevice(dev)) { + if (cfg80211_register_netdevice(dev)) { mwifiex_dbg(adapter, ERROR, "cannot register network device\n"); ret = -EFAULT; goto err_reg_netdev; @@ -3160,7 +3160,7 @@ int mwifiex_del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev) netif_carrier_off(priv->netdev); if (wdev->netdev->reg_state == NETREG_REGISTERED) - unregister_netdevice(wdev->netdev); + cfg80211_unregister_netdevice(wdev->netdev); if (priv->dfs_cac_workqueue) { flush_workqueue(priv->dfs_cac_workqueue); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index e3dd205cbbe5..96973ec7bd9a 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -1538,7 +1538,7 @@ static int del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev) wilc_wfi_deinit_mon_interface(wl, true); vif = netdev_priv(wdev->netdev); cfg80211_stop_iface(wiphy, wdev, GFP_KERNEL); - unregister_netdevice(vif->ndev); + cfg80211_unregister_netdevice(vif->ndev); vif->monitor_flag = 0; wilc_set_operation_mode(vif, 0, 0, 0); diff --git a/drivers/net/wireless/microchip/wilc1000/mon.c b/drivers/net/wireless/microchip/wilc1000/mon.c index b5a1b65c087c..6bd63934c2d8 100644 --- a/drivers/net/wireless/microchip/wilc1000/mon.c +++ b/drivers/net/wireless/microchip/wilc1000/mon.c @@ -233,7 +233,7 @@ struct net_device *wilc_wfi_init_mon_interface(struct wilc *wl, wl->monitor_dev->netdev_ops = &wilc_wfi_netdev_ops; wl->monitor_dev->needs_free_netdev = true; - if (register_netdevice(wl->monitor_dev)) { + if (cfg80211_register_netdevice(wl->monitor_dev)) { netdev_err(real_dev, "register_netdevice failed\n"); free_netdev(wl->monitor_dev); return NULL; @@ -251,7 +251,7 @@ void wilc_wfi_deinit_mon_interface(struct wilc *wl, bool rtnl_locked) return; if (rtnl_locked) - unregister_netdevice(wl->monitor_dev); + cfg80211_unregister_netdevice(wl->monitor_dev); else unregister_netdev(wl->monitor_dev); wl->monitor_dev = NULL; diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.c b/drivers/net/wireless/microchip/wilc1000/netdev.c index 2a1fbbdd6a4b..643cbb155439 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.c +++ b/drivers/net/wireless/microchip/wilc1000/netdev.c @@ -950,7 +950,7 @@ struct wilc_vif *wilc_netdev_ifc_init(struct wilc *wl, const char *name, vif->priv.dev = ndev; if (rtnl_locked) - ret = register_netdevice(ndev); + ret = cfg80211_register_netdevice(ndev); else ret = register_netdev(ndev); diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 54cdf3ad09d7..504b4d0b98c4 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -180,7 +180,7 @@ int qtnf_del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev) cancel_work_sync(&vif->high_pri_tx_work); if (netdev->reg_state == NETREG_REGISTERED) - unregister_netdevice(netdev); + cfg80211_unregister_netdevice(netdev); if (qtnf_cmd_send_del_intf(vif)) pr_err("VIF%u.%u: failed to delete VIF\n", vif->mac->macid, @@ -267,7 +267,7 @@ static struct wireless_dev *qtnf_add_virtual_intf(struct wiphy *wiphy, if (qtnf_hwcap_is_set(&mac->bus->hw_info, QLINK_HW_CAPAB_HW_BRIDGE)) { ret = qtnf_cmd_netdev_changeupper(vif, vif->netdev->ifindex); if (ret) { - unregister_netdevice(vif->netdev); + cfg80211_unregister_netdevice(vif->netdev); vif->netdev = NULL; goto error_del_vif; } diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c index ad726bd100ec..18964e2a9f28 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.c +++ b/drivers/net/wireless/quantenna/qtnfmac/core.c @@ -492,7 +492,7 @@ int qtnf_core_net_attach(struct qtnf_wmac *mac, struct qtnf_vif *vif, SET_NETDEV_DEV(dev, wiphy_dev(wiphy)); - ret = register_netdevice(dev); + ret = cfg80211_register_netdevice(dev); if (ret) { free_netdev(dev); vif->netdev = NULL; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 798f8eb15e00..e7703fdbac8d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5228,6 +5228,7 @@ struct cfg80211_cqm_config; * * @wiphy: pointer to hardware description * @iftype: interface type + * @registered: is this wdev already registered with cfg80211 * @list: (private) Used to collect the interfaces * @netdev: (private) Used to reference back to the netdev, may be %NULL * @identifier: (private) Identifier used in nl80211 to identify this @@ -5311,7 +5312,7 @@ struct wireless_dev { struct mutex mtx; - bool use_4addr, is_running; + bool use_4addr, is_running, registered; u8 address[ETH_ALEN] __aligned(sizeof(u16)); @@ -7654,18 +7655,41 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate); * cfg80211_unregister_wdev - remove the given wdev * @wdev: struct wireless_dev to remove * - * Call this function only for wdevs that have no netdev assigned, - * e.g. P2P Devices. It removes the device from the list so that - * it can no longer be used. It is necessary to call this function - * even when cfg80211 requests the removal of the interface by - * calling the del_virtual_intf() callback. The function must also - * be called when the driver wishes to unregister the wdev, e.g. - * when the device is unbound from the driver. + * This function removes the device so it can no longer be used. It is necessary + * to call this function even when cfg80211 requests the removal of the device + * by calling the del_virtual_intf() callback. The function must also be called + * when the driver wishes to unregister the wdev, e.g. when the hardware device + * is unbound from the driver. * * Requires the RTNL to be held. */ void cfg80211_unregister_wdev(struct wireless_dev *wdev); +/** + * cfg80211_register_netdevice - register the given netdev + * @dev: the netdev to register + * + * Note: In contexts coming from cfg80211 callbacks, you must call this rather + * than register_netdevice(), unregister_netdev() is impossible as the RTNL is + * held. Otherwise, both register_netdevice() and register_netdev() are usable + * instead as well. + */ +int cfg80211_register_netdevice(struct net_device *dev); + +/** + * cfg80211_unregister_netdevice - unregister the given netdev + * @dev: the netdev to register + * + * Note: In contexts coming from cfg80211 callbacks, you must call this rather + * than unregister_netdevice(), unregister_netdev() is impossible as the RTNL + * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are + * usable instead as well. + */ +static inline void cfg80211_unregister_netdevice(struct net_device *dev) +{ + cfg80211_unregister_wdev(dev->ieee80211_ptr); +} + /** * struct cfg80211_ft_event_params - FT Information Elements * @ies: FT IEs diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1b44690823aa..fcaf4d20cabf 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1976,7 +1976,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ndev->min_mtu = 256; ndev->max_mtu = local->hw.max_mtu; - ret = register_netdevice(ndev); + ret = cfg80211_register_netdevice(ndev); if (ret) { free_netdev(ndev); return ret; @@ -2006,10 +2006,9 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) synchronize_rcu(); - if (sdata->dev) { - unregister_netdevice(sdata->dev); - } else { - cfg80211_unregister_wdev(&sdata->wdev); + cfg80211_unregister_wdev(&sdata->wdev); + + if (!sdata->dev) { ieee80211_teardown_sdata(sdata); kfree(sdata); } diff --git a/net/wireless/core.c b/net/wireless/core.c index 4b1f35e976e7..9e7d1f9620bd 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1094,7 +1094,8 @@ void cfg80211_cqm_config_free(struct wireless_dev *wdev) wdev->cqm_config = NULL; } -static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) +static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, + bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1104,9 +1105,16 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); + wdev->registered = false; + + if (wdev->netdev) { + sysfs_remove_link(&wdev->netdev->dev.kobj, "phy80211"); + if (unregister_netdev) + unregister_netdevice(wdev->netdev); + } + list_del_rcu(&wdev->list); - if (sync) - synchronize_rcu(); + synchronize_net(); rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); @@ -1131,14 +1139,23 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) flush_work(&wdev->disconnect_wk); cfg80211_cqm_config_free(wdev); + + /* + * Ensure that all events have been processed and + * freed. + */ + cfg80211_process_wdev_events(wdev); + + if (WARN_ON(wdev->current_bss)) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); + wdev->current_bss = NULL; + } } void cfg80211_unregister_wdev(struct wireless_dev *wdev) { - if (WARN_ON(wdev->netdev)) - return; - - __cfg80211_unregister_wdev(wdev, true); + _cfg80211_unregister_wdev(wdev, true); } EXPORT_SYMBOL(cfg80211_unregister_wdev); @@ -1290,10 +1307,49 @@ void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, wdev->identifier = ++rdev->wdev_id; list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; + wdev->registered = true; nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } +int cfg80211_register_netdevice(struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev; + int ret; + + ASSERT_RTNL(); + + if (WARN_ON(!wdev)) + return -EINVAL; + + rdev = wiphy_to_rdev(wdev->wiphy); + + lockdep_assert_held(&rdev->wiphy.mtx); + + /* we'll take care of this */ + wdev->registered = true; + ret = register_netdevice(dev); + if (ret) + goto out; + + if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, + "phy80211")) { + pr_err("failed to add phy80211 symlink to netdev!\n"); + unregister_netdevice(dev); + ret = -EINVAL; + goto out; + } + + cfg80211_register_wdev(rdev, wdev); + ret = 0; +out: + if (ret) + wdev->registered = false; + return ret; +} +EXPORT_SYMBOL(cfg80211_register_netdevice); + static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { @@ -1319,17 +1375,16 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: + if (!wdev->registered) + cfg80211_register_wdev(rdev, wdev); + break; + case NETDEV_UNREGISTER: /* - * NB: cannot take rdev->mtx here because this may be - * called within code protected by it when interfaces - * are added with nl80211. + * It is possible to get NETDEV_UNREGISTER multiple times, + * so check wdev->registered. */ - if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, - "phy80211")) { - pr_err("failed to add phy80211 symlink to netdev!\n"); - } - - cfg80211_register_wdev(rdev, wdev); + if (wdev->registered) + _cfg80211_unregister_wdev(wdev, false); break; case NETDEV_GOING_DOWN: cfg80211_leave(rdev, wdev); @@ -1401,38 +1456,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->ps = false; } break; - case NETDEV_UNREGISTER: - /* - * It is possible to get NETDEV_UNREGISTER - * multiple times. To detect that, check - * that the interface is still on the list - * of registered interfaces, and only then - * remove and clean it up. - */ - if (!list_empty(&wdev->list)) { - __cfg80211_unregister_wdev(wdev, false); - sysfs_remove_link(&dev->dev.kobj, "phy80211"); - } - /* - * synchronise (so that we won't find this netdev - * from other code any more) and then clear the list - * head so that the above code can safely check for - * !list_empty() to avoid double-cleanup. - */ - synchronize_rcu(); - INIT_LIST_HEAD(&wdev->list); - /* - * Ensure that all events have been processed and - * freed. - */ - cfg80211_process_wdev_events(wdev); - - if (WARN_ON(wdev->current_bss)) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; - } - break; case NETDEV_PRE_UP: if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, wdev->use_4addr, 0)) -- cgit v1.2.3 From b8288837ef6bdaac331752b401f5ca3b59b37430 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:13 -0800 Subject: devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/net/devlink.h | 16 ++++++++++++++++ include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index f466819cc477..dc3bf8000082 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -93,6 +93,18 @@ struct devlink_port_pci_vf_attrs { u8 external:1; }; +/** + * struct devlink_port_pci_sf_attrs - devlink port's PCI SF attributes + * @controller: Associated controller number + * @sf: Associated PCI SF for of the PCI PF for this port. + * @pf: Associated PCI PF number for this port. + */ +struct devlink_port_pci_sf_attrs { + u32 controller; + u32 sf; + u16 pf; +}; + /** * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port @@ -103,6 +115,7 @@ struct devlink_port_pci_vf_attrs { * @phys: physical port attributes * @pci_pf: PCI PF port attributes * @pci_vf: PCI VF port attributes + * @pci_sf: PCI SF port attributes */ struct devlink_port_attrs { u8 split:1, @@ -114,6 +127,7 @@ struct devlink_port_attrs { struct devlink_port_phys_attrs phys; struct devlink_port_pci_pf_attrs pci_pf; struct devlink_port_pci_vf_attrs pci_vf; + struct devlink_port_pci_sf_attrs pci_sf; }; }; @@ -1404,6 +1418,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); +void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, + u32 controller, u16 pf, u32 sf); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cf89c318f2ac..1a241b09a7f8 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -200,6 +200,10 @@ enum devlink_port_flavour { DEVLINK_PORT_FLAVOUR_UNUSED, /* Port which exists in the switch, but * is not used in any way. */ + DEVLINK_PORT_FLAVOUR_PCI_SF, /* Represents eswitch port + * for the PCI SF. It is an internal + * port that faces the PCI SF. + */ }; enum devlink_param_cmode { @@ -529,6 +533,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION_INFO, /* nested */ DEVLINK_ATTR_RELOAD_ACTION_STATS, /* nested */ + DEVLINK_ATTR_PORT_PCI_SF_NUMBER, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index c39496311b71..4cbc02fb602d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -690,6 +690,15 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_sf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_sf.pf) || + nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, + attrs->pci_sf.sf)) + return -EMSGSIZE; + break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: @@ -8374,6 +8383,32 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); +/** + * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes + * + * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance + * @pf: associated PF for the devlink port instance + * @sf: associated SF of a PF for the devlink port instance + */ +void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, u32 sf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + if (WARN_ON(devlink_port->registered)) + return; + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_SF); + if (ret) + return; + attrs->pci_sf.controller = controller; + attrs->pci_sf.pf = pf; + attrs->pci_sf.sf = sf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -8422,6 +8457,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, + attrs->pci_sf.sf); + break; } if (n >= len) -- cgit v1.2.3 From cd76dcd68d96aa5bbc63b7ef25a87a1dbea3d73c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:14 -0800 Subject: devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/net/devlink.h | 52 ++++++++++++++++++++++ net/core/devlink.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index dc3bf8000082..d8edd9a10907 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -152,6 +152,17 @@ struct devlink_port { struct mutex reporters_lock; /* Protects reporter_list */ }; +struct devlink_port_new_attrs { + enum devlink_port_flavour flavour; + unsigned int port_index; + u32 controller; + u32 sfnum; + u16 pfnum; + u8 port_index_valid:1, + controller_valid:1, + sfnum_valid:1; +}; + struct devlink_sb_pool_info { enum devlink_sb_pool_type pool_type; u32 size; @@ -1362,6 +1373,47 @@ struct devlink_ops { int (*port_function_hw_addr_set)(struct devlink *devlink, struct devlink_port *port, const u8 *hw_addr, int hw_addr_len, struct netlink_ext_ack *extack); + /** + * port_new() - Add a new port function of a specified flavor + * @devlink: Devlink instance + * @attrs: attributes of the new port + * @extack: extack for reporting error messages + * @new_port_index: index of the new port + * + * Devlink core will call this device driver function upon user request + * to create a new port function of a specified flavor and optional + * attributes + * + * Notes: + * - Called without devlink instance lock being held. Drivers must + * implement own means of synchronization + * - On success, drivers must register a port with devlink core + * + * Return: 0 on success, negative value otherwise. + */ + int (*port_new)(struct devlink *devlink, + const struct devlink_port_new_attrs *attrs, + struct netlink_ext_ack *extack, + unsigned int *new_port_index); + /** + * port_del() - Delete a port function + * @devlink: Devlink instance + * @port_index: port function index to delete + * @extack: extack for reporting error messages + * + * Devlink core will call this device driver function upon user request + * to delete a previously created port function + * + * Notes: + * - Called without devlink instance lock being held. Drivers must + * implement own means of synchronization + * - On success, drivers must unregister the corresponding devlink + * port + * + * Return: 0 on success, negative value otherwise. + */ + int (*port_del)(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/net/core/devlink.c b/net/core/devlink.c index 4cbc02fb602d..541b5f549274 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1147,6 +1147,111 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, return devlink_port_unsplit(devlink, port_index, info->extack); } +static int devlink_port_new_notifiy(struct devlink *devlink, + unsigned int port_index, + struct genl_info *info) +{ + struct devlink_port *devlink_port; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&devlink->lock); + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) { + err = -ENODEV; + goto out; + } + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, info->snd_portid, + info->snd_seq, 0, NULL); + if (err) + goto out; + + err = genlmsg_reply(msg, info); + mutex_unlock(&devlink->lock); + return err; + +out: + mutex_unlock(&devlink->lock); + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink_port_new_attrs new_attrs = {}; + struct devlink *devlink = info->user_ptr[0]; + unsigned int new_port_index; + int err; + + if (!devlink->ops->port_new || !devlink->ops->port_del) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || + !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { + NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified"); + return -EINVAL; + } + new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); + new_attrs.pfnum = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + /* Port index of the new port being created by driver. */ + new_attrs.port_index = + nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + new_attrs.port_index_valid = true; + } + if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { + new_attrs.controller = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); + new_attrs.controller_valid = true; + } + if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && + info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { + new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); + new_attrs.sfnum_valid = true; + } + + err = devlink->ops->port_new(devlink, &new_attrs, extack, + &new_port_index); + if (err) + return err; + + err = devlink_port_new_notifiy(devlink, new_port_index, info); + if (err && err != -ENODEV) { + /* Fail to send the response; destroy newly created port. */ + devlink->ops->port_del(devlink, new_port_index, extack); + } + return err; +} + +static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + unsigned int port_index; + + if (!devlink->ops->port_del) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); + return -EINVAL; + } + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + return devlink->ops->port_del(devlink, port_index, extack); +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -7605,6 +7710,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_ACTION_MAX), [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), + [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7644,6 +7753,18 @@ static const struct genl_small_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_PORT_NEW, + .doit = devlink_nl_cmd_port_new_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + }, + { + .cmd = DEVLINK_CMD_PORT_DEL, + .doit = devlink_nl_cmd_port_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -- cgit v1.2.3 From a556dded9c23c51c82654f1ebe389cbc0bc22057 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:15 -0800 Subject: devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/net/devlink.h | 32 ++++++++++++++++ include/uapi/linux/devlink.h | 20 ++++++++++ net/core/devlink.c | 90 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index d8edd9a10907..691ee76ca548 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1414,6 +1414,38 @@ struct devlink_ops { */ int (*port_del)(struct devlink *devlink, unsigned int port_index, struct netlink_ext_ack *extack); + /** + * port_fn_state_get() - Get the state of a port function + * @devlink: Devlink instance + * @port: The devlink port + * @state: Admin configured state + * @opstate: Current operational state + * @extack: extack for reporting error messages + * + * Reports the admin and operational state of a devlink port function + * + * Return: 0 on success, negative value otherwise. + */ + int (*port_fn_state_get)(struct devlink *devlink, + struct devlink_port *port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack); + /** + * port_fn_state_set() - Set the admin state of a port function + * @devlink: Devlink instance + * @port: The devlink port + * @state: Admin state + * @extack: extack for reporting error messages + * + * Set the admin state of a devlink port function + * + * Return: 0 on success, negative value otherwise. + */ + int (*port_fn_state_set)(struct devlink *devlink, + struct devlink_port *port, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1a241b09a7f8..f6008b2fa60f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -583,9 +583,29 @@ enum devlink_resource_unit { enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ + DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 }; +enum devlink_port_fn_state { + DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE, +}; + +/** + * enum devlink_port_fn_opstate - indicates operational state of the function + * @DEVLINK_PORT_FN_OPSTATE_ATTACHED: Driver is attached to the function. + * For graceful tear down of the function, after inactivation of the + * function, user should wait for operational state to turn DETACHED. + * @DEVLINK_PORT_FN_OPSTATE_DETACHED: Driver is detached from the function. + * It is safe to delete the port. + */ +enum devlink_port_fn_opstate { + DEVLINK_PORT_FN_OPSTATE_DETACHED, + DEVLINK_PORT_FN_OPSTATE_ATTACHED, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 541b5f549274..9f1be69bd5f8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -87,6 +87,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, + [DEVLINK_PORT_FN_ATTR_STATE] = + NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE), }; static LIST_HEAD(devlink_list); @@ -746,6 +749,58 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * return 0; } +static bool +devlink_port_fn_state_valid(enum devlink_port_fn_state state) +{ + return state == DEVLINK_PORT_FN_STATE_INACTIVE || + state == DEVLINK_PORT_FN_STATE_ACTIVE; +} + +static bool +devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) +{ + return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || + opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; +} + +static int +devlink_port_fn_state_fill(struct devlink *devlink, + const struct devlink_ops *ops, + struct devlink_port *port, struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + enum devlink_port_fn_opstate opstate; + enum devlink_port_fn_state state; + int err; + + if (!ops->port_fn_state_get) + return 0; + + err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + if (!devlink_port_fn_state_valid(state)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver"); + return -EINVAL; + } + if (!devlink_port_fn_opstate_valid(opstate)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG_MOD(extack, + "Invalid operational state read from driver"); + return -EINVAL; + } + if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || + nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) + return -EMSGSIZE; + *msg_updated = true; + return 0; +} + static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) @@ -763,6 +818,11 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por ops = devlink->ops; err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg, extack, &msg_updated); + if (err) + goto out; + err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack, + &msg_updated); +out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else @@ -1028,6 +1088,24 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); } +static int devlink_port_fn_state_set(struct devlink *devlink, + struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + enum devlink_port_fn_state state; + const struct devlink_ops *ops; + + state = nla_get_u8(attr); + ops = devlink->ops; + if (!ops->port_fn_state_set) { + NL_SET_ERR_MSG_MOD(extack, + "Function does not support state setting"); + return -EOPNOTSUPP; + } + return ops->port_fn_state_set(devlink, port, state, extack); +} + static int devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) @@ -1043,8 +1121,18 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, } attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; - if (attr) + if (attr) { err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when + * multiple port function attributes are set along with state, + * Those can be applied first before activating the state. + */ + attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; + if (attr) + err = devlink_port_fn_state_set(devlink, port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); -- cgit v1.2.3 From f3196bb0f14c0ffb5089c15668bda196c98d3900 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:16 -0800 Subject: net/mlx5: Introduce vhca state event notifier vhca state events indicates change in the state of the vhca that may occur due to a SF allocation, deallocation or enabling/disabling the SF HCA. Introduce vhca state event handler which will be used by SF devlink port manager and SF hardware id allocator in subsequent patches to act on the event. This enables single entity to subscribe, query and rearm the event for a function. Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 9 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 4 + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 + drivers/net/ethernet/mellanox/mlx5/core/eq.c | 3 + drivers/net/ethernet/mellanox/mlx5/core/events.c | 7 + drivers/net/ethernet/mellanox/mlx5/core/main.c | 16 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 + .../mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h | 82 +++++++++ drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h | 45 +++++ .../ethernet/mellanox/mlx5/core/sf/vhca_event.c | 189 +++++++++++++++++++++ .../ethernet/mellanox/mlx5/core/sf/vhca_event.h | 57 +++++++ include/linux/mlx5/driver.h | 4 + 12 files changed, 422 insertions(+) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 6e4d7bb7fea2..d6c48582e7a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -203,3 +203,12 @@ config MLX5_SW_STEERING default y help Build support for software-managed steering in the NIC. + +config MLX5_SF + bool "Mellanox Technologies subfunction device support using auxiliary device" + depends on MLX5_CORE && MLX5_CORE_EN + default n + help + Build support for subfuction device in the NIC. A Mellanox subfunction + device can support RDMA, netdevice and vdpa device. + It is similar to a SRIOV VF but it doesn't require SRIOV support. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 134bd038ae8a..22ef2ebbee96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -86,3 +86,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o steering/dr_ste_v0.o \ steering/dr_cmd.o steering/dr_fw.o \ steering/dr_action.o steering/fs_dr.o +# +# SF device +# +mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 50c7b9ee80c3..47dcc3ac2cf0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -464,6 +464,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_ALLOC_MEMIC: case MLX5_CMD_OP_MODIFY_XRQ: case MLX5_CMD_OP_RELEASE_XRQ_ERROR: + case MLX5_CMD_OP_QUERY_VHCA_STATE: + case MLX5_CMD_OP_MODIFY_VHCA_STATE: *status = MLX5_DRIVER_STATUS_ABORTED; *synd = MLX5_DRIVER_SYND; return -EIO; @@ -657,6 +659,8 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(DESTROY_UMEM); MLX5_COMMAND_STR_CASE(RELEASE_XRQ_ERROR); MLX5_COMMAND_STR_CASE(MODIFY_XRQ); + MLX5_COMMAND_STR_CASE(QUERY_VHCA_STATE); + MLX5_COMMAND_STR_CASE(MODIFY_VHCA_STATE); default: return "unknown command opcode"; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index fc0afa03d407..421febebc658 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -595,6 +595,9 @@ static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4]) async_event_mask |= (1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED); + if (MLX5_CAP_GEN_MAX(dev, vhca_state)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_VHCA_STATE_CHANGE); + mask[0] = async_event_mask; if (MLX5_CAP_GEN(dev, event_cap)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c index 3ce17c3d7a00..5523d218e5fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/events.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -110,6 +110,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_CMD"; case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED: return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED"; + case MLX5_EVENT_TYPE_VHCA_STATE_CHANGE: + return "MLX5_EVENT_TYPE_VHCA_STATE_CHANGE"; case MLX5_EVENT_TYPE_PAGE_REQUEST: return "MLX5_EVENT_TYPE_PAGE_REQUEST"; case MLX5_EVENT_TYPE_PAGE_FAULT: @@ -403,3 +405,8 @@ int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, voi { return atomic_notifier_call_chain(&events->nh, event, data); } + +void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work) +{ + queue_work(dev->priv.events->wq, work); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ca6f2fc39ea0..b16f57befe52 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -73,6 +73,7 @@ #include "ecpf.h" #include "lib/hv_vhca.h" #include "diag/rsc_dump.h" +#include "sf/vhca_event.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@ -567,6 +568,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) if (MLX5_CAP_GEN_MAX(dev, mkey_by_name)) MLX5_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1); + mlx5_vhca_state_cap_handle(dev, set_hca_cap); + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); } @@ -884,6 +887,12 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) goto err_eswitch_cleanup; } + err = mlx5_vhca_event_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init vhca event notifier %d\n", err); + goto err_fpga_cleanup; + } + dev->dm = mlx5_dm_create(dev); if (IS_ERR(dev->dm)) mlx5_core_warn(dev, "Failed to init device memory%d\n", err); @@ -894,6 +903,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) return 0; +err_fpga_cleanup: + mlx5_fpga_cleanup(dev); err_eswitch_cleanup: mlx5_eswitch_cleanup(dev->priv.eswitch); err_sriov_cleanup: @@ -925,6 +936,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); mlx5_dm_cleanup(dev); + mlx5_vhca_event_cleanup(dev); mlx5_fpga_cleanup(dev); mlx5_eswitch_cleanup(dev->priv.eswitch); mlx5_sriov_cleanup(dev); @@ -1129,6 +1141,8 @@ static int mlx5_load(struct mlx5_core_dev *dev) goto err_sriov; } + mlx5_vhca_event_start(dev); + err = mlx5_ec_init(dev); if (err) { mlx5_core_err(dev, "Failed to init embedded CPU\n"); @@ -1146,6 +1160,7 @@ static int mlx5_load(struct mlx5_core_dev *dev) err_sriov: mlx5_ec_cleanup(dev); err_ec: + mlx5_vhca_event_stop(dev); mlx5_cleanup_fs(dev); err_fs: mlx5_accel_tls_cleanup(dev); @@ -1173,6 +1188,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) { mlx5_sriov_detach(dev); mlx5_ec_cleanup(dev); + mlx5_vhca_event_stop(dev); mlx5_cleanup_fs(dev); mlx5_accel_ipsec_cleanup(dev); mlx5_accel_tls_cleanup(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 0a0302ce7144..a33b7496d748 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -259,4 +259,6 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state); void mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup); int mlx5_load_one(struct mlx5_core_dev *dev, bool boot); + +void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work); #endif /* __MLX5_CORE_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h new file mode 100644 index 000000000000..1daf5a122ba3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_IFC_VHCA_EVENT_H__ +#define __MLX5_IFC_VHCA_EVENT_H__ + +enum mlx5_ifc_vhca_state { + MLX5_VHCA_STATE_INVALID = 0x0, + MLX5_VHCA_STATE_ALLOCATED = 0x1, + MLX5_VHCA_STATE_ACTIVE = 0x2, + MLX5_VHCA_STATE_IN_USE = 0x3, + MLX5_VHCA_STATE_TEARDOWN_REQUEST = 0x4, +}; + +struct mlx5_ifc_vhca_state_context_bits { + u8 arm_change_event[0x1]; + u8 reserved_at_1[0xb]; + u8 vhca_state[0x4]; + u8 reserved_at_10[0x10]; + + u8 sw_function_id[0x20]; + + u8 reserved_at_40[0x80]; +}; + +struct mlx5_ifc_query_vhca_state_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_vhca_state_context_bits vhca_state_context; +}; + +struct mlx5_ifc_query_vhca_state_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 embedded_cpu_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_vhca_state_field_select_bits { + u8 reserved_at_0[0x1e]; + u8 sw_function_id[0x1]; + u8 arm_change_event[0x1]; +}; + +struct mlx5_ifc_modify_vhca_state_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_vhca_state_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 embedded_cpu_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + struct mlx5_ifc_vhca_state_field_select_bits vhca_state_field_select; + + struct mlx5_ifc_vhca_state_context_bits vhca_state_context; +}; + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h new file mode 100644 index 000000000000..623191679b49 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_H__ +#define __MLX5_SF_H__ + +#include + +static inline u16 mlx5_sf_start_function_id(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf_base_id); +} + +#ifdef CONFIG_MLX5_SF + +static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf); +} + +static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev) +{ + if (!mlx5_sf_supported(dev)) + return 0; + if (MLX5_CAP_GEN(dev, max_num_sf)) + return MLX5_CAP_GEN(dev, max_num_sf); + else + return 1 << MLX5_CAP_GEN(dev, log_max_sf); +} + +#else + +static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev) +{ + return false; +} + +static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev) +{ + return 0; +} + +#endif + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c new file mode 100644 index 000000000000..af2f2dd9db25 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include "mlx5_ifc_vhca_event.h" +#include "mlx5_core.h" +#include "vhca_event.h" +#include "ecpf.h" + +struct mlx5_vhca_state_notifier { + struct mlx5_core_dev *dev; + struct mlx5_nb nb; + struct blocking_notifier_head n_head; +}; + +struct mlx5_vhca_event_work { + struct work_struct work; + struct mlx5_vhca_state_notifier *notifier; + struct mlx5_vhca_state_event event; +}; + +int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, + bool ecpu, u32 *out, u32 outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {}; + + MLX5_SET(query_vhca_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_STATE); + MLX5_SET(query_vhca_state_in, in, function_id, function_id); + MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, ecpu); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} + +static int mlx5_cmd_modify_vhca_state(struct mlx5_core_dev *dev, u16 function_id, + bool ecpu, u32 *in, u32 inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {}; + + MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE); + MLX5_SET(modify_vhca_state_in, in, function_id, function_id); + MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu); + + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id) +{ + u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {}; + + MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE); + MLX5_SET(modify_vhca_state_in, in, function_id, function_id); + MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu); + MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.sw_function_id, 1); + MLX5_SET(modify_vhca_state_in, in, vhca_state_context.sw_function_id, sw_fn_id); + + return mlx5_cmd_exec_inout(dev, modify_vhca_state, in, out); +} + +int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu) +{ + u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {}; + + MLX5_SET(modify_vhca_state_in, in, vhca_state_context.arm_change_event, 1); + MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.arm_change_event, 1); + + return mlx5_cmd_modify_vhca_state(dev, function_id, ecpu, in, sizeof(in)); +} + +static void +mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event *event) +{ + u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + int err; + + err = mlx5_cmd_query_vhca_state(dev, event->function_id, event->ecpu, out, sizeof(out)); + if (err) + return; + + event->sw_function_id = MLX5_GET(query_vhca_state_out, out, + vhca_state_context.sw_function_id); + event->new_vhca_state = MLX5_GET(query_vhca_state_out, out, + vhca_state_context.vhca_state); + + mlx5_vhca_event_arm(dev, event->function_id, event->ecpu); + + blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event); +} + +static void mlx5_vhca_state_work_handler(struct work_struct *_work) +{ + struct mlx5_vhca_event_work *work = container_of(_work, struct mlx5_vhca_event_work, work); + struct mlx5_vhca_state_notifier *notifier = work->notifier; + struct mlx5_core_dev *dev = notifier->dev; + + mlx5_vhca_event_notify(dev, &work->event); +} + +static int +mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data) +{ + struct mlx5_vhca_state_notifier *notifier = + mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb); + struct mlx5_vhca_event_work *work; + struct mlx5_eqe *eqe = data; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + INIT_WORK(&work->work, &mlx5_vhca_state_work_handler); + work->notifier = notifier; + work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id); + work->event.ecpu = be16_to_cpu(eqe->data.vhca_state.ec_function); + mlx5_events_work_enqueue(notifier->dev, &work->work); + return NOTIFY_OK; +} + +void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap) +{ + if (!mlx5_vhca_event_supported(dev)) + return; + + MLX5_SET(cmd_hca_cap, set_hca_cap, vhca_state, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_allocated, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_active, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_in_use, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_teardown_request, 1); +} + +int mlx5_vhca_event_init(struct mlx5_core_dev *dev) +{ + struct mlx5_vhca_state_notifier *notifier; + + if (!mlx5_vhca_event_supported(dev)) + return 0; + + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); + if (!notifier) + return -ENOMEM; + + dev->priv.vhca_state_notifier = notifier; + notifier->dev = dev; + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->n_head); + MLX5_NB_INIT(¬ifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE); + return 0; +} + +void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_vhca_event_supported(dev)) + return; + + kfree(dev->priv.vhca_state_notifier); + dev->priv.vhca_state_notifier = NULL; +} + +void mlx5_vhca_event_start(struct mlx5_core_dev *dev) +{ + struct mlx5_vhca_state_notifier *notifier; + + if (!dev->priv.vhca_state_notifier) + return; + + notifier = dev->priv.vhca_state_notifier; + mlx5_eq_notifier_register(dev, ¬ifier->nb); +} + +void mlx5_vhca_event_stop(struct mlx5_core_dev *dev) +{ + struct mlx5_vhca_state_notifier *notifier; + + if (!dev->priv.vhca_state_notifier) + return; + + notifier = dev->priv.vhca_state_notifier; + mlx5_eq_notifier_unregister(dev, ¬ifier->nb); +} + +int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + if (!dev->priv.vhca_state_notifier) + return -EOPNOTSUPP; + return blocking_notifier_chain_register(&dev->priv.vhca_state_notifier->n_head, nb); +} + +void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&dev->priv.vhca_state_notifier->n_head, nb); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h new file mode 100644 index 000000000000..1fe1ec6f4d4b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_VHCA_EVENT_H__ +#define __MLX5_VHCA_EVENT_H__ + +#ifdef CONFIG_MLX5_SF + +struct mlx5_vhca_state_event { + u16 function_id; + u16 sw_function_id; + u8 new_vhca_state; + bool ecpu; +}; + +static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN_MAX(dev, vhca_state); +} + +void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap); +int mlx5_vhca_event_init(struct mlx5_core_dev *dev); +void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev); +void mlx5_vhca_event_start(struct mlx5_core_dev *dev); +void mlx5_vhca_event_stop(struct mlx5_core_dev *dev); +int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); +void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb); +int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id); +int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu); +int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, + bool ecpu, u32 *out, u32 outlen); +#else + +static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap) +{ +} + +static inline int mlx5_vhca_event_init(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev) +{ +} + +static inline void mlx5_vhca_event_start(struct mlx5_core_dev *dev) +{ +} + +static inline void mlx5_vhca_event_stop(struct mlx5_core_dev *dev) +{ +} + +#endif + +#endif diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f93bfe7473aa..ffba0786051e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -507,6 +507,7 @@ struct mlx5_devcom; struct mlx5_fw_reset; struct mlx5_eq_table; struct mlx5_irq_table; +struct mlx5_vhca_state_notifier; struct mlx5_rate_limit { u32 rate; @@ -603,6 +604,9 @@ struct mlx5_priv { struct mlx5_bfreg_data bfregs; struct mlx5_uars_page *uar; +#ifdef CONFIG_MLX5_SF + struct mlx5_vhca_state_notifier *vhca_state_notifier; +#endif }; enum mlx5_device_state { -- cgit v1.2.3 From 90d010b8634b89a97ca3b7aa6a88fd566fc77717 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:17 -0800 Subject: net/mlx5: SF, Add auxiliary device support Introduce API to add and delete an auxiliary device for an SF. Each SF has its own dedicated window in the PCI BAR 2. SF device is similar to PCI PF and VF that supports multiple class of devices such as net, rdma and vdpa. SF device will be added or removed in subsequent patch during SF devlink port function state change command. A subfunction device exposes user supplied subfunction number which will be further used by systemd/udev to have deterministic name for its netdevice and rdma device. An mlx5 subfunction auxiliary device example: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set ens2f0npf0sf88 hw_addr 00:00:00:00:88:88 state active On activation, $ ls -l /sys/bus/auxiliary/devices/ mlx5_core.sf.4 -> ../../../devices/pci0000:00/0000:00:03.0/0000:06:00.0/mlx5_core.sf.4 $ cat /sys/bus/auxiliary/devices/mlx5_core.sf.4/sfnum 88 Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- .../device_drivers/ethernet/mellanox/mlx5.rst | 5 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 + .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c | 265 +++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h | 35 +++ include/linux/mlx5/driver.h | 2 + 6 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h (limited to 'include') diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst index e9b65035cd47..a5eb22793bb9 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst @@ -97,6 +97,11 @@ Enabling the driver and kconfig options | Provides low-level InfiniBand/RDMA and `RoCE `_ support. +**CONFIG_MLX5_SF=(y/n)** + +| Build support for subfunction. +| Subfunctons are more light weight than PCI SRIOV VFs. Choosing this option +| will enable support for creating subfunction devices. **External options** ( Choose if the corresponding mlx5 feature is required ) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 22ef2ebbee96..f5b2e9101348 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -89,4 +89,4 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o # # SF device # -mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o +mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index b16f57befe52..26b5502712a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -74,6 +74,7 @@ #include "lib/hv_vhca.h" #include "diag/rsc_dump.h" #include "sf/vhca_event.h" +#include "sf/dev/dev.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@ -1155,6 +1156,8 @@ static int mlx5_load(struct mlx5_core_dev *dev) goto err_sriov; } + mlx5_sf_dev_table_create(dev); + return 0; err_sriov: @@ -1186,6 +1189,7 @@ err_irq_table: static void mlx5_unload(struct mlx5_core_dev *dev) { + mlx5_sf_dev_table_destroy(dev); mlx5_sriov_detach(dev); mlx5_ec_cleanup(dev); mlx5_vhca_event_stop(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c new file mode 100644 index 000000000000..4a8eeb7c853e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include +#include "mlx5_core.h" +#include "dev.h" +#include "sf/vhca_event.h" +#include "sf/sf.h" +#include "sf/mlx5_ifc_vhca_event.h" +#include "ecpf.h" + +struct mlx5_sf_dev_table { + struct xarray devices; + unsigned int max_sfs; + phys_addr_t base_address; + u64 sf_bar_length; + struct notifier_block nb; + struct mlx5_core_dev *dev; +}; + +static bool mlx5_sf_dev_supported(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf) && mlx5_vhca_event_supported(dev); +} + +static ssize_t sfnum_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct auxiliary_device *adev = container_of(dev, struct auxiliary_device, dev); + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", sf_dev->sfnum); +} +static DEVICE_ATTR_RO(sfnum); + +static struct attribute *sf_device_attrs[] = { + &dev_attr_sfnum.attr, + NULL, +}; + +static const struct attribute_group sf_attr_group = { + .attrs = sf_device_attrs, +}; + +static const struct attribute_group *sf_attr_groups[2] = { + &sf_attr_group, + NULL +}; + +static void mlx5_sf_dev_release(struct device *device) +{ + struct auxiliary_device *adev = container_of(device, struct auxiliary_device, dev); + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + + mlx5_adev_idx_free(adev->id); + kfree(sf_dev); +} + +static void mlx5_sf_dev_remove(struct mlx5_sf_dev *sf_dev) +{ + auxiliary_device_delete(&sf_dev->adev); + auxiliary_device_uninit(&sf_dev->adev); +} + +static void mlx5_sf_dev_add(struct mlx5_core_dev *dev, u16 sf_index, u32 sfnum) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + struct mlx5_sf_dev *sf_dev; + struct pci_dev *pdev; + int err; + int id; + + id = mlx5_adev_idx_alloc(); + if (id < 0) { + err = id; + goto add_err; + } + + sf_dev = kzalloc(sizeof(*sf_dev), GFP_KERNEL); + if (!sf_dev) { + mlx5_adev_idx_free(id); + err = -ENOMEM; + goto add_err; + } + pdev = dev->pdev; + sf_dev->adev.id = id; + sf_dev->adev.name = MLX5_SF_DEV_ID_NAME; + sf_dev->adev.dev.release = mlx5_sf_dev_release; + sf_dev->adev.dev.parent = &pdev->dev; + sf_dev->adev.dev.groups = sf_attr_groups; + sf_dev->sfnum = sfnum; + sf_dev->parent_mdev = dev; + + if (!table->max_sfs) { + mlx5_adev_idx_free(id); + kfree(sf_dev); + err = -EOPNOTSUPP; + goto add_err; + } + sf_dev->bar_base_addr = table->base_address + (sf_index * table->sf_bar_length); + + err = auxiliary_device_init(&sf_dev->adev); + if (err) { + mlx5_adev_idx_free(id); + kfree(sf_dev); + goto add_err; + } + + err = auxiliary_device_add(&sf_dev->adev); + if (err) { + put_device(&sf_dev->adev.dev); + goto add_err; + } + + err = xa_insert(&table->devices, sf_index, sf_dev, GFP_KERNEL); + if (err) + goto xa_err; + return; + +xa_err: + mlx5_sf_dev_remove(sf_dev); +add_err: + mlx5_core_err(dev, "SF DEV: fail device add for index=%d sfnum=%d err=%d\n", + sf_index, sfnum, err); +} + +static void mlx5_sf_dev_del(struct mlx5_core_dev *dev, struct mlx5_sf_dev *sf_dev, u16 sf_index) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + + xa_erase(&table->devices, sf_index); + mlx5_sf_dev_remove(sf_dev); +} + +static int +mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_code, void *data) +{ + struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb); + const struct mlx5_vhca_state_event *event = data; + struct mlx5_sf_dev *sf_dev; + u16 sf_index; + + sf_index = event->function_id - MLX5_CAP_GEN(table->dev, sf_base_id); + sf_dev = xa_load(&table->devices, sf_index); + switch (event->new_vhca_state) { + case MLX5_VHCA_STATE_ALLOCATED: + if (sf_dev) + mlx5_sf_dev_del(table->dev, sf_dev, sf_index); + break; + case MLX5_VHCA_STATE_TEARDOWN_REQUEST: + if (sf_dev) + mlx5_sf_dev_del(table->dev, sf_dev, sf_index); + else + mlx5_core_err(table->dev, + "SF DEV: teardown state for invalid dev index=%d fn_id=0x%x\n", + sf_index, event->sw_function_id); + break; + case MLX5_VHCA_STATE_ACTIVE: + if (!sf_dev) + mlx5_sf_dev_add(table->dev, sf_index, event->sw_function_id); + break; + default: + break; + } + return 0; +} + +static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table) +{ + struct mlx5_core_dev *dev = table->dev; + u16 max_functions; + u16 function_id; + int err = 0; + bool ecpu; + int i; + + max_functions = mlx5_sf_max_functions(dev); + function_id = MLX5_CAP_GEN(dev, sf_base_id); + ecpu = mlx5_read_embedded_cpu(dev); + /* Arm the vhca context as the vhca event notifier */ + for (i = 0; i < max_functions; i++) { + err = mlx5_vhca_event_arm(dev, function_id, ecpu); + if (err) + return err; + + function_id++; + } + return 0; +} + +void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_dev_table *table; + unsigned int max_sfs; + int err; + + if (!mlx5_sf_dev_supported(dev) || !mlx5_vhca_event_supported(dev)) + return; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) { + err = -ENOMEM; + goto table_err; + } + + table->nb.notifier_call = mlx5_sf_dev_state_change_handler; + table->dev = dev; + if (MLX5_CAP_GEN(dev, max_num_sf)) + max_sfs = MLX5_CAP_GEN(dev, max_num_sf); + else + max_sfs = 1 << MLX5_CAP_GEN(dev, log_max_sf); + table->sf_bar_length = 1 << (MLX5_CAP_GEN(dev, log_min_sf_size) + 12); + table->base_address = pci_resource_start(dev->pdev, 2); + table->max_sfs = max_sfs; + xa_init(&table->devices); + dev->priv.sf_dev_table = table; + + err = mlx5_vhca_event_notifier_register(dev, &table->nb); + if (err) + goto vhca_err; + err = mlx5_sf_dev_vhca_arm_all(table); + if (err) + goto arm_err; + mlx5_core_dbg(dev, "SF DEV: max sf devices=%d\n", max_sfs); + return; + +arm_err: + mlx5_vhca_event_notifier_unregister(dev, &table->nb); +vhca_err: + table->max_sfs = 0; + kfree(table); + dev->priv.sf_dev_table = NULL; +table_err: + mlx5_core_err(dev, "SF DEV table create err = %d\n", err); +} + +static void mlx5_sf_dev_destroy_all(struct mlx5_sf_dev_table *table) +{ + struct mlx5_sf_dev *sf_dev; + unsigned long index; + + xa_for_each(&table->devices, index, sf_dev) { + xa_erase(&table->devices, index); + mlx5_sf_dev_remove(sf_dev); + } +} + +void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + + if (!table) + return; + + mlx5_vhca_event_notifier_unregister(dev, &table->nb); + + /* Now that event handler is not running, it is safe to destroy + * the sf device without race. + */ + mlx5_sf_dev_destroy_all(table); + + WARN_ON(!xa_empty(&table->devices)); + kfree(table); + dev->priv.sf_dev_table = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h new file mode 100644 index 000000000000..a6fb7289ba2c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_DEV_H__ +#define __MLX5_SF_DEV_H__ + +#ifdef CONFIG_MLX5_SF + +#include + +#define MLX5_SF_DEV_ID_NAME "sf" + +struct mlx5_sf_dev { + struct auxiliary_device adev; + struct mlx5_core_dev *parent_mdev; + phys_addr_t bar_base_addr; + u32 sfnum; +}; + +void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev); +void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev); + +#else + +static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) +{ +} + +static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) +{ +} + +#endif + +#endif diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ffba0786051e..08e5fbe97df0 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -508,6 +508,7 @@ struct mlx5_fw_reset; struct mlx5_eq_table; struct mlx5_irq_table; struct mlx5_vhca_state_notifier; +struct mlx5_sf_dev_table; struct mlx5_rate_limit { u32 rate; @@ -606,6 +607,7 @@ struct mlx5_priv { struct mlx5_uars_page *uar; #ifdef CONFIG_MLX5_SF struct mlx5_vhca_state_notifier *vhca_state_notifier; + struct mlx5_sf_dev_table *sf_dev_table; #endif }; -- cgit v1.2.3 From 1958fc2f0712ae771bfd2351b55e5a5b6b0bcfa4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:18 -0800 Subject: net/mlx5: SF, Add auxiliary device driver Add auxiliary device driver for mlx5 subfunction auxiliary device. A mlx5 subfunction is similar to PCI PF and VF. For a subfunction an auxiliary device is created. As a result, when mlx5 SF auxiliary device binds to the driver, its netdev and rdma device are created, they appear as $ ls -l /sys/bus/auxiliary/devices/ mlx5_core.sf.4 -> ../../../devices/pci0000:00/0000:00:03.0/0000:06:00.0/mlx5_core.sf.4 $ ls -l /sys/class/net/eth1/device /sys/class/net/eth1/device -> ../../../mlx5_core.sf.4 $ cat /sys/bus/auxiliary/devices/mlx5_core.sf.4/sfnum 88 $ devlink dev show pci/0000:06:00.0 auxiliary/mlx5_core.sf.4 $ devlink port show auxiliary/mlx5_core.sf.4/1 auxiliary/mlx5_core.sf.4/1: type eth netdev p0sf88 flavour virtual port 0 splittable false $ rdma link show mlx5_0/1 link mlx5_0/1 state ACTIVE physical_state LINK_UP netdev p0sf88 $ rdma dev show 8: rocep6s0f1: node_type ca fw 16.29.0550 node_guid 248a:0703:00b3:d113 sys_image_guid 248a:0703:00b3:d112 13: mlx5_0: node_type ca fw 16.29.0550 node_guid 0000:00ff:fe00:8888 sys_image_guid 248a:0703:00b3:d112 In future, devlink device instance name will adapt to have sfnum annotation using either an alias or as devlink instance name described in RFC [1]. [1] https://lore.kernel.org/netdev/20200519092258.GF4655@nanopsycho/ Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 12 +++ drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 12 ++- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 10 ++ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 20 ++++ .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c | 10 ++ .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h | 20 ++++ .../ethernet/mellanox/mlx5/core/sf/dev/driver.c | 101 +++++++++++++++++++++ include/linux/mlx5/driver.h | 4 +- 10 files changed, 187 insertions(+), 6 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index f5b2e9101348..43fa5efd403c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -89,4 +89,4 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o # # SF device # -mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o +mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 3261d0dc1104..9afe918c5827 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -7,6 +7,7 @@ #include "fw_reset.h" #include "fs_core.h" #include "eswitch.h" +#include "sf/dev/dev.h" static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@ -127,6 +128,17 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); + bool sf_dev_allocated; + + sf_dev_allocated = mlx5_sf_dev_allocated(dev); + if (sf_dev_allocated) { + /* Reload results in deleting SF device which further results in + * unregistering devlink instance while holding devlink_mutext. + * Hence, do not support reload. + */ + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported when SFs are allocated\n"); + return -EOPNOTSUPP; + } switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 421febebc658..174dfbc996c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -467,7 +467,7 @@ int mlx5_eq_table_init(struct mlx5_core_dev *dev) for (i = 0; i < MLX5_EVENT_TYPE_MAX; i++) ATOMIC_INIT_NOTIFIER_HEAD(&eq_table->nh[i]); - eq_table->irq_table = dev->priv.irq_table; + eq_table->irq_table = mlx5_irq_table_get(dev); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 26b5502712a6..ab68320e5516 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -84,7 +84,6 @@ unsigned int mlx5_core_debug_mask; module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644); MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0"); -#define MLX5_DEFAULT_PROF 2 static unsigned int prof_sel = MLX5_DEFAULT_PROF; module_param_named(prof_sel, prof_sel, uint, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); @@ -1303,7 +1302,7 @@ out: mutex_unlock(&dev->intf_state_mutex); } -static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) { struct mlx5_priv *priv = &dev->priv; int err; @@ -1353,7 +1352,7 @@ err_health_init: return err; } -static void mlx5_mdev_uninit(struct mlx5_core_dev *dev) +void mlx5_mdev_uninit(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; @@ -1696,6 +1695,10 @@ static int __init init(void) if (err) goto err_debug; + err = mlx5_sf_driver_register(); + if (err) + goto err_sf; + #ifdef CONFIG_MLX5_CORE_EN err = mlx5e_init(); if (err) { @@ -1706,6 +1709,8 @@ static int __init init(void) return 0; +err_sf: + pci_unregister_driver(&mlx5_core_driver); err_debug: mlx5_unregister_debugfs(); return err; @@ -1716,6 +1721,7 @@ static void __exit cleanup(void) #ifdef CONFIG_MLX5_CORE_EN mlx5e_cleanup(); #endif + mlx5_sf_driver_unregister(); pci_unregister_driver(&mlx5_core_driver); mlx5_unregister_debugfs(); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index a33b7496d748..3754ef98554f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -117,6 +117,8 @@ enum mlx5_semaphore_space_address { MLX5_SEMAPHORE_SW_RESET = 0x20, }; +#define MLX5_DEFAULT_PROF 2 + int mlx5_query_hca_caps(struct mlx5_core_dev *dev); int mlx5_query_board_id(struct mlx5_core_dev *dev); int mlx5_cmd_init(struct mlx5_core_dev *dev); @@ -176,6 +178,7 @@ struct cpumask * mlx5_irq_get_affinity_mask(struct mlx5_irq_table *irq_table, int vecidx); struct cpu_rmap *mlx5_irq_get_rmap(struct mlx5_irq_table *table); int mlx5_irq_get_num_comp(struct mlx5_irq_table *table); +struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev); int mlx5_events_init(struct mlx5_core_dev *dev); void mlx5_events_cleanup(struct mlx5_core_dev *dev); @@ -257,6 +260,13 @@ enum { u8 mlx5_get_nic_state(struct mlx5_core_dev *dev); void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state); +static inline bool mlx5_core_is_sf(const struct mlx5_core_dev *dev) +{ + return dev->coredev_type == MLX5_COREDEV_SF; +} + +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx); +void mlx5_mdev_uninit(struct mlx5_core_dev *dev); void mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup); int mlx5_load_one(struct mlx5_core_dev *dev, bool boot); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 6fd974920394..a61e09aff152 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -30,6 +30,9 @@ int mlx5_irq_table_init(struct mlx5_core_dev *dev) { struct mlx5_irq_table *irq_table; + if (mlx5_core_is_sf(dev)) + return 0; + irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL); if (!irq_table) return -ENOMEM; @@ -40,6 +43,9 @@ int mlx5_irq_table_init(struct mlx5_core_dev *dev) void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev) { + if (mlx5_core_is_sf(dev)) + return; + kvfree(dev->priv.irq_table); } @@ -268,6 +274,9 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev) int nvec; int err; + if (mlx5_core_is_sf(dev)) + return 0; + nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + MLX5_IRQ_VEC_COMP_BASE; nvec = min_t(int, nvec, num_eqs); @@ -319,6 +328,9 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev) struct mlx5_irq_table *table = dev->priv.irq_table; int i; + if (mlx5_core_is_sf(dev)) + return; + /* free_irq requires that affinity and rmap will be cleared * before calling it. This is why there is asymmetry with set_rmap * which should be called after alloc_irq but before request_irq. @@ -332,3 +344,11 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev) kfree(table->irq); } +struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev) +{ +#ifdef CONFIG_MLX5_SF + if (mlx5_core_is_sf(dev)) + return dev->priv.parent_mdev->priv.irq_table; +#endif + return dev->priv.irq_table; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index 4a8eeb7c853e..b265f27b2166 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -24,6 +24,16 @@ static bool mlx5_sf_dev_supported(const struct mlx5_core_dev *dev) return MLX5_CAP_GEN(dev, sf) && mlx5_vhca_event_supported(dev); } +bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + + if (!mlx5_sf_dev_supported(dev)) + return false; + + return !xa_empty(&table->devices); +} + static ssize_t sfnum_show(struct device *dev, struct device_attribute *attr, char *buf) { struct auxiliary_device *adev = container_of(dev, struct auxiliary_device, dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h index a6fb7289ba2c..4de02902aef1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h @@ -13,6 +13,7 @@ struct mlx5_sf_dev { struct auxiliary_device adev; struct mlx5_core_dev *parent_mdev; + struct mlx5_core_dev *mdev; phys_addr_t bar_base_addr; u32 sfnum; }; @@ -20,6 +21,11 @@ struct mlx5_sf_dev { void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev); void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev); +int mlx5_sf_driver_register(void); +void mlx5_sf_driver_unregister(void); + +bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev); + #else static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) @@ -30,6 +36,20 @@ static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) { } +static inline int mlx5_sf_driver_register(void) +{ + return 0; +} + +static inline void mlx5_sf_driver_unregister(void) +{ +} + +static inline bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev) +{ + return 0; +} + #endif #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c new file mode 100644 index 000000000000..daf63a8115e0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include +#include "mlx5_core.h" +#include "dev.h" +#include "devlink.h" + +static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + struct mlx5_core_dev *mdev; + struct devlink *devlink; + int err; + + devlink = mlx5_devlink_alloc(); + if (!devlink) + return -ENOMEM; + + mdev = devlink_priv(devlink); + mdev->device = &adev->dev; + mdev->pdev = sf_dev->parent_mdev->pdev; + mdev->bar_addr = sf_dev->bar_base_addr; + mdev->iseg_base = sf_dev->bar_base_addr; + mdev->coredev_type = MLX5_COREDEV_SF; + mdev->priv.parent_mdev = sf_dev->parent_mdev; + mdev->priv.adev_idx = adev->id; + sf_dev->mdev = mdev; + + err = mlx5_mdev_init(mdev, MLX5_DEFAULT_PROF); + if (err) { + mlx5_core_warn(mdev, "mlx5_mdev_init on err=%d\n", err); + goto mdev_err; + } + + mdev->iseg = ioremap(mdev->iseg_base, sizeof(*mdev->iseg)); + if (!mdev->iseg) { + mlx5_core_warn(mdev, "remap error\n"); + goto remap_err; + } + + err = mlx5_load_one(mdev, true); + if (err) { + mlx5_core_warn(mdev, "mlx5_load_one err=%d\n", err); + goto load_one_err; + } + return 0; + +load_one_err: + iounmap(mdev->iseg); +remap_err: + mlx5_mdev_uninit(mdev); +mdev_err: + mlx5_devlink_free(devlink); + return err; +} + +static void mlx5_sf_dev_remove(struct auxiliary_device *adev) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + struct devlink *devlink; + + devlink = priv_to_devlink(sf_dev->mdev); + mlx5_unload_one(sf_dev->mdev, true); + iounmap(sf_dev->mdev->iseg); + mlx5_mdev_uninit(sf_dev->mdev); + mlx5_devlink_free(devlink); +} + +static void mlx5_sf_dev_shutdown(struct auxiliary_device *adev) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + + mlx5_unload_one(sf_dev->mdev, false); +} + +static const struct auxiliary_device_id mlx5_sf_dev_id_table[] = { + { .name = MLX5_ADEV_NAME "." MLX5_SF_DEV_ID_NAME, }, + { }, +}; + +MODULE_DEVICE_TABLE(auxiliary, mlx5_sf_dev_id_table); + +static struct auxiliary_driver mlx5_sf_driver = { + .name = MLX5_SF_DEV_ID_NAME, + .probe = mlx5_sf_dev_probe, + .remove = mlx5_sf_dev_remove, + .shutdown = mlx5_sf_dev_shutdown, + .id_table = mlx5_sf_dev_id_table, +}; + +int mlx5_sf_driver_register(void) +{ + return auxiliary_driver_register(&mlx5_sf_driver); +} + +void mlx5_sf_driver_unregister(void) +{ + auxiliary_driver_unregister(&mlx5_sf_driver); +} diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 08e5fbe97df0..48e3638b1185 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -193,7 +193,8 @@ enum port_state_policy { enum mlx5_coredev_type { MLX5_COREDEV_PF, - MLX5_COREDEV_VF + MLX5_COREDEV_VF, + MLX5_COREDEV_SF, }; struct mlx5_field_desc { @@ -608,6 +609,7 @@ struct mlx5_priv { #ifdef CONFIG_MLX5_SF struct mlx5_vhca_state_notifier *vhca_state_notifier; struct mlx5_sf_dev_table *sf_dev_table; + struct mlx5_core_dev *parent_mdev; #endif }; -- cgit v1.2.3 From 8f01054186683fe0986d54d584aa13723d51edce Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:21 -0800 Subject: net/mlx5: SF, Add port add delete functionality To handle SF port management outside of the eswitch as independent software layer, introduce eswitch notifier APIs so that mlx5 upper layer who wish to support sf port management in switchdev mode can perform its task whenever eswitch mode is set to switchdev or before eswitch is disabled. Initialize sf port table on such eswitch event. Add SF port add and delete functionality in switchdev mode. Destroy all SF ports when eswitch is disabled. Expose SF port add and delete to user via devlink commands. $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached or by its unique port index: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00", "state": "inactive", "opstate": "detached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 10 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 5 + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 + drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 5 + drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 25 ++ drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 12 + drivers/net/ethernet/mellanox/mlx5/core/main.c | 18 ++ drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c | 27 ++ .../net/ethernet/mellanox/mlx5/core/sf/devlink.c | 316 +++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/sf/hw_table.c | 125 ++++++++ drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h | 17 ++ drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h | 37 +++ include/linux/mlx5/driver.h | 6 + 13 files changed, 607 insertions(+) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index d6c48582e7a8..ad45d20f9d44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -212,3 +212,13 @@ config MLX5_SF Build support for subfuction device in the NIC. A Mellanox subfunction device can support RDMA, netdevice and vdpa device. It is similar to a SRIOV VF but it doesn't require SRIOV support. + +config MLX5_SF_MANAGER + bool + depends on MLX5_SF && MLX5_ESWITCH + default y + help + Build support for subfuction port in the NIC. A Mellanox subfunction + port is managed through devlink. A subfunction supports RDMA, netdevice + and vdpa device. It is similar to a SRIOV VF but it doesn't require + SRIOV support. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 43fa5efd403c..40ed31574c80 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -90,3 +90,8 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o # SF device # mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o + +# +# SF manager +# +mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 47dcc3ac2cf0..e8cecd50558d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -333,6 +333,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_DEALLOC_MEMIC: case MLX5_CMD_OP_PAGE_FAULT_RESUME: case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: + case MLX5_CMD_OP_DEALLOC_SF: return MLX5_CMD_STAT_OK; case MLX5_CMD_OP_QUERY_HCA_CAP: @@ -466,6 +467,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_RELEASE_XRQ_ERROR: case MLX5_CMD_OP_QUERY_VHCA_STATE: case MLX5_CMD_OP_MODIFY_VHCA_STATE: + case MLX5_CMD_OP_ALLOC_SF: *status = MLX5_DRIVER_STATUS_ABORTED; *synd = MLX5_DRIVER_SYND; return -EIO; @@ -661,6 +663,8 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(MODIFY_XRQ); MLX5_COMMAND_STR_CASE(QUERY_VHCA_STATE); MLX5_COMMAND_STR_CASE(MODIFY_VHCA_STATE); + MLX5_COMMAND_STR_CASE(ALLOC_SF); + MLX5_COMMAND_STR_CASE(DEALLOC_SF); default: return "unknown command opcode"; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 9afe918c5827..d4c0cdf5edd9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -8,6 +8,7 @@ #include "fs_core.h" #include "eswitch.h" #include "sf/dev/dev.h" +#include "sf/sf.h" static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@ -190,6 +191,10 @@ static const struct devlink_ops mlx5_devlink_ops = { .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, .port_function_hw_addr_get = mlx5_devlink_port_function_hw_addr_get, .port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set, +#endif +#ifdef CONFIG_MLX5_SF_MANAGER + .port_new = mlx5_devlink_sf_port_new, + .port_del = mlx5_devlink_sf_port_del, #endif .flash_update = mlx5_devlink_flash_update, .info_get = mlx5_devlink_info_get, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 538d2e44a589..820305b1664e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1599,6 +1599,15 @@ mlx5_eswitch_update_num_of_vfs(struct mlx5_eswitch *esw, int num_vfs) kvfree(out); } +static void mlx5_esw_mode_change_notify(struct mlx5_eswitch *esw, u16 mode) +{ + struct mlx5_esw_event_info info = {}; + + info.new_mode = mode; + + blocking_notifier_call_chain(&esw->n_head, 0, &info); +} + /** * mlx5_eswitch_enable_locked - Enable eswitch * @esw: Pointer to eswitch @@ -1659,6 +1668,8 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs) mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS", esw->esw_funcs.num_vfs, esw->enabled_vports); + mlx5_esw_mode_change_notify(esw, mode); + return 0; abort: @@ -1715,6 +1726,11 @@ void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf) esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS", esw->esw_funcs.num_vfs, esw->enabled_vports); + /* Notify eswitch users that it is exiting from current mode. + * So that it can do necessary cleanup before the eswitch is disabled. + */ + mlx5_esw_mode_change_notify(esw, MLX5_ESWITCH_NONE); + mlx5_eswitch_event_handlers_unregister(esw); if (esw->mode == MLX5_ESWITCH_LEGACY) @@ -1815,6 +1831,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE; dev->priv.eswitch = esw; + BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); return 0; abort: if (esw->work_queue) @@ -2506,4 +2523,12 @@ bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS); } +int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&esw->n_head, nb); +} +void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&esw->n_head, nb); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 54514b04808d..479d2ac2cd85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -278,6 +278,7 @@ struct mlx5_eswitch { struct { u32 large_group_num; } params; + struct blocking_notifier_head n_head; }; void esw_offloads_disable(struct mlx5_eswitch *esw); @@ -733,6 +734,17 @@ int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_p u16 vport_num, u32 sfnum); void mlx5_esw_offloads_sf_vport_disable(struct mlx5_eswitch *esw, u16 vport_num); +/** + * mlx5_esw_event_info - Indicates eswitch mode changed/changing. + * + * @new_mode: New mode of eswitch. + */ +struct mlx5_esw_event_info { + u16 new_mode; +}; + +int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *n); +void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *n); #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ab68320e5516..d6bd09dd7490 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -893,6 +893,18 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) goto err_fpga_cleanup; } + err = mlx5_sf_hw_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF HW table %d\n", err); + goto err_sf_hw_table_cleanup; + } + + err = mlx5_sf_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF table %d\n", err); + goto err_sf_table_cleanup; + } + dev->dm = mlx5_dm_create(dev); if (IS_ERR(dev->dm)) mlx5_core_warn(dev, "Failed to init device memory%d\n", err); @@ -903,6 +915,10 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) return 0; +err_sf_table_cleanup: + mlx5_sf_hw_table_cleanup(dev); +err_sf_hw_table_cleanup: + mlx5_vhca_event_cleanup(dev); err_fpga_cleanup: mlx5_fpga_cleanup(dev); err_eswitch_cleanup: @@ -936,6 +952,8 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); mlx5_dm_cleanup(dev); + mlx5_sf_table_cleanup(dev); + mlx5_sf_hw_table_cleanup(dev); mlx5_vhca_event_cleanup(dev); mlx5_fpga_cleanup(dev); mlx5_eswitch_cleanup(dev->priv.eswitch); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c new file mode 100644 index 000000000000..0bc3075f34fa --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include "priv.h" + +int mlx5_cmd_alloc_sf(struct mlx5_core_dev *dev, u16 function_id) +{ + u32 out[MLX5_ST_SZ_DW(alloc_sf_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_sf_in)] = {}; + + MLX5_SET(alloc_sf_in, in, opcode, MLX5_CMD_OP_ALLOC_SF); + MLX5_SET(alloc_sf_in, in, function_id, function_id); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_sf_out)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_sf_in)] = {}; + + MLX5_SET(dealloc_sf_in, in, opcode, MLX5_CMD_OP_DEALLOC_SF); + MLX5_SET(dealloc_sf_in, in, function_id, function_id); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c new file mode 100644 index 000000000000..7ad0d210ec30 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include "eswitch.h" +#include "priv.h" + +struct mlx5_sf { + struct devlink_port dl_port; + unsigned int port_index; + u16 id; +}; + +struct mlx5_sf_table { + struct mlx5_core_dev *dev; /* To refer from notifier context. */ + struct xarray port_indices; /* port index based lookup. */ + refcount_t refcount; + struct completion disable_complete; + struct notifier_block esw_nb; +}; + +static struct mlx5_sf * +mlx5_sf_lookup_by_index(struct mlx5_sf_table *table, unsigned int port_index) +{ + return xa_load(&table->port_indices, port_index); +} + +static int mlx5_sf_id_insert(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + return xa_insert(&table->port_indices, sf->port_index, sf, GFP_KERNEL); +} + +static void mlx5_sf_id_erase(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + xa_erase(&table->port_indices, sf->port_index); +} + +static struct mlx5_sf * +mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *extack) +{ + unsigned int dl_port_index; + struct mlx5_sf *sf; + u16 hw_fn_id; + int id_err; + int err; + + id_err = mlx5_sf_hw_table_sf_alloc(table->dev, sfnum); + if (id_err < 0) { + err = id_err; + goto id_err; + } + + sf = kzalloc(sizeof(*sf), GFP_KERNEL); + if (!sf) { + err = -ENOMEM; + goto alloc_err; + } + sf->id = id_err; + hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sf->id); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(table->dev, hw_fn_id); + sf->port_index = dl_port_index; + + err = mlx5_sf_id_insert(table, sf); + if (err) + goto insert_err; + + return sf; + +insert_err: + kfree(sf); +alloc_err: + mlx5_sf_hw_table_sf_free(table->dev, id_err); +id_err: + if (err == -EEXIST) + NL_SET_ERR_MSG_MOD(extack, "SF already exist. Choose different sfnum"); + return ERR_PTR(err); +} + +static void mlx5_sf_free(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + mlx5_sf_id_erase(table, sf); + mlx5_sf_hw_table_sf_free(table->dev, sf->id); + kfree(sf); +} + +static struct mlx5_sf_table *mlx5_sf_table_try_get(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table = dev->priv.sf_table; + + if (!table) + return NULL; + + return refcount_inc_not_zero(&table->refcount) ? table : NULL; +} + +static void mlx5_sf_table_put(struct mlx5_sf_table *table) +{ + if (refcount_dec_and_test(&table->refcount)) + complete(&table->disable_complete); +} + +static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, + const struct devlink_port_new_attrs *new_attr, + struct netlink_ext_ack *extack, + unsigned int *new_port_index) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_sf *sf; + u16 hw_fn_id; + int err; + + sf = mlx5_sf_alloc(table, new_attr->sfnum, extack); + if (IS_ERR(sf)) + return PTR_ERR(sf); + + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, sf->id); + err = mlx5_esw_offloads_sf_vport_enable(esw, &sf->dl_port, hw_fn_id, new_attr->sfnum); + if (err) + goto esw_err; + *new_port_index = sf->port_index; + return 0; + +esw_err: + mlx5_sf_free(table, sf); + return err; +} + +static void mlx5_sf_del(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + u16 hw_fn_id; + + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, sf->id); + mlx5_esw_offloads_sf_vport_disable(esw, hw_fn_id); + mlx5_sf_free(table, sf); +} + +static int +mlx5_sf_new_check_attr(struct mlx5_core_dev *dev, const struct devlink_port_new_attrs *new_attr, + struct netlink_ext_ack *extack) +{ + if (new_attr->flavour != DEVLINK_PORT_FLAVOUR_PCI_SF) { + NL_SET_ERR_MSG_MOD(extack, "Driver supports only SF port addition"); + return -EOPNOTSUPP; + } + if (new_attr->port_index_valid) { + NL_SET_ERR_MSG_MOD(extack, + "Driver does not support user defined port index assignment"); + return -EOPNOTSUPP; + } + if (!new_attr->sfnum_valid) { + NL_SET_ERR_MSG_MOD(extack, + "User must provide unique sfnum. Driver does not support auto assignment"); + return -EOPNOTSUPP; + } + if (new_attr->controller_valid && new_attr->controller) { + NL_SET_ERR_MSG_MOD(extack, "External controller is unsupported"); + return -EOPNOTSUPP; + } + if (new_attr->pfnum != PCI_FUNC(dev->pdev->devfn)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid pfnum supplied"); + return -EOPNOTSUPP; + } + return 0; +} + +int mlx5_devlink_sf_port_new(struct devlink *devlink, + const struct devlink_port_new_attrs *new_attr, + struct netlink_ext_ack *extack, + unsigned int *new_port_index) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_sf_table *table; + int err; + + err = mlx5_sf_new_check_attr(dev, new_attr, extack); + if (err) + return err; + + table = mlx5_sf_table_try_get(dev); + if (!table) { + NL_SET_ERR_MSG_MOD(extack, + "Port add is only supported in eswitch switchdev mode or SF ports are disabled."); + return -EOPNOTSUPP; + } + err = mlx5_sf_add(dev, table, new_attr, extack, new_port_index); + mlx5_sf_table_put(table); + return err; +} + +int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_sf_table *table; + struct mlx5_sf *sf; + int err = 0; + + table = mlx5_sf_table_try_get(dev); + if (!table) { + NL_SET_ERR_MSG_MOD(extack, + "Port del is only supported in eswitch switchdev mode or SF ports are disabled."); + return -EOPNOTSUPP; + } + sf = mlx5_sf_lookup_by_index(table, port_index); + if (!sf) { + err = -ENODEV; + goto sf_err; + } + + mlx5_sf_del(dev, table, sf); +sf_err: + mlx5_sf_table_put(table); + return err; +} + +static void mlx5_sf_destroy_all(struct mlx5_sf_table *table) +{ + struct mlx5_core_dev *dev = table->dev; + unsigned long index; + struct mlx5_sf *sf; + + xa_for_each(&table->port_indices, index, sf) + mlx5_sf_del(dev, table, sf); +} + +static void mlx5_sf_table_enable(struct mlx5_sf_table *table) +{ + if (!mlx5_sf_max_functions(table->dev)) + return; + + init_completion(&table->disable_complete); + refcount_set(&table->refcount, 1); +} + +static void mlx5_sf_table_disable(struct mlx5_sf_table *table) +{ + if (!mlx5_sf_max_functions(table->dev)) + return; + + if (!refcount_read(&table->refcount)) + return; + + /* Balances with refcount_set; drop the reference so that new user cmd cannot start. */ + mlx5_sf_table_put(table); + wait_for_completion(&table->disable_complete); + + /* At this point, no new user commands can start. + * It is safe to destroy all user created SFs. + */ + mlx5_sf_destroy_all(table); +} + +static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, esw_nb); + const struct mlx5_esw_event_info *mode = data; + + switch (mode->new_mode) { + case MLX5_ESWITCH_OFFLOADS: + mlx5_sf_table_enable(table); + break; + case MLX5_ESWITCH_NONE: + mlx5_sf_table_disable(table); + break; + default: + break; + }; + + return 0; +} + +static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev) +{ + return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && mlx5_sf_supported(dev); +} + +int mlx5_sf_table_init(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table; + int err; + + if (!mlx5_sf_table_supported(dev)) + return 0; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->dev = dev; + xa_init(&table->port_indices); + dev->priv.sf_table = table; + table->esw_nb.notifier_call = mlx5_sf_esw_event; + err = mlx5_esw_event_notifier_register(dev->priv.eswitch, &table->esw_nb); + if (err) + goto reg_err; + return 0; + +reg_err: + kfree(table); + dev->priv.sf_table = NULL; + return err; +} + +void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table = dev->priv.sf_table; + + if (!table) + return; + + mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb); + WARN_ON(refcount_read(&table->refcount)); + WARN_ON(!xa_empty(&table->port_indices)); + kfree(table); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c new file mode 100644 index 000000000000..c7757f399e8a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ +#include +#include "vhca_event.h" +#include "priv.h" +#include "sf.h" +#include "ecpf.h" + +struct mlx5_sf_hw { + u32 usr_sfnum; + u8 allocated: 1; +}; + +struct mlx5_sf_hw_table { + struct mlx5_core_dev *dev; + struct mlx5_sf_hw *sfs; + int max_local_functions; + u8 ecpu: 1; +}; + +u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id) +{ + return sw_id + mlx5_sf_start_function_id(dev); +} + +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + int sw_id = -ENOSPC; + u16 hw_fn_id; + int err; + int i; + + if (!table->max_local_functions) + return -EOPNOTSUPP; + + /* Check if sf with same sfnum already exists or not. */ + for (i = 0; i < table->max_local_functions; i++) { + if (table->sfs[i].allocated && table->sfs[i].usr_sfnum == usr_sfnum) + return -EEXIST; + } + + /* Find the free entry and allocate the entry from the array */ + for (i = 0; i < table->max_local_functions; i++) { + if (!table->sfs[i].allocated) { + table->sfs[i].usr_sfnum = usr_sfnum; + table->sfs[i].allocated = true; + sw_id = i; + break; + } + } + if (sw_id == -ENOSPC) { + err = -ENOSPC; + goto err; + } + + hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id); + err = mlx5_cmd_alloc_sf(table->dev, hw_fn_id); + if (err) + goto err; + + err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum); + if (err) + goto vhca_err; + + return sw_id; + +vhca_err: + mlx5_cmd_dealloc_sf(table->dev, hw_fn_id); +err: + table->sfs[i].allocated = false; + return err; +} + +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + u16 hw_fn_id; + + hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, id); + mlx5_cmd_dealloc_sf(table->dev, hw_fn_id); + table->sfs[id].allocated = false; +} + +int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_hw_table *table; + struct mlx5_sf_hw *sfs; + int max_functions; + + if (!mlx5_sf_supported(dev)) + return 0; + + max_functions = mlx5_sf_max_functions(dev); + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + sfs = kcalloc(max_functions, sizeof(*sfs), GFP_KERNEL); + if (!sfs) + goto table_err; + + table->dev = dev; + table->sfs = sfs; + table->max_local_functions = max_functions; + table->ecpu = mlx5_read_embedded_cpu(dev); + dev->priv.sf_hw_table = table; + mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions); + return 0; + +table_err: + kfree(table); + return -ENOMEM; +} + +void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + + if (!table) + return; + + kfree(table->sfs); + kfree(table); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h new file mode 100644 index 000000000000..7f3622375a9c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_PRIV_H__ +#define __MLX5_SF_PRIV_H__ + +#include + +int mlx5_cmd_alloc_sf(struct mlx5_core_dev *dev, u16 function_id); +int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id); + +u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id); + +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum); +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h index 623191679b49..31278dc42e72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h @@ -42,4 +42,41 @@ static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev) #endif +#ifdef CONFIG_MLX5_SF_MANAGER + +int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev); +void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev); + +int mlx5_sf_table_init(struct mlx5_core_dev *dev); +void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev); + +int mlx5_devlink_sf_port_new(struct devlink *devlink, + const struct devlink_port_new_attrs *add_attr, + struct netlink_ext_ack *extack, + unsigned int *new_port_index); +int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack); + +#else + +static inline int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) +{ +} + +static inline int mlx5_sf_table_init(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) +{ +} + +#endif + #endif diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 48e3638b1185..7e357c7f0d5e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -510,6 +510,8 @@ struct mlx5_eq_table; struct mlx5_irq_table; struct mlx5_vhca_state_notifier; struct mlx5_sf_dev_table; +struct mlx5_sf_hw_table; +struct mlx5_sf_table; struct mlx5_rate_limit { u32 rate; @@ -611,6 +613,10 @@ struct mlx5_priv { struct mlx5_sf_dev_table *sf_dev_table; struct mlx5_core_dev *parent_mdev; #endif +#ifdef CONFIG_MLX5_SF_MANAGER + struct mlx5_sf_hw_table *sf_hw_table; + struct mlx5_sf_table *sf_table; +#endif }; enum mlx5_device_state { -- cgit v1.2.3 From a05a7280f5453ed24c2001eb66b359776ab18cb5 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Thu, 21 Jan 2021 22:31:13 +0800 Subject: tcp: remove unused ICSK_TIME_EARLY_RETRANS Since the early retransmit has been removed by commit bec41a11dd3d ("tcp: remove early retransmit"), we also remove the unused ICSK_TIME_EARLY_RETRANS macro. Signed-off-by: Pengcheng Yang Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/1611239473-27304-1-git-send-email-yangpc@wangsu.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 111d7771b208..c11f80f328f1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -141,7 +141,6 @@ struct inet_connection_sock { #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ #define ICSK_TIME_DACK 2 /* Delayed ack timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ -#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ @@ -227,8 +226,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, } if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || - what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE || - what == ICSK_TIME_REO_TIMEOUT) { + what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { icsk->icsk_pending = what; icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); -- cgit v1.2.3 From e7ed11ee945438b737e2ae2370e35591e16ec371 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Wed, 20 Jan 2021 12:41:55 -0800 Subject: tcp: add TTL to SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_TTL to SCM_TIMESTAMPING_OPT_STATS that exports the time-to-live or hop limit of the latest incoming packet with SCM_TSTAMP_ACK. The value exported may not be from the packet that acks the sequence when incoming packets are aggregated. Exporting the time-to-live or hop limit value of incoming packets helps to estimate the hop count of the path of the flow that may change over time. Signed-off-by: Yousuk Seung Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Link: https://lore.kernel.org/r/20210120204155.552275-1-ysseung@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- include/linux/tcp.h | 3 ++- include/uapi/linux/tcp.h | 1 + net/core/dev.c | 2 +- net/core/skbuff.c | 6 ++++-- net/ipv4/tcp.c | 18 +++++++++++++++++- net/ipv4/tcp_input.c | 16 ++++++++-------- 7 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 186dad231e30..9313b5aaf45b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3859,7 +3859,7 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); -void __skb_tstamp_tx(struct sk_buff *orig_skb, +void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2f87377e9af7..48d8a363319e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -496,7 +496,8 @@ static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, - const struct sk_buff *orig_skb); + const struct sk_buff *orig_skb, + const struct sk_buff *ack_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 768e93bd5b51..16dfa40bdac3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -314,6 +314,7 @@ enum { TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ + TCP_NLA_TTL, /* TTL or hop limit of a packet received */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/core/dev.c b/net/core/dev.c index d9ce02e95992..6df3f1bcdc68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4084,7 +4084,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) - __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 145503d3f06b..2af12f7e170c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4721,6 +4721,7 @@ err: EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); void __skb_tstamp_tx(struct sk_buff *orig_skb, + const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { @@ -4743,7 +4744,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk, orig_skb); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb, + ack_skb); opt_stats = true; } else #endif @@ -4772,7 +4774,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { - return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND); } EXPORT_SYMBOL_GPL(skb_tstamp_tx); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 856ae516ac18..a1a17b64f8cd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3767,11 +3767,24 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ 0; } +/* Returns TTL or hop limit of an incoming packet from skb. */ +static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return ip_hdr(skb)->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ipv6_hdr(skb)->hop_limit; + else + return 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, - const struct sk_buff *orig_skb) + const struct sk_buff *orig_skb, + const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3827,6 +3840,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); + if (ack_skb) + nla_put_u8(stats, TCP_NLA_TTL, + tcp_skb_ttl_or_hop_limit(ack_skb)); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a7dfca0a38cd..d4f66aba9fd8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3145,7 +3145,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) } static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, - u32 prior_snd_una) + const struct sk_buff *ack_skb, u32 prior_snd_una) { const struct skb_shared_info *shinfo; @@ -3157,7 +3157,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) { tcp_skb_tsorted_save(skb) { - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK); } tcp_skb_tsorted_restore(skb); } } @@ -3166,8 +3166,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, - u32 prior_snd_una, +static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, + u32 prior_fack, u32 prior_snd_una, struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3256,7 +3256,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; - tcp_ack_tstamp(sk, skb, prior_snd_una); + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) @@ -3274,7 +3274,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->snd_up = tp->snd_una; if (skb) { - tcp_ack_tstamp(sk, skb, prior_snd_una); + tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) flag |= FLAG_SACK_RENEGING; } @@ -3809,8 +3809,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, - flag & FLAG_ECE); + flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una, + &sack_state, flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); -- cgit v1.2.3 From 7eeba1706eba6def15f6cb2fc7b3c3b9a2651edc Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Wed, 20 Jan 2021 16:41:48 -0800 Subject: tcp: Add receive timestamp support for receive zerocopy. tcp_recvmsg() uses the CMSG mechanism to receive control information like packet receive timestamps. This patch adds CMSG fields to struct tcp_zerocopy_receive, and provides receive timestamps if available to the user. Signed-off-by: Arjun Roy Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 4 ++ net/ipv4/tcp.c | 116 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 88 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 16dfa40bdac3..42fc5a640df4 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -354,5 +354,9 @@ struct tcp_zerocopy_receive { __u64 copybuf_address; /* in: copybuf address (small reads) */ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ __u32 flags; /* in: flags */ + __u64 msg_control; /* ancillary data */ + __u64 msg_controllen; + __u32 msg_flags; + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ }; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7ba0bd0675cb..e1a17c6b473c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1745,6 +1745,20 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec64(skb->tstamp); + else + tss->ts[0] = (struct timespec64) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec64) {0}; +} + #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; @@ -1848,13 +1862,13 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, - struct tcp_zerocopy_receive *zc, int inq) + struct tcp_zerocopy_receive *zc, int inq, + struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; - struct scm_timestamping_internal tss_unused; - int err, cmsg_flags_unused; struct msghdr msg = {}; struct iovec iov; + int err; zc->length = 0; zc->recv_skip_hint = 0; @@ -1868,7 +1882,7 @@ static int receive_fallback_to_copy(struct sock *sk, return err; err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, - &tss_unused, &cmsg_flags_unused); + tss, &zc->msg_flags); if (err < 0) return err; @@ -1909,21 +1923,27 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, return (__s32)copylen; } -static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc, - struct sock *sk, - struct sk_buff *skb, - u32 *seq, - s32 copybuf_len) +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, + struct sock *sk, + struct sk_buff *skb, + u32 *seq, + s32 copybuf_len, + struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ - if (skb) + if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; - else + } else { skb = tcp_recv_skb(sk, *seq, &offset); + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } + } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); @@ -2010,9 +2030,37 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, err); } +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); +static void tcp_zc_finalize_rx_tstamp(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) +{ + unsigned long msg_control_addr; + struct msghdr cmsg_dummy; + + msg_control_addr = (unsigned long)zc->msg_control; + cmsg_dummy.msg_control = (void *)msg_control_addr; + cmsg_dummy.msg_controllen = + (__kernel_size_t)zc->msg_controllen; + cmsg_dummy.msg_flags = in_compat_syscall() + ? MSG_CMSG_COMPAT : 0; + zc->msg_flags = 0; + if (zc->msg_control == msg_control_addr && + zc->msg_controllen == cmsg_dummy.msg_controllen) { + tcp_recv_timestamp(&cmsg_dummy, sk, tss); + zc->msg_control = (__u64) + ((uintptr_t)cmsg_dummy.msg_control); + zc->msg_controllen = + (__u64)cmsg_dummy.msg_controllen; + zc->msg_flags = (__u32)cmsg_dummy.msg_flags; + } +} + #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, - struct tcp_zerocopy_receive *zc) + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; @@ -2029,6 +2077,7 @@ static int tcp_zerocopy_receive(struct sock *sk, int ret; zc->copybuf_len = 0; + zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; @@ -2039,7 +2088,7 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) - return receive_fallback_to_copy(sk, zc, inq); + return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; @@ -2084,6 +2133,11 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) @@ -2126,8 +2180,7 @@ out: mmap_read_unlock(current->mm); /* Try to copy straggler data. */ if (!ret) - copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq, - copybuf_len); + copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); @@ -2148,20 +2201,6 @@ out: } #endif -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) -{ - if (skb->tstamp) - tss->ts[0] = ktime_to_timespec64(skb->tstamp); - else - tss->ts[0] = (struct timespec64) {0}; - - if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); - else - tss->ts[2] = (struct timespec64) {0}; -} - /* Similar to __sock_recv_timestamp, but does not require an skb */ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) @@ -4105,6 +4144,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { + struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; @@ -4120,11 +4160,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (copy_from_user(&zc, optval, len)) return -EFAULT; lock_sock(sk); - err = tcp_zerocopy_receive(sk, &zc); + err = tcp_zerocopy_receive(sk, &zc, &tss); release_sock(sk); - if (len >= offsetofend(struct tcp_zerocopy_receive, err)) - goto zerocopy_rcv_sk_err; + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; switch (len) { + case offsetofend(struct tcp_zerocopy_receive, msg_flags): + goto zerocopy_rcv_cmsg; + case offsetofend(struct tcp_zerocopy_receive, msg_controllen): + case offsetofend(struct tcp_zerocopy_receive, msg_control): + case offsetofend(struct tcp_zerocopy_receive, flags): + case offsetofend(struct tcp_zerocopy_receive, copybuf_len): + case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): @@ -4133,6 +4180,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level, default: goto zerocopy_rcv_out; } +zerocopy_rcv_cmsg: + if (zc.msg_flags & TCP_CMSG_TS) + tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); + else + zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); -- cgit v1.2.3 From ca1e4ab199933e1af3f9a86d31060b7f9181c3fc Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 19 Jan 2021 14:08:11 +0200 Subject: net: sched: Add multi-queue support to sch_tree_lock The existing qdiscs that set TCQ_F_MQROOT don't use sch_tree_lock. However, hardware-offloaded HTB will start setting this flag while also using sch_tree_lock. The current implementation of sch_tree_lock basically locks on qdisc->dev_queue->qdisc, and it works fine when the tree is attached to some queue. However, it's not the case for MQROOT qdiscs: such a qdisc is the root itself, and its dev_queue just points to queue 0, while not actually being used, because there are real per-queue qdiscs. This patch changes the logic of sch_tree_lock and sch_tree_unlock to lock the qdisc itself if it's the MQROOT. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e7bee99aebce..296919010552 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -552,14 +552,20 @@ static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) return qdisc->dev_queue->dev; } -static inline void sch_tree_lock(const struct Qdisc *q) +static inline void sch_tree_lock(struct Qdisc *q) { - spin_lock_bh(qdisc_root_sleeping_lock(q)); + if (q->flags & TCQ_F_MQROOT) + spin_lock_bh(qdisc_lock(q)); + else + spin_lock_bh(qdisc_root_sleeping_lock(q)); } -static inline void sch_tree_unlock(const struct Qdisc *q) +static inline void sch_tree_unlock(struct Qdisc *q) { - spin_unlock_bh(qdisc_root_sleeping_lock(q)); + if (q->flags & TCQ_F_MQROOT) + spin_unlock_bh(qdisc_lock(q)); + else + spin_unlock_bh(qdisc_root_sleeping_lock(q)); } extern struct Qdisc noop_qdisc; -- cgit v1.2.3 From 4dd78a73738afa92d33a226ec477b42938b31c83 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 19 Jan 2021 14:08:12 +0200 Subject: net: sched: Add extack to Qdisc_class_ops.delete In a following commit, sch_htb will start using extack in the delete class operation to pass hardware errors in offload mode. This commit prepares for that by adding the extack parameter to this callback and converting usage of the existing qdiscs. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 3 ++- net/sched/sch_api.c | 7 ++++--- net/sched/sch_atm.c | 3 ++- net/sched/sch_cbq.c | 3 ++- net/sched/sch_drr.c | 3 ++- net/sched/sch_dsmark.c | 3 ++- net/sched/sch_hfsc.c | 3 ++- net/sched/sch_htb.c | 3 ++- net/sched/sch_qfq.c | 3 ++- net/sched/sch_sfb.c | 3 ++- 10 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 296919010552..070f01bf17eb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -210,7 +210,8 @@ struct Qdisc_class_ops { int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *, struct netlink_ext_ack *); - int (*delete)(struct Qdisc *, unsigned long); + int (*delete)(struct Qdisc *, unsigned long, + struct netlink_ext_ack *); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6fe4e5cc807c..e2e4353db8a7 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1866,7 +1866,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, static int tclass_del_notify(struct net *net, const struct Qdisc_class_ops *cops, struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl) + struct Qdisc *q, unsigned long cl, + struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct sk_buff *skb; @@ -1885,7 +1886,7 @@ static int tclass_del_notify(struct net *net, return -EINVAL; } - err = cops->delete(q, cl); + err = cops->delete(q, cl, extack); if (err) { kfree_skb(skb); return err; @@ -2088,7 +2089,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, goto out; break; case RTM_DELTCLASS: - err = tclass_del_notify(net, cops, skb, n, q, cl); + err = tclass_del_notify(net, cops, skb, n, q, cl, extack); /* Unbind the class with flilters with 0 */ tc_bind_tclass(q, portid, clid, 0); goto out; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 007bd2d9f1ff..d0c9a57398fc 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -320,7 +320,8 @@ err_out: return error; } -static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) +static int atm_tc_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)arg; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 53d45e029c36..320b3d31fa97 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1675,7 +1675,8 @@ failure: return err; } -static int cbq_delete(struct Qdisc *sch, unsigned long arg) +static int cbq_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index dde564670ad8..fc1e47069593 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -146,7 +146,8 @@ static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) kfree(cl); } -static int drr_delete_class(struct Qdisc *sch, unsigned long arg) +static int drr_delete_class(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2b88710994d7..cd2748e2d4a2 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -150,7 +150,8 @@ errout: return err; } -static int dsmark_delete(struct Qdisc *sch, unsigned long arg) +static int dsmark_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct dsmark_qdisc_data *p = qdisc_priv(sch); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d1902fca9844..bf0034c66e35 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1090,7 +1090,8 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) } static int -hfsc_delete_class(struct Qdisc *sch, unsigned long arg) +hfsc_delete_class(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index cd70dbcbd72f..a8fc97b05bd8 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1246,7 +1246,8 @@ static void htb_destroy(struct Qdisc *sch) __qdisc_reset_queue(&q->direct_queue); } -static int htb_delete(struct Qdisc *sch, unsigned long arg) +static int htb_delete(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6335230a971e..1db9d4a2ef5e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -529,7 +529,8 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) kfree(cl); } -static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, + struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index da047a37a3bf..dde829d4b9f8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -649,7 +649,8 @@ static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOSYS; } -static int sfb_delete(struct Qdisc *sch, unsigned long cl) +static int sfb_delete(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) { return -ENOSYS; } -- cgit v1.2.3 From d03b195b5aa015f6c11988b86a3625f8d5dbac52 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 19 Jan 2021 14:08:13 +0200 Subject: sch_htb: Hierarchical QoS hardware offload HTB doesn't scale well because of contention on a single lock, and it also consumes CPU. This patch adds support for offloading HTB to hardware that supports hierarchical rate limiting. In the offload mode, HTB passes control commands to the driver using ndo_setup_tc. The driver has to replicate the whole hierarchy of classes and their settings (rate, ceil) in the NIC. Every modification of the HTB tree caused by the admin results in ndo_setup_tc being called. After this setup, the HTB algorithm is done completely in the NIC. An SQ (send queue) is created for every leaf class and attached to the hierarchy, so that the NIC can calculate and obey aggregated rate limits, too. In the future, it can be changed, so that multiple SQs will back a single leaf class. ndo_select_queue is responsible for selecting the right queue that serves the traffic class of each packet. The data path works as follows: a packet is classified by clsact, the driver selects a hardware queue according to its class, and the packet is enqueued into this queue's qdisc. This solution addresses two main problems of scaling HTB: 1. Contention by flow classification. Currently the filters are attached to the HTB instance as follows: # tc filter add dev eth0 parent 1:0 protocol ip flower dst_port 80 classid 1:10 It's possible to move classification to clsact egress hook, which is thread-safe and lock-free: # tc filter add dev eth0 egress protocol ip flower dst_port 80 action skbedit priority 1:10 This way classification still happens in software, but the lock contention is eliminated, and it happens before selecting the TX queue, allowing the driver to translate the class to the corresponding hardware queue in ndo_select_queue. Note that this is already compatible with non-offloaded HTB and doesn't require changes to the kernel nor iproute2. 2. Contention by handling packets. HTB is not multi-queue, it attaches to a whole net device, and handling of all packets takes the same lock. When HTB is offloaded, it registers itself as a multi-queue qdisc, similarly to mq: HTB is attached to the netdev, and each queue has its own qdisc. Some features of HTB may be not supported by some particular hardware, for example, the maximum number of classes may be limited, the granularity of rate and ceil parameters may be different, etc. - so, the offload is not enabled by default, a new parameter is used to enable it: # tc qdisc replace dev eth0 root handle 1: htb offload Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 36 +++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_htb.c | 501 +++++++++++++++++++++++++++++++++-- tools/include/uapi/linux/pkt_sched.h | 1 + 5 files changed, 512 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef517254367d..9e8572533d8e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -858,6 +858,7 @@ enum tc_setup_type { TC_SETUP_QDISC_ETS, TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, + TC_SETUP_QDISC_HTB, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0f2a9c44171c..255e4f4b521f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -783,6 +783,42 @@ struct tc_mq_qopt_offload { }; }; +enum tc_htb_command { + /* Root */ + TC_HTB_CREATE, /* Initialize HTB offload. */ + TC_HTB_DESTROY, /* Destroy HTB offload. */ + + /* Classes */ + /* Allocate qid and create leaf. */ + TC_HTB_LEAF_ALLOC_QUEUE, + /* Convert leaf to inner, preserve and return qid, create new leaf. */ + TC_HTB_LEAF_TO_INNER, + /* Delete leaf, while siblings remain. */ + TC_HTB_LEAF_DEL, + /* Delete leaf, convert parent to leaf, preserving qid. */ + TC_HTB_LEAF_DEL_LAST, + /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ + TC_HTB_LEAF_DEL_LAST_FORCE, + /* Modify parameters of a node. */ + TC_HTB_NODE_MODIFY, + + /* Class qdisc */ + TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ +}; + +struct tc_htb_qopt_offload { + struct netlink_ext_ack *extack; + enum tc_htb_command command; + u16 classid; + u32 parent_classid; + u16 qid; + u16 moved_qid; + u64 rate; + u64 ceil; +}; + +#define TC_HTB_CLASSID_ROOT U32_MAX + enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 9e7c2c607845..79a699f106b1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -434,6 +434,7 @@ enum { TCA_HTB_RATE64, TCA_HTB_CEIL64, TCA_HTB_PAD, + TCA_HTB_OFFLOAD, __TCA_HTB_MAX, }; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index a8fc97b05bd8..d1b60fe3d311 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -174,6 +174,11 @@ struct htb_sched { int row_mask[TC_HTB_MAXDEPTH]; struct htb_level hlevel[TC_HTB_MAXDEPTH]; + + struct Qdisc **direct_qdiscs; + unsigned int num_direct_qdiscs; + + bool offload; }; /* find class in global hash table using given handle */ @@ -957,7 +962,7 @@ static void htb_reset(struct Qdisc *sch) if (cl->level) memset(&cl->inner, 0, sizeof(cl->inner)); else { - if (cl->leaf.q) + if (cl->leaf.q && !q->offload) qdisc_reset(cl->leaf.q); } cl->prio_activity = 0; @@ -980,6 +985,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, [TCA_HTB_RATE64] = { .type = NLA_U64 }, [TCA_HTB_CEIL64] = { .type = NLA_U64 }, + [TCA_HTB_OFFLOAD] = { .type = NLA_FLAG }, }; static void htb_work_func(struct work_struct *work) @@ -992,12 +998,27 @@ static void htb_work_func(struct work_struct *work) rcu_read_unlock(); } +static void htb_set_lockdep_class_child(struct Qdisc *q) +{ + static struct lock_class_key child_key; + + lockdep_set_class(qdisc_lock(q), &child_key); +} + +static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt) +{ + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt); +} + static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct net_device *dev = qdisc_dev(sch); + struct tc_htb_qopt_offload offload_opt; struct htb_sched *q = qdisc_priv(sch); struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; + unsigned int ntx; int err; qdisc_watchdog_init(&q->watchdog, sch); @@ -1022,9 +1043,26 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, if (gopt->version != HTB_VER >> 16) return -EINVAL; + q->offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]); + + if (q->offload) { + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + q->num_direct_qdiscs = dev->real_num_tx_queues; + q->direct_qdiscs = kcalloc(q->num_direct_qdiscs, + sizeof(*q->direct_qdiscs), + GFP_KERNEL); + if (!q->direct_qdiscs) + return -ENOMEM; + } + err = qdisc_class_hash_init(&q->clhash); if (err < 0) - return err; + goto err_free_direct_qdiscs; qdisc_skb_head_init(&q->direct_queue); @@ -1037,7 +1075,107 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, q->rate2quantum = 1; q->defcls = gopt->defcls; + if (!q->offload) + return 0; + + for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *qdisc; + + qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 0), extack); + if (!qdisc) { + err = -ENOMEM; + goto err_free_qdiscs; + } + + htb_set_lockdep_class_child(qdisc); + q->direct_qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + } + + sch->flags |= TCQ_F_MQROOT; + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_CREATE, + .parent_classid = TC_H_MAJ(sch->handle) >> 16, + .classid = TC_H_MIN(q->defcls), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) + goto err_free_qdiscs; + return 0; + +err_free_qdiscs: + /* TC_HTB_CREATE call failed, avoid any further calls to the driver. */ + q->offload = false; + + for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx]; + ntx++) + qdisc_put(q->direct_qdiscs[ntx]); + + qdisc_class_hash_destroy(&q->clhash); + /* Prevent use-after-free and double-free when htb_destroy gets called. + */ + q->clhash.hash = NULL; + q->clhash.hashsize = 0; + +err_free_direct_qdiscs: + kfree(q->direct_qdiscs); + q->direct_qdiscs = NULL; + return err; +} + +static void htb_attach_offload(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct htb_sched *q = qdisc_priv(sch); + unsigned int ntx; + + for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) { + struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx]; + + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + qdisc_put(old); + qdisc_hash_add(qdisc, false); + } + for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL); + + qdisc_put(old); + } + + kfree(q->direct_qdiscs); + q->direct_qdiscs = NULL; +} + +static void htb_attach_software(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + /* Resemble qdisc_graft behavior. */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *old = dev_graft_qdisc(dev_queue, sch); + + qdisc_refcount_inc(sch); + + qdisc_put(old); + } +} + +static void htb_attach(struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + + if (q->offload) + htb_attach_offload(sch); + else + htb_attach_software(sch); } static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -1046,6 +1184,11 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest; struct tc_htb_glob gopt; + if (q->offload) + sch->flags |= TCQ_F_OFFLOADED; + else + sch->flags &= ~TCQ_F_OFFLOADED; + sch->qstats.overlimits = q->overlimits; /* Its safe to not acquire qdisc lock. As we hold RTNL, * no change can happen on the qdisc parameters. @@ -1063,6 +1206,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) goto nla_put_failure; + if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD)) + goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1144,19 +1289,97 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); } +static struct netdev_queue * +htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_htb_qopt_offload offload_opt; + int err; + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_LEAF_QUERY_QUEUE, + .classid = TC_H_MIN(tcm->tcm_parent), + }; + err = htb_offload(dev, &offload_opt); + if (err || offload_opt.qid >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, offload_opt.qid); +} + +static struct Qdisc * +htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q) +{ + struct net_device *dev = dev_queue->dev; + struct Qdisc *old_q; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + old_q = dev_graft_qdisc(dev_queue, new_q); + if (new_q) + new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + if (dev->flags & IFF_UP) + dev_activate(dev); + + return old_q; +} + +static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new) +{ + struct netdev_queue *queue_old, *queue_new; + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + + queue_old = netdev_get_tx_queue(dev, qid_old); + queue_new = netdev_get_tx_queue(dev, qid_new); + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + qdisc = dev_graft_qdisc(queue_old, NULL); + qdisc->dev_queue = queue_new; + qdisc = dev_graft_qdisc(queue_new, qdisc); + if (dev->flags & IFF_UP) + dev_activate(dev); + + WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN)); +} + static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { + struct netdev_queue *dev_queue = sch->dev_queue; struct htb_class *cl = (struct htb_class *)arg; + struct htb_sched *q = qdisc_priv(sch); + struct Qdisc *old_q; if (cl->level) return -EINVAL; - if (new == NULL && - (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->common.classid, extack)) == NULL) - return -ENOBUFS; + + if (q->offload) { + dev_queue = new->dev_queue; + WARN_ON(dev_queue != cl->leaf.q->dev_queue); + } + + if (!new) { + new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, + cl->common.classid, extack); + if (!new) + return -ENOBUFS; + } + + if (q->offload) { + htb_set_lockdep_class_child(new); + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ + qdisc_refcount_inc(new); + old_q = htb_graft_helper(dev_queue, new); + } *old = qdisc_replace(sch, new, &cl->leaf.q); + + if (q->offload) { + WARN_ON(old_q != *old); + qdisc_put(old_q); + } + return 0; } @@ -1184,9 +1407,10 @@ static inline int htb_parent_last_child(struct htb_class *cl) return 1; } -static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, +static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl, struct Qdisc *new_q) { + struct htb_sched *q = qdisc_priv(sch); struct htb_class *parent = cl->parent; WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity); @@ -1204,6 +1428,71 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->cmode = HTB_CAN_SEND; } +static void htb_parent_to_leaf_offload(struct Qdisc *sch, + struct netdev_queue *dev_queue, + struct Qdisc *new_q) +{ + struct Qdisc *old_q; + + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ + qdisc_refcount_inc(new_q); + old_q = htb_graft_helper(dev_queue, new_q); + WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); +} + +static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, + bool last_child, bool destroying, + struct netlink_ext_ack *extack) +{ + struct tc_htb_qopt_offload offload_opt; + struct Qdisc *q = cl->leaf.q; + struct Qdisc *old = NULL; + int err; + + if (cl->level) + return -EINVAL; + + WARN_ON(!q); + if (!destroying) { + /* On destroy of HTB, two cases are possible: + * 1. q is a normal qdisc, but q->dev_queue has noop qdisc. + * 2. q is a noop qdisc (for nodes that were inner), + * q->dev_queue is noop_netdev_queue. + */ + old = htb_graft_helper(q->dev_queue, NULL); + WARN_ON(!old); + WARN_ON(old != q); + } + + offload_opt = (struct tc_htb_qopt_offload) { + .command = !last_child ? TC_HTB_LEAF_DEL : + destroying ? TC_HTB_LEAF_DEL_LAST_FORCE : + TC_HTB_LEAF_DEL_LAST, + .classid = cl->common.classid, + .extack = extack, + }; + err = htb_offload(qdisc_dev(sch), &offload_opt); + + if (!err || destroying) + qdisc_put(old); + else + htb_graft_helper(q->dev_queue, old); + + if (last_child) + return err; + + if (!err && offload_opt.moved_qid != 0) { + if (destroying) + q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch), + offload_opt.qid); + else + htb_offload_move_qdisc(sch, offload_opt.moved_qid, + offload_opt.qid); + } + + return err; +} + static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) { if (!cl->level) { @@ -1217,8 +1506,11 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) static void htb_destroy(struct Qdisc *sch) { + struct net_device *dev = qdisc_dev(sch); + struct tc_htb_qopt_offload offload_opt; struct htb_sched *q = qdisc_priv(sch); struct hlist_node *next; + bool nonempty, changed; struct htb_class *cl; unsigned int i; @@ -1237,13 +1529,58 @@ static void htb_destroy(struct Qdisc *sch) cl->block = NULL; } } - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], - common.hnode) - htb_destroy_class(sch, cl); - } + + do { + nonempty = false; + changed = false; + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) { + bool last_child; + + if (!q->offload) { + htb_destroy_class(sch, cl); + continue; + } + + nonempty = true; + + if (cl->level) + continue; + + changed = true; + + last_child = htb_parent_last_child(cl); + htb_destroy_class_offload(sch, cl, last_child, + true, NULL); + qdisc_class_hash_remove(&q->clhash, + &cl->common); + if (cl->parent) + cl->parent->children--; + if (last_child) + htb_parent_to_leaf(sch, cl, NULL); + htb_destroy_class(sch, cl); + } + } + } while (changed); + WARN_ON(nonempty); + qdisc_class_hash_destroy(&q->clhash); __qdisc_reset_queue(&q->direct_queue); + + if (!q->offload) + return; + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_DESTROY, + }; + htb_offload(dev, &offload_opt); + + if (!q->direct_qdiscs) + return; + for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++) + qdisc_put(q->direct_qdiscs[i]); + kfree(q->direct_qdiscs); } static int htb_delete(struct Qdisc *sch, unsigned long arg, @@ -1253,6 +1590,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, struct htb_class *cl = (struct htb_class *)arg; struct Qdisc *new_q = NULL; int last_child = 0; + int err; /* TODO: why don't allow to delete subtree ? references ? does * tc subsys guarantee us that in htb_destroy it holds no class @@ -1261,11 +1599,28 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->children || cl->filter_cnt) return -EBUSY; - if (!cl->level && htb_parent_last_child(cl)) { - new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + if (!cl->level && htb_parent_last_child(cl)) + last_child = 1; + + if (q->offload) { + err = htb_destroy_class_offload(sch, cl, last_child, false, + extack); + if (err) + return err; + } + + if (last_child) { + struct netdev_queue *dev_queue; + + dev_queue = q->offload ? cl->leaf.q->dev_queue : sch->dev_queue; + new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); - last_child = 1; + if (q->offload) { + if (new_q) + htb_set_lockdep_class_child(new_q); + htb_parent_to_leaf_offload(sch, dev_queue, new_q); + } } sch_tree_lock(sch); @@ -1286,7 +1641,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, &q->hlevel[cl->level].wait_pq); if (last_child) - htb_parent_to_leaf(q, cl, new_q); + htb_parent_to_leaf(sch, cl, new_q); sch_tree_unlock(sch); @@ -1301,9 +1656,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, int err = -EINVAL; struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; + struct tc_htb_qopt_offload offload_opt; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HTB_MAX + 1]; struct Qdisc *parent_qdisc = NULL; + struct netdev_queue *dev_queue; struct tc_htb_opt *hopt; u64 rate64, ceil64; int warn = 0; @@ -1336,8 +1693,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB], NULL)); + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + if (!cl) { /* new class */ - struct Qdisc *new_q; + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *new_q, *old_q; int prio; struct { struct nlattr nla; @@ -1380,11 +1741,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NULL, qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); - if (err) { - tcf_block_put(cl->block); - kfree(cl); - goto failure; - } + if (err) + goto err_block_put; } cl->children = 0; @@ -1393,12 +1751,74 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) RB_CLEAR_NODE(&cl->node[prio]); + cl->common.classid = classid; + + /* Make sure nothing interrupts us in between of two + * ndo_setup_tc calls. + */ + ASSERT_RTNL(); + /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) * so that can't be used inside of sch_tree_lock * -- thanks to Karlis Peisenieks */ - new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + if (!q->offload) { + dev_queue = sch->dev_queue; + } else if (!(parent && !parent->level)) { + /* Assign a dev_queue to this classid. */ + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_LEAF_ALLOC_QUEUE, + .classid = cl->common.classid, + .parent_classid = parent ? + TC_H_MIN(parent->common.classid) : + TC_HTB_CLASSID_ROOT, + .rate = max_t(u64, hopt->rate.rate, rate64), + .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) { + pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n", + err); + goto err_kill_estimator; + } + dev_queue = netdev_get_tx_queue(dev, offload_opt.qid); + } else { /* First child. */ + dev_queue = parent->leaf.q->dev_queue; + old_q = htb_graft_helper(dev_queue, NULL); + WARN_ON(old_q != parent->leaf.q); + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_LEAF_TO_INNER, + .classid = cl->common.classid, + .parent_classid = + TC_H_MIN(parent->common.classid), + .rate = max_t(u64, hopt->rate.rate, rate64), + .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) { + pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n", + err); + htb_graft_helper(dev_queue, old_q); + goto err_kill_estimator; + } + qdisc_put(old_q); + } + new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, classid, NULL); + if (q->offload) { + if (new_q) { + htb_set_lockdep_class_child(new_q); + /* One ref for cl->leaf.q, the other for + * dev_queue->qdisc. + */ + qdisc_refcount_inc(new_q); + } + old_q = htb_graft_helper(dev_queue, new_q); + /* No qdisc_put needed. */ + WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); + } sch_tree_lock(sch); if (parent && !parent->level) { /* turn parent into inner node */ @@ -1416,10 +1836,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, : TC_HTB_MAXDEPTH) - 1; memset(&parent->inner, 0, sizeof(parent->inner)); } + /* leaf (we) needs elementary qdisc */ cl->leaf.q = new_q ? new_q : &noop_qdisc; - cl->common.classid = classid; cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ @@ -1445,12 +1865,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (err) return err; } - sch_tree_lock(sch); - } - rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + if (q->offload) { + struct net_device *dev = qdisc_dev(sch); + + offload_opt = (struct tc_htb_qopt_offload) { + .command = TC_HTB_NODE_MODIFY, + .classid = cl->common.classid, + .rate = max_t(u64, hopt->rate.rate, rate64), + .ceil = max_t(u64, hopt->ceil.rate, ceil64), + .extack = extack, + }; + err = htb_offload(dev, &offload_opt); + if (err) + /* Estimator was replaced, and rollback may fail + * as well, so we don't try to recover it, and + * the estimator won't work property with the + * offload anyway, because bstats are updated + * only when the stats are queried. + */ + return err; + } - ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + sch_tree_lock(sch); + } psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); @@ -1493,6 +1931,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, *arg = (unsigned long)cl; return 0; +err_kill_estimator: + gen_kill_estimator(&cl->rate_est); +err_block_put: + tcf_block_put(cl->block); + kfree(cl); failure: return err; } @@ -1558,6 +2001,7 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) } static const struct Qdisc_class_ops htb_class_ops = { + .select_queue = htb_select_queue, .graft = htb_graft, .leaf = htb_leaf, .qlen_notify = htb_qlen_notify, @@ -1580,6 +2024,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .dequeue = htb_dequeue, .peek = qdisc_peek_dequeued, .init = htb_init, + .attach = htb_attach, .reset = htb_reset, .destroy = htb_destroy, .dump = htb_dump, diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h index 0d18b1d1fbbc..5c903abc9fa5 100644 --- a/tools/include/uapi/linux/pkt_sched.h +++ b/tools/include/uapi/linux/pkt_sched.h @@ -414,6 +414,7 @@ enum { TCA_HTB_RATE64, TCA_HTB_CEIL64, TCA_HTB_PAD, + TCA_HTB_OFFLOAD, __TCA_HTB_MAX, }; -- cgit v1.2.3 From 214baf22870cfa437522f3bd4fbae56338674b04 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 19 Jan 2021 14:08:15 +0200 Subject: net/mlx5e: Support HTB offload This commit adds support for HTB offload in the mlx5e driver. Performance: NIC: Mellanox ConnectX-6 Dx CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz (24 cores with HT) 100 Gbit/s line rate, 500 UDP streams @ ~200 Mbit/s each 48 traffic classes, flower used for steering No shaping (rate limits set to 4 Gbit/s per TC) - checking for max throughput. Baseline: 98.7 Gbps, 8.25 Mpps HTB: 6.7 Gbps, 0.56 Mpps HTB offload: 95.6 Gbps, 8.00 Mpps Limitations: 1. 256 leaf nodes, 3 levels of depth. 2. Granularity for ceil is 1 Mbit/s. Rates are converted to weights, and the bandwidth is split among the siblings according to these weights. Other parameters for classes are not supported. Ethtool statistics support for QoS SQs are also added. The counters are called qos_txN_*, where N is the QoS queue number (starting from 0, the numeration is separate from the normal SQs), and * is the counter name (the counters are the same as for the normal SQs). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 27 +- .../net/ethernet/mellanox/mlx5/core/en/params.h | 2 + drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 984 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en/qos.h | 44 + .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 21 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 176 +++- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 100 +++ drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 2 + drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 47 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 26 + drivers/net/ethernet/mellanox/mlx5/core/qos.c | 85 ++ drivers/net/ethernet/mellanox/mlx5/core/qos.h | 30 + include/linux/mlx5/mlx5_ifc.h | 13 +- 15 files changed, 1516 insertions(+), 49 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/qos.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/qos.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qos.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qos.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 134bd038ae8a..fcfc0b114985 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,7 +16,8 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \ - diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o fw_reset.o + diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \ + fw_reset.o qos.o # # Netdev basic @@ -25,7 +26,8 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ en_selftest.o en/port.o en/monitor_stats.o en/health.o \ en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \ - en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o + en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \ + en/qos.o # # Netdev extra diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 055baf3b6cb1..26e578a973e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -55,6 +55,7 @@ #include "en_stats.h" #include "en/dcbnl.h" #include "en/fs.h" +#include "en/qos.h" #include "lib/hv_vhca.h" extern const struct net_device_ops mlx5e_netdev_ops; @@ -161,6 +162,9 @@ do { \ ##__VA_ARGS__); \ } while (0) +#define mlx5e_state_dereference(priv, p) \ + rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock)) + enum mlx5e_rq_group { MLX5E_RQ_GROUP_REGULAR, MLX5E_RQ_GROUP_XSK, @@ -663,11 +667,13 @@ struct mlx5e_channel { struct mlx5e_xdpsq rq_xdpsq; struct mlx5e_txqsq sq[MLX5E_MAX_NUM_TC]; struct mlx5e_icosq icosq; /* internal control operations */ + struct mlx5e_txqsq __rcu * __rcu *qos_sqs; bool xdp; struct napi_struct napi; struct device *pdev; struct net_device *netdev; __be32 mkey_be; + u16 qos_sqs_size; u8 num_tc; u8 lag_port; @@ -756,6 +762,8 @@ struct mlx5e_modify_sq_param { int next_state; int rl_update; int rl_index; + bool qos_update; + u16 qos_queue_group_id; }; #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) @@ -788,10 +796,20 @@ struct mlx5e_scratchpad { cpumask_var_t cpumask; }; +struct mlx5e_htb { + DECLARE_HASHTABLE(qos_tc2node, order_base_2(MLX5E_QOS_MAX_LEAF_NODES)); + DECLARE_BITMAP(qos_used_qids, MLX5E_QOS_MAX_LEAF_NODES); + struct mlx5e_sq_stats **qos_sq_stats; + u16 max_qos_sqs; + u16 maj_id; + u16 defcls; +}; + struct mlx5e_priv { /* priv data path fields - start */ /* +1 for port ptp ts */ - struct mlx5e_txqsq *txq2sq[(MLX5E_MAX_NUM_CHANNELS + 1) * MLX5E_MAX_NUM_TC]; + struct mlx5e_txqsq *txq2sq[(MLX5E_MAX_NUM_CHANNELS + 1) * MLX5E_MAX_NUM_TC + + MLX5E_QOS_MAX_LEAF_NODES]; int channel_tc2realtxq[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC]; int port_ptp_tc2realtxq[MLX5E_MAX_NUM_TC]; #ifdef CONFIG_MLX5_CORE_EN_DCB @@ -859,6 +877,7 @@ struct mlx5e_priv { struct mlx5e_hv_vhca_stats_agent stats_agent; #endif struct mlx5e_scratchpad scratchpad; + struct mlx5e_htb htb; }; struct mlx5e_rx_handlers { @@ -986,6 +1005,7 @@ int mlx5e_safe_switch_channels(struct mlx5e_priv *priv, struct mlx5e_channels *new_chs, mlx5e_fp_preactivate preactivate, void *context); +int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv); int mlx5e_num_channels_changed(struct mlx5e_priv *priv); int mlx5e_num_channels_changed_ctx(struct mlx5e_priv *priv, void *context); void mlx5e_activate_priv_channels(struct mlx5e_priv *priv); @@ -1010,6 +1030,9 @@ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq); int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, struct mlx5e_modify_sq_param *p); +int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, + struct mlx5e_params *params, struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, u16 qos_qid); void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq); void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq); void mlx5e_free_txqsq(struct mlx5e_txqsq *sq); @@ -1020,8 +1043,10 @@ struct mlx5e_create_sq_param; int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, struct mlx5e_sq_param *param, struct mlx5e_create_sq_param *csp, + u16 qos_queue_group_id, u32 *sqn); void mlx5e_tx_err_cqe_work(struct work_struct *recover_work); +void mlx5e_close_txqsq(struct mlx5e_txqsq *sq); static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index 807147d97a0f..ea2cfb04b31a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -118,6 +118,8 @@ void mlx5e_build_rq_param(struct mlx5e_priv *priv, struct mlx5e_rq_param *param); void mlx5e_build_sq_param_common(struct mlx5e_priv *priv, struct mlx5e_sq_param *param); +void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_sq_param *param); void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 2a2bac30daaa..eeddd1137dda 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -261,7 +261,7 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_port_ptp *c, u32 tisn, csp.min_inline_mode = txqsq->min_inline_mode; csp.ts_cqe_to_dest_cqn = ptpsq->ts_cq.mcq.cqn; - err = mlx5e_create_sq_rdy(c->mdev, sqp, &csp, &txqsq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, sqp, &csp, 0, &txqsq->sqn); if (err) goto err_free_txqsq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c new file mode 100644 index 000000000000..12d7ad061237 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#include "en.h" +#include "params.h" +#include "../qos.h" + +#define BYTES_IN_MBIT 125000 + +int mlx5e_qos_max_leaf_nodes(struct mlx5_core_dev *mdev) +{ + return min(MLX5E_QOS_MAX_LEAF_NODES, mlx5_qos_max_leaf_nodes(mdev)); +} + +int mlx5e_qos_cur_leaf_nodes(struct mlx5e_priv *priv) +{ + int last = find_last_bit(priv->htb.qos_used_qids, mlx5e_qos_max_leaf_nodes(priv->mdev)); + + return last == mlx5e_qos_max_leaf_nodes(priv->mdev) ? 0 : last + 1; +} + +/* Software representation of the QoS tree (internal to this file) */ + +static int mlx5e_find_unused_qos_qid(struct mlx5e_priv *priv) +{ + int size = mlx5e_qos_max_leaf_nodes(priv->mdev); + int res; + + WARN_ONCE(!mutex_is_locked(&priv->state_lock), "%s: state_lock is not held\n", __func__); + res = find_first_zero_bit(priv->htb.qos_used_qids, size); + + return res == size ? -ENOSPC : res; +} + +struct mlx5e_qos_node { + struct hlist_node hnode; + struct rcu_head rcu; + struct mlx5e_qos_node *parent; + u64 rate; + u32 bw_share; + u32 max_average_bw; + u32 hw_id; + u32 classid; /* 16-bit, except root. */ + u16 qid; +}; + +#define MLX5E_QOS_QID_INNER 0xffff +#define MLX5E_HTB_CLASSID_ROOT 0xffffffff + +static struct mlx5e_qos_node * +mlx5e_sw_node_create_leaf(struct mlx5e_priv *priv, u16 classid, u16 qid, + struct mlx5e_qos_node *parent) +{ + struct mlx5e_qos_node *node; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + node->parent = parent; + + node->qid = qid; + __set_bit(qid, priv->htb.qos_used_qids); + + node->classid = classid; + hash_add_rcu(priv->htb.qos_tc2node, &node->hnode, classid); + + mlx5e_update_tx_netdev_queues(priv); + + return node; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_create_root(struct mlx5e_priv *priv) +{ + struct mlx5e_qos_node *node; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + node->qid = MLX5E_QOS_QID_INNER; + node->classid = MLX5E_HTB_CLASSID_ROOT; + hash_add_rcu(priv->htb.qos_tc2node, &node->hnode, node->classid); + + return node; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_find(struct mlx5e_priv *priv, u32 classid) +{ + struct mlx5e_qos_node *node = NULL; + + hash_for_each_possible(priv->htb.qos_tc2node, node, hnode, classid) { + if (node->classid == classid) + break; + } + + return node; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_find_rcu(struct mlx5e_priv *priv, u32 classid) +{ + struct mlx5e_qos_node *node = NULL; + + hash_for_each_possible_rcu(priv->htb.qos_tc2node, node, hnode, classid) { + if (node->classid == classid) + break; + } + + return node; +} + +static void mlx5e_sw_node_delete(struct mlx5e_priv *priv, struct mlx5e_qos_node *node) +{ + hash_del_rcu(&node->hnode); + if (node->qid != MLX5E_QOS_QID_INNER) { + __clear_bit(node->qid, priv->htb.qos_used_qids); + mlx5e_update_tx_netdev_queues(priv); + } + kfree_rcu(node, rcu); +} + +/* TX datapath API */ + +static u16 mlx5e_qid_from_qos(struct mlx5e_channels *chs, u16 qid) +{ + /* These channel params are safe to access from the datapath, because: + * 1. This function is called only after checking priv->htb.maj_id != 0, + * and the number of queues can't change while HTB offload is active. + * 2. When priv->htb.maj_id becomes 0, synchronize_rcu waits for + * mlx5e_select_queue to finish while holding priv->state_lock, + * preventing other code from changing the number of queues. + */ + bool is_ptp = MLX5E_GET_PFLAG(&chs->params, MLX5E_PFLAG_TX_PORT_TS); + + return (chs->params.num_channels + is_ptp) * chs->params.num_tc + qid; +} + +int mlx5e_get_txq_by_classid(struct mlx5e_priv *priv, u16 classid) +{ + struct mlx5e_qos_node *node; + u16 qid; + int res; + + rcu_read_lock(); + + node = mlx5e_sw_node_find_rcu(priv, classid); + if (!node) { + res = -ENOENT; + goto out; + } + qid = READ_ONCE(node->qid); + if (qid == MLX5E_QOS_QID_INNER) { + res = -EINVAL; + goto out; + } + res = mlx5e_qid_from_qos(&priv->channels, qid); + +out: + rcu_read_unlock(); + return res; +} + +static struct mlx5e_txqsq *mlx5e_get_qos_sq(struct mlx5e_priv *priv, int qid) +{ + struct mlx5e_params *params = &priv->channels.params; + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_channel *c; + int ix; + + ix = qid % params->num_channels; + qid /= params->num_channels; + c = priv->channels.c[ix]; + + qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs); + return mlx5e_state_dereference(priv, qos_sqs[qid]); +} + +/* SQ lifecycle */ + +static int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs, + struct mlx5e_qos_node *node) +{ + struct mlx5e_create_cq_param ccp = {}; + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_sq_param param_sq; + struct mlx5e_cq_param param_cq; + int txq_ix, ix, qid, err = 0; + struct mlx5e_params *params; + struct mlx5e_channel *c; + struct mlx5e_txqsq *sq; + + params = &chs->params; + + txq_ix = mlx5e_qid_from_qos(chs, node->qid); + + WARN_ON(node->qid > priv->htb.max_qos_sqs); + if (node->qid == priv->htb.max_qos_sqs) { + struct mlx5e_sq_stats *stats, **stats_list = NULL; + + if (priv->htb.max_qos_sqs == 0) { + stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), + sizeof(*stats_list), + GFP_KERNEL); + if (!stats_list) + return -ENOMEM; + } + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) { + kvfree(stats_list); + return -ENOMEM; + } + if (stats_list) + WRITE_ONCE(priv->htb.qos_sq_stats, stats_list); + WRITE_ONCE(priv->htb.qos_sq_stats[node->qid], stats); + /* Order max_qos_sqs increment after writing the array pointer. + * Pairs with smp_load_acquire in en_stats.c. + */ + smp_store_release(&priv->htb.max_qos_sqs, priv->htb.max_qos_sqs + 1); + } + + ix = node->qid % params->num_channels; + qid = node->qid / params->num_channels; + c = chs->c[ix]; + + qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs); + sq = kzalloc(sizeof(*sq), GFP_KERNEL); + + if (!sq) + return -ENOMEM; + + mlx5e_build_create_cq_param(&ccp, c); + + memset(¶m_sq, 0, sizeof(param_sq)); + memset(¶m_cq, 0, sizeof(param_cq)); + mlx5e_build_sq_param(priv, params, ¶m_sq); + mlx5e_build_tx_cq_param(priv, params, ¶m_cq); + err = mlx5e_open_cq(priv, params->tx_cq_moderation, ¶m_cq, &ccp, &sq->cq); + if (err) + goto err_free_sq; + err = mlx5e_open_txqsq(c, priv->tisn[c->lag_port][0], txq_ix, params, + ¶m_sq, sq, 0, node->hw_id, node->qid); + if (err) + goto err_close_cq; + + rcu_assign_pointer(qos_sqs[qid], sq); + + return 0; + +err_close_cq: + mlx5e_close_cq(&sq->cq); +err_free_sq: + kfree(sq); + return err; +} + +static void mlx5e_activate_qos_sq(struct mlx5e_priv *priv, struct mlx5e_qos_node *node) +{ + struct mlx5e_txqsq *sq; + + sq = mlx5e_get_qos_sq(priv, node->qid); + + WRITE_ONCE(priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, node->qid)], sq); + + /* Make the change to txq2sq visible before the queue is started. + * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE, + * which pairs with this barrier. + */ + smp_wmb(); + + qos_dbg(priv->mdev, "Activate QoS SQ qid %u\n", node->qid); + mlx5e_activate_txqsq(sq); +} + +static void mlx5e_deactivate_qos_sq(struct mlx5e_priv *priv, u16 qid) +{ + struct mlx5e_txqsq *sq; + + sq = mlx5e_get_qos_sq(priv, qid); + if (!sq) /* Handle the case when the SQ failed to open. */ + return; + + qos_dbg(priv->mdev, "Deactivate QoS SQ qid %u\n", qid); + mlx5e_deactivate_txqsq(sq); + + /* The queue is disabled, no synchronization with datapath is needed. */ + priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, qid)] = NULL; +} + +static void mlx5e_close_qos_sq(struct mlx5e_priv *priv, u16 qid) +{ + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_params *params; + struct mlx5e_channel *c; + struct mlx5e_txqsq *sq; + int ix; + + params = &priv->channels.params; + + ix = qid % params->num_channels; + qid /= params->num_channels; + c = priv->channels.c[ix]; + qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs); + sq = rcu_replace_pointer(qos_sqs[qid], NULL, lockdep_is_held(&priv->state_lock)); + if (!sq) /* Handle the case when the SQ failed to open. */ + return; + + synchronize_rcu(); /* Sync with NAPI. */ + + mlx5e_close_txqsq(sq); + mlx5e_close_cq(&sq->cq); + kfree(sq); +} + +void mlx5e_qos_close_queues(struct mlx5e_channel *c) +{ + struct mlx5e_txqsq __rcu **qos_sqs; + int i; + + qos_sqs = rcu_replace_pointer(c->qos_sqs, NULL, lockdep_is_held(&c->priv->state_lock)); + if (!qos_sqs) + return; + synchronize_rcu(); /* Sync with NAPI. */ + + for (i = 0; i < c->qos_sqs_size; i++) { + struct mlx5e_txqsq *sq; + + sq = mlx5e_state_dereference(c->priv, qos_sqs[i]); + if (!sq) /* Handle the case when the SQ failed to open. */ + continue; + + mlx5e_close_txqsq(sq); + mlx5e_close_cq(&sq->cq); + kfree(sq); + } + + kvfree(qos_sqs); +} + +static void mlx5e_qos_close_all_queues(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_qos_close_queues(chs->c[i]); +} + +static int mlx5e_qos_alloc_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs) +{ + u16 qos_sqs_size; + int i; + + qos_sqs_size = DIV_ROUND_UP(mlx5e_qos_max_leaf_nodes(priv->mdev), chs->num); + + for (i = 0; i < chs->num; i++) { + struct mlx5e_txqsq **sqs; + + sqs = kvcalloc(qos_sqs_size, sizeof(struct mlx5e_txqsq *), GFP_KERNEL); + if (!sqs) + goto err_free; + + WRITE_ONCE(chs->c[i]->qos_sqs_size, qos_sqs_size); + smp_wmb(); /* Pairs with mlx5e_napi_poll. */ + rcu_assign_pointer(chs->c[i]->qos_sqs, sqs); + } + + return 0; + +err_free: + while (--i >= 0) { + struct mlx5e_txqsq **sqs; + + sqs = rcu_replace_pointer(chs->c[i]->qos_sqs, NULL, + lockdep_is_held(&priv->state_lock)); + + synchronize_rcu(); /* Sync with NAPI. */ + kvfree(sqs); + } + return -ENOMEM; +} + +int mlx5e_qos_open_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs) +{ + struct mlx5e_qos_node *node = NULL; + int bkt, err; + + if (!priv->htb.maj_id) + return 0; + + err = mlx5e_qos_alloc_queues(priv, chs); + if (err) + return err; + + hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) { + if (node->qid == MLX5E_QOS_QID_INNER) + continue; + err = mlx5e_open_qos_sq(priv, chs, node); + if (err) { + mlx5e_qos_close_all_queues(chs); + return err; + } + } + + return 0; +} + +void mlx5e_qos_activate_queues(struct mlx5e_priv *priv) +{ + struct mlx5e_qos_node *node = NULL; + int bkt; + + hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) { + if (node->qid == MLX5E_QOS_QID_INNER) + continue; + mlx5e_activate_qos_sq(priv, node); + } +} + +void mlx5e_qos_deactivate_queues(struct mlx5e_channel *c) +{ + struct mlx5e_params *params = &c->priv->channels.params; + struct mlx5e_txqsq __rcu **qos_sqs; + int i; + + qos_sqs = mlx5e_state_dereference(c->priv, c->qos_sqs); + if (!qos_sqs) + return; + + for (i = 0; i < c->qos_sqs_size; i++) { + u16 qid = params->num_channels * i + c->ix; + struct mlx5e_txqsq *sq; + + sq = mlx5e_state_dereference(c->priv, qos_sqs[i]); + if (!sq) /* Handle the case when the SQ failed to open. */ + continue; + + qos_dbg(c->mdev, "Deactivate QoS SQ qid %u\n", qid); + mlx5e_deactivate_txqsq(sq); + + /* The queue is disabled, no synchronization with datapath is needed. */ + c->priv->txq2sq[mlx5e_qid_from_qos(&c->priv->channels, qid)] = NULL; + } +} + +static void mlx5e_qos_deactivate_all_queues(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_qos_deactivate_queues(chs->c[i]); +} + +/* HTB API */ + +int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *root; + bool opened; + int err; + + qos_dbg(priv->mdev, "TC_HTB_CREATE handle %04x:, default :%04x\n", htb_maj_id, htb_defcls); + + if (!mlx5_qos_is_supported(priv->mdev)) { + NL_SET_ERR_MSG_MOD(extack, + "Missing QoS capabilities. Try disabling SRIOV or use a supported device."); + return -EOPNOTSUPP; + } + + opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + if (opened) { + err = mlx5e_qos_alloc_queues(priv, &priv->channels); + if (err) + return err; + } + + root = mlx5e_sw_node_create_root(priv); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto err_free_queues; + } + + err = mlx5_qos_create_root_node(priv->mdev, &root->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error. Try upgrading firmware."); + goto err_sw_node_delete; + } + + WRITE_ONCE(priv->htb.defcls, htb_defcls); + /* Order maj_id after defcls - pairs with + * mlx5e_select_queue/mlx5e_select_htb_queues. + */ + smp_store_release(&priv->htb.maj_id, htb_maj_id); + + return 0; + +err_sw_node_delete: + mlx5e_sw_node_delete(priv, root); + +err_free_queues: + if (opened) + mlx5e_qos_close_all_queues(&priv->channels); + return err; +} + +int mlx5e_htb_root_del(struct mlx5e_priv *priv) +{ + struct mlx5e_qos_node *root; + int err; + + qos_dbg(priv->mdev, "TC_HTB_DESTROY\n"); + + WRITE_ONCE(priv->htb.maj_id, 0); + synchronize_rcu(); /* Sync with mlx5e_select_htb_queue and TX data path. */ + + root = mlx5e_sw_node_find(priv, MLX5E_HTB_CLASSID_ROOT); + if (!root) { + qos_err(priv->mdev, "Failed to find the root node in the QoS tree\n"); + return -ENOENT; + } + err = mlx5_qos_destroy_node(priv->mdev, root->hw_id); + if (err) + qos_err(priv->mdev, "Failed to destroy root node %u, err = %d\n", + root->hw_id, err); + mlx5e_sw_node_delete(priv, root); + + mlx5e_qos_deactivate_all_queues(&priv->channels); + mlx5e_qos_close_all_queues(&priv->channels); + + return err; +} + +static int mlx5e_htb_convert_rate(struct mlx5e_priv *priv, u64 rate, + struct mlx5e_qos_node *parent, u32 *bw_share) +{ + u64 share = 0; + + while (parent->classid != MLX5E_HTB_CLASSID_ROOT && !parent->max_average_bw) + parent = parent->parent; + + if (parent->max_average_bw) + share = div64_u64(div_u64(rate * 100, BYTES_IN_MBIT), + parent->max_average_bw); + else + share = 101; + + *bw_share = share == 0 ? 1 : share > 100 ? 0 : share; + + qos_dbg(priv->mdev, "Convert: rate %llu, parent ceil %llu -> bw_share %u\n", + rate, (u64)parent->max_average_bw * BYTES_IN_MBIT, *bw_share); + + return 0; +} + +static void mlx5e_htb_convert_ceil(struct mlx5e_priv *priv, u64 ceil, u32 *max_average_bw) +{ + *max_average_bw = div_u64(ceil, BYTES_IN_MBIT); + + qos_dbg(priv->mdev, "Convert: ceil %llu -> max_average_bw %u\n", + ceil, *max_average_bw); +} + +int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid, + u32 parent_classid, u64 rate, u64 ceil, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node, *parent; + int qid; + int err; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_ALLOC_QUEUE classid %04x, parent %04x, rate %llu, ceil %llu\n", + classid, parent_classid, rate, ceil); + + qid = mlx5e_find_unused_qos_qid(priv); + if (qid < 0) { + NL_SET_ERR_MSG_MOD(extack, "Maximum amount of leaf classes is reached."); + return qid; + } + + parent = mlx5e_sw_node_find(priv, parent_classid); + if (!parent) + return -EINVAL; + + node = mlx5e_sw_node_create_leaf(priv, classid, qid, parent); + if (IS_ERR(node)) + return PTR_ERR(node); + + node->rate = rate; + mlx5e_htb_convert_rate(priv, rate, node->parent, &node->bw_share); + mlx5e_htb_convert_ceil(priv, ceil, &node->max_average_bw); + + err = mlx5_qos_create_leaf_node(priv->mdev, node->parent->hw_id, + node->bw_share, node->max_average_bw, + &node->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node."); + qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n", + classid, err); + mlx5e_sw_node_delete(priv, node); + return err; + } + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, node); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n", + classid, err); + } else { + mlx5e_activate_qos_sq(priv, node); + } + } + + return mlx5e_qid_from_qos(&priv->channels, node->qid); +} + +int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid, + u64 rate, u64 ceil, struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node, *child; + int err, tmp_err; + u32 new_hw_id; + u16 qid; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_TO_INNER classid %04x, upcoming child %04x, rate %llu, ceil %llu\n", + classid, child_classid, rate, ceil); + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + err = mlx5_qos_create_inner_node(priv->mdev, node->parent->hw_id, + node->bw_share, node->max_average_bw, + &new_hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating an inner node."); + qos_err(priv->mdev, "Failed to create an inner node (class %04x), err = %d\n", + classid, err); + return err; + } + + /* Intentionally reuse the qid for the upcoming first child. */ + child = mlx5e_sw_node_create_leaf(priv, child_classid, node->qid, node); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto err_destroy_hw_node; + } + + child->rate = rate; + mlx5e_htb_convert_rate(priv, rate, node, &child->bw_share); + mlx5e_htb_convert_ceil(priv, ceil, &child->max_average_bw); + + err = mlx5_qos_create_leaf_node(priv->mdev, new_hw_id, child->bw_share, + child->max_average_bw, &child->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node."); + qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n", + classid, err); + goto err_delete_sw_node; + } + + /* No fail point. */ + + qid = node->qid; + /* Pairs with mlx5e_get_txq_by_classid. */ + WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + mlx5e_deactivate_qos_sq(priv, qid); + mlx5e_close_qos_sq(priv, qid); + } + + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + node->hw_id = new_hw_id; + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, child); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n", + classid, err); + } else { + mlx5e_activate_qos_sq(priv, child); + } + } + + return 0; + +err_delete_sw_node: + child->qid = MLX5E_QOS_QID_INNER; + mlx5e_sw_node_delete(priv, child); + +err_destroy_hw_node: + tmp_err = mlx5_qos_destroy_node(priv->mdev, new_hw_id); + if (tmp_err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to roll back creation of an inner node %u (class %04x), err = %d\n", + new_hw_id, classid, tmp_err); + return err; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_find_by_qid(struct mlx5e_priv *priv, u16 qid) +{ + struct mlx5e_qos_node *node = NULL; + int bkt; + + hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) + if (node->qid == qid) + break; + + return node; +} + +static void mlx5e_reactivate_qos_sq(struct mlx5e_priv *priv, u16 qid, struct netdev_queue *txq) +{ + qos_dbg(priv->mdev, "Reactivate QoS SQ qid %u\n", qid); + netdev_tx_reset_queue(txq); + netif_tx_start_queue(txq); +} + +static void mlx5e_reset_qdisc(struct net_device *dev, u16 qid) +{ + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, qid); + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (!qdisc) + return; + + spin_lock_bh(qdisc_lock(qdisc)); + qdisc_reset(qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); +} + +int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid, + u16 *new_qid, struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node; + struct netdev_queue *txq; + u16 qid, moved_qid; + bool opened; + int err; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", classid); + + *old_qid = *new_qid = 0; + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + /* Store qid for reuse. */ + qid = node->qid; + + opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + if (opened) { + txq = netdev_get_tx_queue(priv->netdev, + mlx5e_qid_from_qos(&priv->channels, qid)); + mlx5e_deactivate_qos_sq(priv, qid); + mlx5e_close_qos_sq(priv, qid); + } + + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + mlx5e_sw_node_delete(priv, node); + + moved_qid = mlx5e_qos_cur_leaf_nodes(priv); + + if (moved_qid == 0) { + /* The last QoS SQ was just destroyed. */ + if (opened) + mlx5e_reactivate_qos_sq(priv, qid, txq); + return 0; + } + moved_qid--; + + if (moved_qid < qid) { + /* The highest QoS SQ was just destroyed. */ + WARN(moved_qid != qid - 1, "Gaps in queue numeration: destroyed queue %u, the highest queue is %u", + qid, moved_qid); + if (opened) + mlx5e_reactivate_qos_sq(priv, qid, txq); + return 0; + } + + WARN(moved_qid == qid, "Can't move node with qid %u to itself", qid); + qos_dbg(priv->mdev, "Moving QoS SQ %u to %u\n", moved_qid, qid); + + node = mlx5e_sw_node_find_by_qid(priv, moved_qid); + WARN(!node, "Could not find a node with qid %u to move to queue %u", + moved_qid, qid); + + /* Stop traffic to the old queue. */ + WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER); + __clear_bit(moved_qid, priv->htb.qos_used_qids); + + if (opened) { + txq = netdev_get_tx_queue(priv->netdev, + mlx5e_qid_from_qos(&priv->channels, moved_qid)); + mlx5e_deactivate_qos_sq(priv, moved_qid); + mlx5e_close_qos_sq(priv, moved_qid); + } + + /* Prevent packets from the old class from getting into the new one. */ + mlx5e_reset_qdisc(priv->netdev, moved_qid); + + __set_bit(qid, priv->htb.qos_used_qids); + WRITE_ONCE(node->qid, qid); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, node); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x) while moving qid %u to %u, err = %d\n", + node->classid, moved_qid, qid, err); + } else { + mlx5e_activate_qos_sq(priv, node); + } + } + + mlx5e_update_tx_netdev_queues(priv); + if (opened) + mlx5e_reactivate_qos_sq(priv, moved_qid, txq); + + *old_qid = mlx5e_qid_from_qos(&priv->channels, moved_qid); + *new_qid = mlx5e_qid_from_qos(&priv->channels, qid); + return 0; +} + +int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node, *parent; + u32 old_hw_id, new_hw_id; + int err, saved_err = 0; + u16 qid; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL_LAST%s classid %04x\n", + force ? "_FORCE" : "", classid); + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + err = mlx5_qos_create_leaf_node(priv->mdev, node->parent->parent->hw_id, + node->parent->bw_share, + node->parent->max_average_bw, + &new_hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node."); + qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n", + classid, err); + if (!force) + return err; + saved_err = err; + } + + /* Store qid for reuse and prevent clearing the bit. */ + qid = node->qid; + /* Pairs with mlx5e_get_txq_by_classid. */ + WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + mlx5e_deactivate_qos_sq(priv, qid); + mlx5e_close_qos_sq(priv, qid); + } + + /* Prevent packets from the old class from getting into the new one. */ + mlx5e_reset_qdisc(priv->netdev, qid); + + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + parent = node->parent; + mlx5e_sw_node_delete(priv, node); + + node = parent; + WRITE_ONCE(node->qid, qid); + + /* Early return on error in force mode. Parent will still be an inner + * node to be deleted by a following delete operation. + */ + if (saved_err) + return saved_err; + + old_hw_id = node->hw_id; + node->hw_id = new_hw_id; + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, node); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n", + classid, err); + } else { + mlx5e_activate_qos_sq(priv, node); + } + } + + err = mlx5_qos_destroy_node(priv->mdev, old_hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + return 0; +} + +static int mlx5e_qos_update_children(struct mlx5e_priv *priv, struct mlx5e_qos_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *child; + int err = 0; + int bkt; + + hash_for_each(priv->htb.qos_tc2node, bkt, child, hnode) { + u32 old_bw_share = child->bw_share; + int err_one; + + if (child->parent != node) + continue; + + mlx5e_htb_convert_rate(priv, child->rate, node, &child->bw_share); + if (child->bw_share == old_bw_share) + continue; + + err_one = mlx5_qos_update_node(priv->mdev, child->hw_id, child->bw_share, + child->max_average_bw, child->hw_id); + if (!err && err_one) { + err = err_one; + + NL_SET_ERR_MSG_MOD(extack, "Firmware error when modifying a child node."); + qos_err(priv->mdev, "Failed to modify a child node (class %04x), err = %d\n", + node->classid, err); + } + } + + return err; +} + +int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil, + struct netlink_ext_ack *extack) +{ + u32 bw_share, max_average_bw; + struct mlx5e_qos_node *node; + bool ceil_changed = false; + int err; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_MODIFY classid %04x, rate %llu, ceil %llu\n", + classid, rate, ceil); + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + node->rate = rate; + mlx5e_htb_convert_rate(priv, rate, node->parent, &bw_share); + mlx5e_htb_convert_ceil(priv, ceil, &max_average_bw); + + err = mlx5_qos_update_node(priv->mdev, node->parent->hw_id, bw_share, + max_average_bw, node->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when modifying a node."); + qos_err(priv->mdev, "Failed to modify a node (class %04x), err = %d\n", + classid, err); + return err; + } + + if (max_average_bw != node->max_average_bw) + ceil_changed = true; + + node->bw_share = bw_share; + node->max_average_bw = max_average_bw; + + if (ceil_changed) + err = mlx5e_qos_update_children(priv, node, extack); + + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h new file mode 100644 index 000000000000..5af7991fcd19 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5E_EN_QOS_H +#define __MLX5E_EN_QOS_H + +#include + +#define MLX5E_QOS_MAX_LEAF_NODES 256 + +struct mlx5e_priv; +struct mlx5e_channels; +struct mlx5e_channel; + +int mlx5e_qos_max_leaf_nodes(struct mlx5_core_dev *mdev); +int mlx5e_qos_cur_leaf_nodes(struct mlx5e_priv *priv); + +/* TX datapath API */ +int mlx5e_get_txq_by_classid(struct mlx5e_priv *priv, u16 classid); +struct mlx5e_txqsq *mlx5e_get_sq(struct mlx5e_priv *priv, int qid); + +/* SQ lifecycle */ +int mlx5e_qos_open_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs); +void mlx5e_qos_activate_queues(struct mlx5e_priv *priv); +void mlx5e_qos_deactivate_queues(struct mlx5e_channel *c); +void mlx5e_qos_close_queues(struct mlx5e_channel *c); + +/* HTB API */ +int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls, + struct netlink_ext_ack *extack); +int mlx5e_htb_root_del(struct mlx5e_priv *priv); +int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid, + u32 parent_classid, u64 rate, u64 ceil, + struct netlink_ext_ack *extack); +int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid, + u64 rate, u64 ceil, struct netlink_ext_ack *extack); +int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid, + u16 *new_qid, struct netlink_ext_ack *extack); +int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force, + struct netlink_ext_ack *extack); +int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil, + struct netlink_ext_ack *extack); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 2d37742a888c..2e5a0696374a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -447,6 +447,17 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, goto out; } + /* Don't allow changing the number of channels if HTB offload is active, + * because the numeration of the QoS SQs will change, while per-queue + * qdiscs are attached. + */ + if (priv->htb.maj_id) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the number of channels\n", + __func__); + goto out; + } + new_channels.params = priv->channels.params; new_channels.params.num_channels = count; @@ -1966,6 +1977,16 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable) if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) return -EOPNOTSUPP; + /* Don't allow changing the PTP state if HTB offload is active, because + * the numeration of the QoS SQs will change, while per-queue qdiscs are + * attached. + */ + if (priv->htb.maj_id) { + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the PTP state\n", + __func__); + return -EINVAL; + } + new_channels.params = priv->channels.params; MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_TX_PORT_TS, enable); /* No need to verify SQ stop room as diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f33c38629886..b9a175982801 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -65,6 +65,7 @@ #include "en/devlink.h" #include "lib/mlx5.h" #include "en/ptp.h" +#include "qos.h" bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { @@ -1143,7 +1144,6 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); - sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); @@ -1233,6 +1233,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, struct mlx5e_modify_sq_param *p) { + u64 bitmask = 0; void *in; void *sqc; int inlen; @@ -1248,9 +1249,14 @@ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, MLX5_SET(modify_sq_in, in, sq_state, p->curr_state); MLX5_SET(sqc, sqc, state, p->next_state); if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) { - MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); - MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + bitmask |= 1; + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); } + if (p->qos_update && p->next_state == MLX5_SQC_STATE_RDY) { + bitmask |= 1 << 2; + MLX5_SET(sqc, sqc, qos_queue_group_id, p->qos_queue_group_id); + } + MLX5_SET64(modify_sq_in, in, modify_bitmask, bitmask); err = mlx5_core_modify_sq(mdev, sqn, in); @@ -1267,6 +1273,7 @@ static void mlx5e_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn) int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, struct mlx5e_sq_param *param, struct mlx5e_create_sq_param *csp, + u16 qos_queue_group_id, u32 *sqn) { struct mlx5e_modify_sq_param msp = {0}; @@ -1278,6 +1285,10 @@ int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, msp.curr_state = MLX5_SQC_STATE_RST; msp.next_state = MLX5_SQC_STATE_RDY; + if (qos_queue_group_id) { + msp.qos_update = true; + msp.qos_queue_group_id = qos_queue_group_id; + } err = mlx5e_modify_sq(mdev, *sqn, &msp); if (err) mlx5e_destroy_sq(mdev, *sqn); @@ -1288,13 +1299,9 @@ int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, static int mlx5e_set_sq_maxrate(struct net_device *dev, struct mlx5e_txqsq *sq, u32 rate); -static int mlx5e_open_txqsq(struct mlx5e_channel *c, - u32 tisn, - int txq_ix, - struct mlx5e_params *params, - struct mlx5e_sq_param *param, - struct mlx5e_txqsq *sq, - int tc) +int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, + struct mlx5e_params *params, struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, u16 qos_qid) { struct mlx5e_create_sq_param csp = {}; u32 tx_rate; @@ -1304,12 +1311,17 @@ static int mlx5e_open_txqsq(struct mlx5e_channel *c, if (err) return err; + if (qos_queue_group_id) + sq->stats = c->priv->htb.qos_sq_stats[qos_qid]; + else + sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; + csp.tisn = tisn; csp.tis_lst_sz = 1; csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, qos_queue_group_id, &sq->sqn); if (err) goto err_free_txqsq; @@ -1366,7 +1378,7 @@ void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq) } } -static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) +void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) { struct mlx5_core_dev *mdev = sq->mdev; struct mlx5_rate_limit rl = {0}; @@ -1403,7 +1415,7 @@ int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params, csp.cqn = sq->cq.mcq.cqn; csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = params->tx_min_inline_mode; - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_icosq; @@ -1452,7 +1464,7 @@ int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, csp.wq_ctrl = &sq->wq_ctrl; csp.min_inline_mode = sq->min_inline_mode; set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); if (err) goto err_free_xdpsq; @@ -1703,7 +1715,7 @@ static int mlx5e_open_sqs(struct mlx5e_channel *c, int txq_ix = c->ix + tc * params->num_channels; err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix, - params, &cparam->txq_sq, &c->sq[tc], tc); + params, &cparam->txq_sq, &c->sq[tc], tc, 0, 0); if (err) goto err_close_sqs; } @@ -2044,6 +2056,8 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c) mlx5e_deactivate_icosq(&c->icosq); for (tc = 0; tc < c->num_tc; tc++) mlx5e_deactivate_txqsq(&c->sq[tc]); + + mlx5e_qos_deactivate_queues(c); } static void mlx5e_close_channel(struct mlx5e_channel *c) @@ -2051,6 +2065,7 @@ static void mlx5e_close_channel(struct mlx5e_channel *c) if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) mlx5e_close_xsk(c); mlx5e_close_queues(c); + mlx5e_qos_close_queues(c); netif_napi_del(&c->napi); kvfree(c); @@ -2198,9 +2213,8 @@ void mlx5e_build_sq_param_common(struct mlx5e_priv *priv, param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev)); } -static void mlx5e_build_sq_param(struct mlx5e_priv *priv, - struct mlx5e_params *params, - struct mlx5e_sq_param *param) +void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); @@ -2379,10 +2393,18 @@ int mlx5e_open_channels(struct mlx5e_priv *priv, goto err_close_channels; } + err = mlx5e_qos_open_queues(priv, chs); + if (err) + goto err_close_ptp; + mlx5e_health_channels_update(priv); kvfree(cparam); return 0; +err_close_ptp: + if (chs->port_ptp) + mlx5e_port_ptp_close(chs->port_ptp); + err_close_channels: for (i--; i >= 0; i--) mlx5e_close_channel(chs->c[i]); @@ -2915,11 +2937,31 @@ static void mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc) netdev_set_tc_queue(netdev, tc, nch, 0); } +int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) +{ + int qos_queues, nch, ntc, num_txqs, err; + + qos_queues = mlx5e_qos_cur_leaf_nodes(priv); + + nch = priv->channels.params.num_channels; + ntc = priv->channels.params.num_tc; + num_txqs = nch * ntc + qos_queues; + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) + num_txqs += ntc; + + mlx5e_dbg(DRV, priv, "Setting num_txqs %d\n", num_txqs); + err = netif_set_real_num_tx_queues(priv->netdev, num_txqs); + if (err) + netdev_warn(priv->netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + + return err; +} + static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) { struct net_device *netdev = priv->netdev; - int num_txqs, num_rxqs, nch, ntc; int old_num_txqs, old_ntc; + int num_rxqs, nch, ntc; int err; old_num_txqs = netdev->real_num_tx_queues; @@ -2927,18 +2969,13 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) nch = priv->channels.params.num_channels; ntc = priv->channels.params.num_tc; - num_txqs = nch * ntc; - if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) - num_txqs += ntc; num_rxqs = nch * priv->profile->rq_groups; mlx5e_netdev_set_tcs(netdev, nch, ntc); - err = netif_set_real_num_tx_queues(netdev, num_txqs); - if (err) { - netdev_warn(netdev, "netif_set_real_num_tx_queues failed, %d\n", err); + err = mlx5e_update_tx_netdev_queues(priv); + if (err) goto err_tcs; - } err = netif_set_real_num_rx_queues(netdev, num_rxqs); if (err) { netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err); @@ -3042,6 +3079,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv) mlx5e_update_num_tc_x_num_ch(priv); mlx5e_build_txq_maps(priv); mlx5e_activate_channels(&priv->channels); + mlx5e_qos_activate_queues(priv); mlx5e_xdp_tx_enable(priv); netif_tx_start_all_queues(priv->netdev); @@ -3608,6 +3646,14 @@ static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + /* MQPRIO is another toplevel qdisc that can't be attached + * simultaneously with the offloaded HTB. + */ + if (WARN_ON(priv->htb.maj_id)) { + err = -EINVAL; + goto out; + } + new_channels.params = priv->channels.params; new_channels.params.num_tc = tc ? tc : 1; @@ -3628,12 +3674,55 @@ out: return err; } +static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb) +{ + int res; + + switch (htb->command) { + case TC_HTB_CREATE: + return mlx5e_htb_root_add(priv, htb->parent_classid, htb->classid, + htb->extack); + case TC_HTB_DESTROY: + return mlx5e_htb_root_del(priv); + case TC_HTB_LEAF_ALLOC_QUEUE: + res = mlx5e_htb_leaf_alloc_queue(priv, htb->classid, htb->parent_classid, + htb->rate, htb->ceil, htb->extack); + if (res < 0) + return res; + htb->qid = res; + return 0; + case TC_HTB_LEAF_TO_INNER: + return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid, + htb->rate, htb->ceil, htb->extack); + case TC_HTB_LEAF_DEL: + return mlx5e_htb_leaf_del(priv, htb->classid, &htb->moved_qid, &htb->qid, + htb->extack); + case TC_HTB_LEAF_DEL_LAST: + case TC_HTB_LEAF_DEL_LAST_FORCE: + return mlx5e_htb_leaf_del_last(priv, htb->classid, + htb->command == TC_HTB_LEAF_DEL_LAST_FORCE, + htb->extack); + case TC_HTB_NODE_MODIFY: + return mlx5e_htb_node_modify(priv, htb->classid, htb->rate, htb->ceil, + htb->extack); + case TC_HTB_LEAF_QUERY_QUEUE: + res = mlx5e_get_txq_by_classid(priv, htb->classid); + if (res < 0) + return res; + htb->qid = res; + return 0; + default: + return -EOPNOTSUPP; + } +} + static LIST_HEAD(mlx5e_block_cb_list); static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct mlx5e_priv *priv = netdev_priv(dev); + int err; switch (type) { case TC_SETUP_BLOCK: { @@ -3647,6 +3736,11 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, } case TC_SETUP_QDISC_MQPRIO: return mlx5e_setup_tc_mqprio(priv, type_data); + case TC_SETUP_QDISC_HTB: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_htb(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; default: return -EOPNOTSUPP; } @@ -3811,20 +3905,25 @@ static int set_feature_cvlan_filter(struct net_device *netdev, bool enable) return 0; } -#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) -static int set_feature_tc_num_filters(struct net_device *netdev, bool enable) +static int set_feature_hw_tc(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) if (!enable && mlx5e_tc_num_filters(priv, MLX5_TC_FLAG(NIC_OFFLOAD))) { netdev_err(netdev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } +#endif + + if (!enable && priv->htb.maj_id) { + netdev_err(netdev, "Active HTB offload, can't turn hw_tc_offload off\n"); + return -EINVAL; + } return 0; } -#endif static int set_feature_rx_all(struct net_device *netdev, bool enable) { @@ -3922,9 +4021,7 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, set_feature_cvlan_filter); -#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) - err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters); -#endif + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc); err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all); err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs); err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); @@ -5028,6 +5125,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_features |= NETIF_F_NTUPLE; #endif } + if (mlx5_qos_is_supported(mdev)) + netdev->features |= NETIF_F_HW_TC; netdev->features |= NETIF_F_HIGHDMA; netdev->features |= NETIF_F_HW_VLAN_STAG_FILTER; @@ -5333,6 +5432,7 @@ int mlx5e_netdev_init(struct net_device *netdev, return -ENOMEM; mutex_init(&priv->state_lock); + hash_init(priv->htb.qos_tc2node); INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work); @@ -5355,8 +5455,14 @@ err_free_cpumask: void mlx5e_netdev_cleanup(struct net_device *netdev, struct mlx5e_priv *priv) { + int i; + destroy_workqueue(priv->wq); free_cpumask_var(priv->scratchpad.cpumask); + + for (i = 0; i < priv->htb.max_qos_sqs; i++) + kfree(priv->htb.qos_sq_stats[i]); + kvfree(priv->htb.qos_sq_stats); } struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev, @@ -5366,13 +5472,17 @@ struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev, { struct net_device *netdev; unsigned int ptp_txqs = 0; + int qos_sqs = 0; int err; if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) ptp_txqs = profile->max_tc; + if (mlx5_qos_is_supported(mdev)) + qos_sqs = mlx5e_qos_max_leaf_nodes(mdev); + netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), - nch * profile->max_tc + ptp_txqs, + nch * profile->max_tc + ptp_txqs + qos_sqs, nch * profile->rq_groups); if (!netdev) { mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 2cf2042b37c7..92c5b81427b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -420,6 +420,25 @@ static void mlx5e_stats_grp_sw_update_stats_ptp(struct mlx5e_priv *priv, } } +static void mlx5e_stats_grp_sw_update_stats_qos(struct mlx5e_priv *priv, + struct mlx5e_sw_stats *s) +{ + struct mlx5e_sq_stats **stats; + u16 max_qos_sqs; + int i; + + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs); + stats = READ_ONCE(priv->htb.qos_sq_stats); + + for (i = 0; i < max_qos_sqs; i++) { + mlx5e_stats_grp_sw_update_stats_sq(s, READ_ONCE(stats[i])); + + /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */ + barrier(); + } +} + static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) { struct mlx5e_sw_stats *s = &priv->stats.sw; @@ -449,6 +468,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) } } mlx5e_stats_grp_sw_update_stats_ptp(priv, s); + mlx5e_stats_grp_sw_update_stats_qos(priv, s); } static const struct counter_desc q_stats_desc[] = { @@ -1740,6 +1760,41 @@ static const struct counter_desc ptp_cq_stats_desc[] = { { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort_abs_diff_ns) }, }; +static const struct counter_desc qos_sq_stats_desc[] = { + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, nop) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, mpwqe_blks) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, mpwqe_pkts) }, +#ifdef CONFIG_MLX5_EN_TLS + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_ctx) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_ooo) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_dump_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_dump_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_resync_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_skip_no_sync_data) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_drop_no_sync_data) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_drop_bypass_req) }, +#endif + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_none) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, stopped) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, dropped) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, xmit_more) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, recover) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, wake) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, +}; + #define NUM_RQ_STATS ARRAY_SIZE(rq_stats_desc) #define NUM_SQ_STATS ARRAY_SIZE(sq_stats_desc) #define NUM_XDPSQ_STATS ARRAY_SIZE(xdpsq_stats_desc) @@ -1750,6 +1805,49 @@ static const struct counter_desc ptp_cq_stats_desc[] = { #define NUM_PTP_SQ_STATS ARRAY_SIZE(ptp_sq_stats_desc) #define NUM_PTP_CH_STATS ARRAY_SIZE(ptp_ch_stats_desc) #define NUM_PTP_CQ_STATS ARRAY_SIZE(ptp_cq_stats_desc) +#define NUM_QOS_SQ_STATS ARRAY_SIZE(qos_sq_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(qos) +{ + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + return NUM_QOS_SQ_STATS * smp_load_acquire(&priv->htb.max_qos_sqs); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(qos) +{ + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + u16 max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs); + int i, qid; + + for (qid = 0; qid < max_qos_sqs; qid++) + for (i = 0; i < NUM_QOS_SQ_STATS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + qos_sq_stats_desc[i].format, qid); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qos) +{ + struct mlx5e_sq_stats **stats; + u16 max_qos_sqs; + int i, qid; + + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs); + stats = READ_ONCE(priv->htb.qos_sq_stats); + + for (qid = 0; qid < max_qos_sqs; qid++) { + struct mlx5e_sq_stats *s = READ_ONCE(stats[qid]); + + for (i = 0; i < NUM_QOS_SQ_STATS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(s, qos_sq_stats_desc, i); + } + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qos) { return; } static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ptp) { @@ -1932,6 +2030,7 @@ MLX5E_DEFINE_STATS_GRP(per_port_buff_congest, 0); MLX5E_DEFINE_STATS_GRP(eth_ext, 0); static MLX5E_DEFINE_STATS_GRP(tls, 0); static MLX5E_DEFINE_STATS_GRP(ptp, 0); +static MLX5E_DEFINE_STATS_GRP(qos, 0); /* The stats groups order is opposite to the update_stats() order calls */ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = { @@ -1955,6 +2054,7 @@ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = { &MLX5E_STATS_GRP(channels), &MLX5E_STATS_GRP(per_port_buff_congest), &MLX5E_STATS_GRP(ptp), + &MLX5E_STATS_GRP(qos), }; unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index e41fc11f2ce7..93c41312fb03 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -55,6 +55,8 @@ #define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld) + struct counter_desc { char format[ETH_GSTRING_LEN]; size_t offset; /* Byte offset */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 74f233eece54..da6a358a8a10 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -106,28 +106,53 @@ return_txq: return priv->port_ptp_tc2realtxq[up]; } +static int mlx5e_select_htb_queue(struct mlx5e_priv *priv, struct sk_buff *skb, + u16 htb_maj_id) +{ + u16 classid; + + if ((TC_H_MAJ(skb->priority) >> 16) == htb_maj_id) + classid = TC_H_MIN(skb->priority); + else + classid = READ_ONCE(priv->htb.defcls); + + if (!classid) + return 0; + + return mlx5e_get_txq_by_classid(priv, classid); +} + u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct mlx5e_priv *priv = netdev_priv(dev); + int num_tc_x_num_ch; int txq_ix; int up = 0; int ch_ix; - if (unlikely(priv->channels.port_ptp)) { - int num_tc_x_num_ch; + /* Sync with mlx5e_update_num_tc_x_num_ch - avoid refetching. */ + num_tc_x_num_ch = READ_ONCE(priv->num_tc_x_num_ch); + if (unlikely(dev->real_num_tx_queues > num_tc_x_num_ch)) { + /* Order maj_id before defcls - pairs with mlx5e_htb_root_add. */ + u16 htb_maj_id = smp_load_acquire(&priv->htb.maj_id); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && - mlx5e_use_ptpsq(skb)) - return mlx5e_select_ptpsq(dev, skb); + if (unlikely(htb_maj_id)) { + txq_ix = mlx5e_select_htb_queue(priv, skb, htb_maj_id); + if (txq_ix > 0) + return txq_ix; + } - /* Sync with mlx5e_update_num_tc_x_num_ch - avoid refetching. */ - num_tc_x_num_ch = READ_ONCE(priv->num_tc_x_num_ch); + if (unlikely(priv->channels.port_ptp)) + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && + mlx5e_use_ptpsq(skb)) + return mlx5e_select_ptpsq(dev, skb); txq_ix = netdev_pick_tx(dev, skb, NULL); - /* Fix netdev_pick_tx() not to choose ptp_channel txqs. + /* Fix netdev_pick_tx() not to choose ptp_channel and HTB txqs. * If they are selected, switch to regular queues. - * Driver to select these queues only at mlx5e_select_ptpsq(). + * Driver to select these queues only at mlx5e_select_ptpsq() + * and mlx5e_select_htb_queue(). */ if (unlikely(txq_ix >= num_tc_x_num_ch)) txq_ix %= num_tc_x_num_ch; @@ -702,6 +727,10 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) u16 pi; sq = priv->txq2sq[skb_get_queue_mapping(skb)]; + if (unlikely(!sq)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } /* May send SKBs and WQEs. */ if (unlikely(!mlx5e_accel_tx_begin(dev, sq, skb, &accel))) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index a3cfe06d5116..d54da3797c30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -115,17 +115,21 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) napi); struct mlx5e_ch_stats *ch_stats = c->stats; struct mlx5e_xdpsq *xsksq = &c->xsksq; + struct mlx5e_txqsq __rcu **qos_sqs; struct mlx5e_rq *xskrq = &c->xskrq; struct mlx5e_rq *rq = &c->rq; bool aff_change = false; bool busy_xsk = false; bool busy = false; int work_done = 0; + u16 qos_sqs_size; bool xsk_open; int i; rcu_read_lock(); + qos_sqs = rcu_dereference(c->qos_sqs); + xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state); ch_stats->poll++; @@ -133,6 +137,18 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) for (i = 0; i < c->num_tc; i++) busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget); + if (unlikely(qos_sqs)) { + smp_rmb(); /* Pairs with mlx5e_qos_alloc_queues. */ + qos_sqs_size = READ_ONCE(c->qos_sqs_size); + + for (i = 0; i < qos_sqs_size; i++) { + struct mlx5e_txqsq *sq = rcu_dereference(qos_sqs[i]); + + if (sq) + busy |= mlx5e_poll_tx_cq(&sq->cq, budget); + } + } + busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq); if (c->xdp) @@ -186,6 +202,16 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) mlx5e_handle_tx_dim(&c->sq[i]); mlx5e_cq_arm(&c->sq[i].cq); } + if (unlikely(qos_sqs)) { + for (i = 0; i < qos_sqs_size; i++) { + struct mlx5e_txqsq *sq = rcu_dereference(qos_sqs[i]); + + if (sq) { + mlx5e_handle_tx_dim(sq); + mlx5e_cq_arm(&sq->cq); + } + } + } mlx5e_handle_rx_dim(rq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/qos.c new file mode 100644 index 000000000000..0777be24a307 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/qos.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#include "qos.h" + +#define MLX5_QOS_DEFAULT_DWRR_UID 0 + +bool mlx5_qos_is_supported(struct mlx5_core_dev *mdev) +{ + if (!MLX5_CAP_GEN(mdev, qos)) + return false; + if (!MLX5_CAP_QOS(mdev, nic_sq_scheduling)) + return false; + if (!MLX5_CAP_QOS(mdev, nic_bw_share)) + return false; + if (!MLX5_CAP_QOS(mdev, nic_rate_limit)) + return false; + return true; +} + +int mlx5_qos_max_leaf_nodes(struct mlx5_core_dev *mdev) +{ + return 1 << MLX5_CAP_QOS(mdev, log_max_qos_nic_queue_group); +} + +int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw); + + return mlx5_create_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, + sched_ctx, id); +} + +int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + void *attr; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw); + + attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); + MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR); + + return mlx5_create_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, + sched_ctx, id); +} + +int mlx5_qos_create_root_node(struct mlx5_core_dev *mdev, u32 *id) +{ + return mlx5_qos_create_inner_node(mdev, MLX5_QOS_DEFAULT_DWRR_UID, 0, 0, id); +} + +int mlx5_qos_update_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 id) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + u32 bitmask = 0; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw); + + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE; + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW; + + return mlx5_modify_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, + sched_ctx, id, bitmask); +} + +int mlx5_qos_destroy_node(struct mlx5_core_dev *mdev, u32 id) +{ + return mlx5_destroy_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, id); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/qos.h new file mode 100644 index 000000000000..125e4e47e6f7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/qos.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5_QOS_H +#define __MLX5_QOS_H + +#include "mlx5_core.h" + +#define MLX5_DEBUG_QOS_MASK BIT(4) + +#define qos_err(mdev, fmt, ...) \ + mlx5_core_err(mdev, "QoS: " fmt, ##__VA_ARGS__) +#define qos_warn(mdev, fmt, ...) \ + mlx5_core_warn(mdev, "QoS: " fmt, ##__VA_ARGS__) +#define qos_dbg(mdev, fmt, ...) \ + mlx5_core_dbg_mask(mdev, MLX5_DEBUG_QOS_MASK, "QoS: " fmt, ##__VA_ARGS__) + +bool mlx5_qos_is_supported(struct mlx5_core_dev *mdev); +int mlx5_qos_max_leaf_nodes(struct mlx5_core_dev *mdev); + +int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id); +int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id); +int mlx5_qos_create_root_node(struct mlx5_core_dev *mdev, u32 *id); +int mlx5_qos_update_node(struct mlx5_core_dev *mdev, u32 parent_id, u32 bw_share, + u32 max_avg_bw, u32 id); +int mlx5_qos_destroy_node(struct mlx5_core_dev *mdev, u32 id); + +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 823411e288c0..71ae6aac3410 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -842,11 +842,16 @@ struct mlx5_ifc_qos_cap_bits { u8 reserved_at_4[0x1]; u8 packet_pacing_burst_bound[0x1]; u8 packet_pacing_typical_size[0x1]; - u8 reserved_at_7[0x4]; + u8 reserved_at_7[0x1]; + u8 nic_sq_scheduling[0x1]; + u8 nic_bw_share[0x1]; + u8 nic_rate_limit[0x1]; u8 packet_pacing_uid[0x1]; u8 reserved_at_c[0x14]; - u8 reserved_at_20[0x20]; + u8 reserved_at_20[0xb]; + u8 log_max_qos_nic_queue_group[0x5]; + u8 reserved_at_30[0x10]; u8 packet_pacing_max_rate[0x20]; @@ -3347,7 +3352,7 @@ struct mlx5_ifc_sqc_bits { u8 reserved_at_e0[0x10]; u8 packet_pacing_rate_limit_index[0x10]; u8 tis_lst_sz[0x10]; - u8 reserved_at_110[0x10]; + u8 qos_queue_group_id[0x10]; u8 reserved_at_120[0x40]; @@ -3362,6 +3367,7 @@ enum { SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT = 0x1, SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2, SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3, + SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP = 0x4, }; enum { @@ -4805,6 +4811,7 @@ struct mlx5_ifc_query_scheduling_element_out_bits { enum { SCHEDULING_HIERARCHY_E_SWITCH = 0x2, + SCHEDULING_HIERARCHY_NIC = 0x3, }; struct mlx5_ifc_query_scheduling_element_in_bits { -- cgit v1.2.3 From 321f7ab0d45899fe0313139822d69c2d5adbb760 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 21 Jan 2021 15:10:23 +0200 Subject: mlxsw: Register physical ports as a devlink resource The switch ASIC has a limited capacity of physical ('flavour physical' in devlink terminology) ports that it can support. While each system is brought up with a different number of ports, this number can be increased via splitting up to the ASIC's limit. Expose physical ports as a devlink resource so that user space will have visibility to the maximum number of ports that can be supported and the current occupancy. In addition, add a "Generic Resources" section in devlink-resource documentation so the different drivers will be aligned by the same resource name when exposing to user space. Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-resource.rst | 14 ++++ drivers/net/ethernet/mellanox/mlxsw/core.c | 77 +++++++++++++++++++--- drivers/net/ethernet/mellanox/mlxsw/core.h | 5 ++ drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 +- include/net/devlink.h | 2 + net/core/devlink.c | 4 ++ 6 files changed, 93 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/networking/devlink/devlink-resource.rst b/Documentation/networking/devlink/devlink-resource.rst index 93e92d2f0752..3d5ae51e65a2 100644 --- a/Documentation/networking/devlink/devlink-resource.rst +++ b/Documentation/networking/devlink/devlink-resource.rst @@ -23,6 +23,20 @@ current size and related sub resources. To access a sub resource, you specify the path of the resource. For example ``/IPv4/fib`` is the id for the ``fib`` sub-resource under the ``IPv4`` resource. +Generic Resources +================= + +Generic resources are used to describe resources that can be shared by multiple +device drivers and their description must be added to the following table: + +.. list-table:: List of Generic Resources + :widths: 10 90 + + * - Name + - Description + * - ``physical_ports`` + - A limited capacity of physical ports that the switch ASIC can support + example usage ------------- diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 685037e052af..52fdc34251ba 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -84,6 +84,7 @@ struct mlxsw_core { struct mlxsw_thermal *thermal; struct mlxsw_core_port *ports; unsigned int max_ports; + atomic_t active_ports_count; bool fw_flash_in_progress; struct { struct devlink_health_reporter *fw_fatal; @@ -96,8 +97,36 @@ struct mlxsw_core { #define MLXSW_PORT_MAX_PORTS_DEFAULT 0x40 -static int mlxsw_ports_init(struct mlxsw_core *mlxsw_core) +static u64 mlxsw_ports_occ_get(void *priv) { + struct mlxsw_core *mlxsw_core = priv; + + return atomic_read(&mlxsw_core->active_ports_count); +} + +static int mlxsw_core_resources_ports_register(struct mlxsw_core *mlxsw_core) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + struct devlink_resource_size_params ports_num_params; + u32 max_ports; + + max_ports = mlxsw_core->max_ports - 1; + devlink_resource_size_params_init(&ports_num_params, max_ports, + max_ports, 1, + DEVLINK_RESOURCE_UNIT_ENTRY); + + return devlink_resource_register(devlink, + DEVLINK_RESOURCE_GENERIC_NAME_PORTS, + max_ports, MLXSW_CORE_RESOURCE_PORTS, + DEVLINK_RESOURCE_ID_PARENT_TOP, + &ports_num_params); +} + +static int mlxsw_ports_init(struct mlxsw_core *mlxsw_core, bool reload) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + int err; + /* Switch ports are numbered from 1 to queried value */ if (MLXSW_CORE_RES_VALID(mlxsw_core, MAX_SYSTEM_PORT)) mlxsw_core->max_ports = MLXSW_CORE_RES_GET(mlxsw_core, @@ -110,11 +139,30 @@ static int mlxsw_ports_init(struct mlxsw_core *mlxsw_core) if (!mlxsw_core->ports) return -ENOMEM; + if (!reload) { + err = mlxsw_core_resources_ports_register(mlxsw_core); + if (err) + goto err_resources_ports_register; + } + atomic_set(&mlxsw_core->active_ports_count, 0); + devlink_resource_occ_get_register(devlink, MLXSW_CORE_RESOURCE_PORTS, + mlxsw_ports_occ_get, mlxsw_core); + return 0; + +err_resources_ports_register: + kfree(mlxsw_core->ports); + return err; } -static void mlxsw_ports_fini(struct mlxsw_core *mlxsw_core) +static void mlxsw_ports_fini(struct mlxsw_core *mlxsw_core, bool reload) { + struct devlink *devlink = priv_to_devlink(mlxsw_core); + + devlink_resource_occ_get_unregister(devlink, MLXSW_CORE_RESOURCE_PORTS); + if (!reload) + devlink_resources_unregister(priv_to_devlink(mlxsw_core), NULL); + kfree(mlxsw_core->ports); } @@ -1897,7 +1945,7 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, goto err_register_resources; } - err = mlxsw_ports_init(mlxsw_core); + err = mlxsw_ports_init(mlxsw_core, reload); if (err) goto err_ports_init; @@ -1986,7 +2034,7 @@ err_devlink_register: err_emad_init: kfree(mlxsw_core->lag.mapping); err_alloc_lag_mapping: - mlxsw_ports_fini(mlxsw_core); + mlxsw_ports_fini(mlxsw_core, reload); err_ports_init: if (!reload) devlink_resources_unregister(devlink, NULL); @@ -2056,7 +2104,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, devlink_unregister(devlink); mlxsw_emad_fini(mlxsw_core); kfree(mlxsw_core->lag.mapping); - mlxsw_ports_fini(mlxsw_core); + mlxsw_ports_fini(mlxsw_core, reload); if (!reload) devlink_resources_unregister(devlink, NULL); mlxsw_core->bus->fini(mlxsw_core->bus_priv); @@ -2755,16 +2803,25 @@ int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, const unsigned char *switch_id, unsigned char switch_id_len) { - return __mlxsw_core_port_init(mlxsw_core, local_port, - DEVLINK_PORT_FLAVOUR_PHYSICAL, - port_number, split, split_port_subnumber, - splittable, lanes, - switch_id, switch_id_len); + int err; + + err = __mlxsw_core_port_init(mlxsw_core, local_port, + DEVLINK_PORT_FLAVOUR_PHYSICAL, + port_number, split, split_port_subnumber, + splittable, lanes, + switch_id, switch_id_len); + if (err) + return err; + + atomic_inc(&mlxsw_core->active_ports_count); + return 0; } EXPORT_SYMBOL(mlxsw_core_port_init); void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port) { + atomic_dec(&mlxsw_core->active_ports_count); + __mlxsw_core_port_fini(mlxsw_core, local_port); } EXPORT_SYMBOL(mlxsw_core_port_fini); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 6b3ccbf6b238..8af7d9d03475 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -19,6 +19,11 @@ #include "cmd.h" #include "resources.h" +enum mlxsw_core_resource_id { + MLXSW_CORE_RESOURCE_PORTS = 1, + MLXSW_CORE_RESOURCE_MAX, +}; + struct mlxsw_core; struct mlxsw_core_port; struct mlxsw_driver; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index a6956cfc9cb1..a3769f95a182 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -52,7 +52,7 @@ #define MLXSW_SP_RESOURCE_NAME_COUNTERS_RIF "rif" enum mlxsw_sp_resource_id { - MLXSW_SP_RESOURCE_KVD = 1, + MLXSW_SP_RESOURCE_KVD = MLXSW_CORE_RESOURCE_MAX, MLXSW_SP_RESOURCE_KVD_LINEAR, MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, diff --git a/include/net/devlink.h b/include/net/devlink.h index f466819cc477..d12ed2854c34 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -380,6 +380,8 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 +#define DEVLINK_RESOURCE_GENERIC_NAME_PORTS "physical_ports" + #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { DEVLINK_PARAM_TYPE_U8, diff --git a/net/core/devlink.c b/net/core/devlink.c index 738d4344d679..72ea79879762 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8617,6 +8617,10 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); * @resource_id: resource's id * @parent_resource_id: resource's parent id * @size_params: size parameters + * + * Generic resources should reuse the same names across drivers. + * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst */ int devlink_resource_register(struct devlink *devlink, const char *resource_name, -- cgit v1.2.3 From 6f1c0ea133a6e4a193a7b285efe209664caeea43 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 22 Jan 2021 18:19:48 +0000 Subject: net: introduce a netdev feature for UDP GRO forwarding Introduce a new netdev feature, NETIF_F_GRO_UDP_FWD, to allow user to turn UDP GRO on and off for forwarding. Defaults to off to not change current datapath. Suggested-by: Paolo Abeni Signed-off-by: Alexander Lobakin Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 4 +++- net/ethtool/common.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 934de56644e7..c06d6aaba9df 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -84,6 +84,7 @@ enum { NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */ NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */ + NETIF_F_GRO_UDP_FWD_BIT, /* Allow UDP GRO for forwarding */ /* * Add your fresh new feature above and remember to update @@ -157,6 +158,7 @@ enum { #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST) #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST) #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC) +#define NETIF_F_GRO_UDP_FWD __NETIF_F(GRO_UDP_FWD) /* Finds the next feature with the highest number of the range of start till 0. */ @@ -234,7 +236,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) /* Changeable features with no special hardware requirements that defaults to off. */ -#define NETIF_F_SOFT_FEATURES_OFF NETIF_F_GRO_FRAGLIST +#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD) #define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ NETIF_F_HW_VLAN_CTAG_RX | \ diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 24036e3055a1..181220101a6e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -68,6 +68,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload", + [NETIF_F_GRO_UDP_FWD_BIT] = "rx-udp-gro-forwarding", }; const char -- cgit v1.2.3 From b4a221ea8a1f890b50838ef389d016c7ff280abc Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Fri, 22 Jan 2021 16:36:11 +0800 Subject: Bluetooth: advmon offload MSFT add rssi support MSFT needs rssi parameter for monitoring advertisement packet, therefore we should supply them from mgmt. This adds a new opcode to add advertisement monitor with rssi parameters. Signed-off-by: Archie Pusaka Reviewed-by: Manish Mandlik Reviewed-by: Miao-chen Chou Reviewed-by: Yun-Hao Chung Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 9 ++ include/net/bluetooth/mgmt.h | 16 +++ net/bluetooth/mgmt.c | 225 ++++++++++++++++++++++++++------------- 3 files changed, 178 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 677a8c50b2ad..8b7cf3620938 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -250,8 +250,17 @@ struct adv_pattern { __u8 value[HCI_MAX_AD_LENGTH]; }; +struct adv_rssi_thresholds { + __s8 low_threshold; + __s8 high_threshold; + __u16 low_threshold_timeout; + __u16 high_threshold_timeout; + __u8 sampling_period; +}; + struct adv_monitor { struct list_head patterns; + struct adv_rssi_thresholds rssi; bool active; __u16 handle; }; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index f9a6638e20b3..839a2028009e 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -821,6 +821,22 @@ struct mgmt_rp_add_ext_adv_data { __u8 instance; } __packed; +struct mgmt_adv_rssi_thresholds { + __s8 high_threshold; + __le16 high_threshold_timeout; + __s8 low_threshold; + __le16 low_threshold_timeout; + __u8 sampling_period; +} __packed; + +#define MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI 0x0056 +struct mgmt_cp_add_adv_patterns_monitor_rssi { + struct mgmt_adv_rssi_thresholds rssi; + __u8 pattern_count; + struct mgmt_adv_pattern patterns[]; +} __packed; +#define MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE 8 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 608dda5403b7..72d37c80e071 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -124,6 +124,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_REMOVE_ADV_MONITOR, MGMT_OP_ADD_EXT_ADV_PARAMS, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, }; static const u16 mgmt_events[] = { @@ -4225,75 +4226,15 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, return err; } -static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, - void *data, u16 len) +static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, + struct adv_monitor *m, u8 status, u16 op) { - struct mgmt_cp_add_adv_patterns_monitor *cp = data; struct mgmt_rp_add_adv_patterns_monitor rp; - struct adv_monitor *m = NULL; - struct adv_pattern *p = NULL; - unsigned int mp_cnt = 0, prev_adv_monitors_cnt; - __u8 cp_ofst = 0, cp_len = 0; - int err, i; - - BT_DBG("request for %s", hdev->name); - - if (len <= sizeof(*cp) || cp->pattern_count == 0) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; - } - - m = kmalloc(sizeof(*m), GFP_KERNEL); - if (!m) { - err = -ENOMEM; - goto failed; - } - - INIT_LIST_HEAD(&m->patterns); - m->active = false; - - for (i = 0; i < cp->pattern_count; i++) { - if (++mp_cnt > HCI_MAX_ADV_MONITOR_NUM_PATTERNS) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; - } - - cp_ofst = cp->patterns[i].offset; - cp_len = cp->patterns[i].length; - if (cp_ofst >= HCI_MAX_AD_LENGTH || - cp_len > HCI_MAX_AD_LENGTH || - (cp_ofst + cp_len) > HCI_MAX_AD_LENGTH) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); - goto failed; - } - - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) { - err = -ENOMEM; - goto failed; - } - - p->ad_type = cp->patterns[i].ad_type; - p->offset = cp->patterns[i].offset; - p->length = cp->patterns[i].length; - memcpy(p->value, cp->patterns[i].value, p->length); - - INIT_LIST_HEAD(&p->list); - list_add(&p->list, &m->patterns); - } + unsigned int prev_adv_monitors_cnt; + int err; - if (mp_cnt != cp->pattern_count) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_INVALID_PARAMS); + if (status) goto failed; - } hci_dev_lock(hdev); @@ -4301,11 +4242,11 @@ static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, err = hci_add_adv_monitor(hdev, m); if (err) { - if (err == -ENOSPC) { - mgmt_cmd_status(sk, hdev->id, - MGMT_OP_ADD_ADV_PATTERNS_MONITOR, - MGMT_STATUS_NO_RESOURCES); - } + if (err == -ENOSPC) + status = MGMT_STATUS_NO_RESOURCES; + else + status = MGMT_STATUS_FAILED; + goto unlock; } @@ -4316,7 +4257,7 @@ static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, rp.monitor_handle = cpu_to_le16(m->handle); - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + return mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); unlock: @@ -4324,7 +4265,144 @@ unlock: failed: hci_free_adv_monitor(m); - return err; + return mgmt_cmd_status(sk, hdev->id, op, status); +} + +static void parse_adv_monitor_rssi(struct adv_monitor *m, + struct mgmt_adv_rssi_thresholds *rssi) +{ + if (rssi) { + m->rssi.low_threshold = rssi->low_threshold; + m->rssi.low_threshold_timeout = + __le16_to_cpu(rssi->low_threshold_timeout); + m->rssi.high_threshold = rssi->high_threshold; + m->rssi.high_threshold_timeout = + __le16_to_cpu(rssi->high_threshold_timeout); + m->rssi.sampling_period = rssi->sampling_period; + } else { + /* Default values. These numbers are the least constricting + * parameters for MSFT API to work, so it behaves as if there + * are no rssi parameter to consider. May need to be changed + * if other API are to be supported. + */ + m->rssi.low_threshold = -127; + m->rssi.low_threshold_timeout = 60; + m->rssi.high_threshold = -127; + m->rssi.high_threshold_timeout = 0; + m->rssi.sampling_period = 0; + } +} + +static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, + struct mgmt_adv_pattern *patterns) +{ + u8 offset = 0, length = 0; + struct adv_pattern *p = NULL; + unsigned int mp_cnt = 0; + int i; + + for (i = 0; i < pattern_count; i++) { + if (++mp_cnt > HCI_MAX_ADV_MONITOR_NUM_PATTERNS) + return MGMT_STATUS_INVALID_PARAMS; + + offset = patterns[i].offset; + length = patterns[i].length; + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) + return MGMT_STATUS_INVALID_PARAMS; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return MGMT_STATUS_NO_RESOURCES; + + p->ad_type = patterns[i].ad_type; + p->offset = patterns[i].offset; + p->length = patterns[i].length; + memcpy(p->value, patterns[i].value, p->length); + + INIT_LIST_HEAD(&p->list); + list_add(&p->list, &m->patterns); + } + + if (mp_cnt != pattern_count) + return MGMT_STATUS_INVALID_PARAMS; + + return MGMT_STATUS_SUCCESS; +} + +static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_add_adv_patterns_monitor *cp = data; + struct adv_monitor *m = NULL; + u8 status = MGMT_STATUS_SUCCESS; + size_t expected_size = sizeof(*cp); + + BT_DBG("request for %s", hdev->name); + + if (len <= sizeof(*cp)) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; + } + + expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern); + if (len != expected_size) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; + } + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) { + status = MGMT_STATUS_NO_RESOURCES; + goto done; + } + + INIT_LIST_HEAD(&m->patterns); + + parse_adv_monitor_rssi(m, NULL); + status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns); + +done: + return __add_adv_patterns_monitor(sk, hdev, m, status, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR); +} + +static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_add_adv_patterns_monitor_rssi *cp = data; + struct adv_monitor *m = NULL; + u8 status = MGMT_STATUS_SUCCESS; + size_t expected_size = sizeof(*cp); + + BT_DBG("request for %s", hdev->name); + + if (len <= sizeof(*cp)) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; + } + + expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern); + if (len != expected_size) { + status = MGMT_STATUS_INVALID_PARAMS; + goto done; + } + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) { + status = MGMT_STATUS_NO_RESOURCES; + goto done; + } + + INIT_LIST_HEAD(&m->patterns); + + parse_adv_monitor_rssi(m, &cp->rssi); + status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns); + +done: + return __add_adv_patterns_monitor(sk, hdev, m, status, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI); } static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, @@ -8242,6 +8320,9 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { HCI_MGMT_VAR_LEN }, { add_ext_adv_data, MGMT_ADD_EXT_ADV_DATA_SIZE, HCI_MGMT_VAR_LEN }, + { add_adv_patterns_monitor_rssi, + MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE, + HCI_MGMT_VAR_LEN }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From a2a4dedf88ab2f807a7ca90947d686816b430f97 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Fri, 22 Jan 2021 16:36:12 +0800 Subject: Bluetooth: advmon offload MSFT add monitor Enables advertising monitor offloading to the controller, if MSFT extension is supported. The kernel won't adjust the monitor parameters to match what the controller supports - that is the user space's responsibility. This patch only manages the addition of monitors. Monitor removal is going to be handled by another patch. Signed-off-by: Archie Pusaka Reviewed-by: Manish Mandlik Reviewed-by: Miao-chen Chou Reviewed-by: Yun-Hao Chung Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 17 +++- net/bluetooth/hci_core.c | 55 +++++++++-- net/bluetooth/mgmt.c | 114 ++++++++++++++++------ net/bluetooth/msft.c | 201 ++++++++++++++++++++++++++++++++++++++- net/bluetooth/msft.h | 12 +++ 5 files changed, 356 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8b7cf3620938..879d1e38ce96 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -261,13 +261,20 @@ struct adv_rssi_thresholds { struct adv_monitor { struct list_head patterns; struct adv_rssi_thresholds rssi; - bool active; __u16 handle; + + enum { + ADV_MONITOR_STATE_NOT_REGISTERED, + ADV_MONITOR_STATE_REGISTERED, + ADV_MONITOR_STATE_OFFLOADED + } state; }; #define HCI_MIN_ADV_MONITOR_HANDLE 1 -#define HCI_MAX_ADV_MONITOR_NUM_HANDLES 32 +#define HCI_MAX_ADV_MONITOR_NUM_HANDLES 32 #define HCI_MAX_ADV_MONITOR_NUM_PATTERNS 16 +#define HCI_ADV_MONITOR_EXT_NONE 1 +#define HCI_ADV_MONITOR_EXT_MSFT 2 #define HCI_MAX_SHORT_NAME_LENGTH 10 @@ -1326,9 +1333,12 @@ void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); void hci_adv_monitors_clear(struct hci_dev *hdev); void hci_free_adv_monitor(struct adv_monitor *monitor); -int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor); +int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status); +bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + int *err); int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle); bool hci_is_adv_monitoring(struct hci_dev *hdev); +int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); @@ -1804,6 +1814,7 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); +int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9d2c9a1c552f..625298f64a20 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3070,27 +3070,56 @@ void hci_free_adv_monitor(struct adv_monitor *monitor) kfree(monitor); } -/* This function requires the caller holds hdev->lock */ -int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) +int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) +{ + return mgmt_add_adv_patterns_monitor_complete(hdev, status); +} + +/* Assigns handle to a monitor, and if offloading is supported and power is on, + * also attempts to forward the request to the controller. + * Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. + */ +bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + int *err) { int min, max, handle; - if (!monitor) - return -EINVAL; + *err = 0; + + if (!monitor) { + *err = -EINVAL; + return false; + } min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); - if (handle < 0) - return handle; + if (handle < 0) { + *err = handle; + return false; + } - hdev->adv_monitors_cnt++; monitor->handle = handle; - hci_update_background_scan(hdev); + if (!hdev_is_powered(hdev)) + return false; - return 0; + switch (hci_get_adv_monitor_offload_ext(hdev)) { + case HCI_ADV_MONITOR_EXT_NONE: + hci_update_background_scan(hdev); + bt_dev_dbg(hdev, "%s add monitor status %d", hdev->name, *err); + /* Message was not forwarded to controller - not an error */ + return false; + case HCI_ADV_MONITOR_EXT_MSFT: + *err = msft_add_monitor_pattern(hdev, monitor); + bt_dev_dbg(hdev, "%s add monitor msft status %d", hdev->name, + *err); + break; + } + + return (*err == 0); } static int free_adv_monitor(int id, void *ptr, void *data) @@ -3134,6 +3163,14 @@ bool hci_is_adv_monitoring(struct hci_dev *hdev) return !idr_is_empty(&hdev->adv_monitors_idr); } +int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) +{ + if (msft_monitor_supported(hdev)) + return HCI_ADV_MONITOR_EXT_MSFT; + + return HCI_ADV_MONITOR_EXT_NONE; +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 72d37c80e071..fea5e9763b72 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4185,6 +4185,7 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, int handle, err; size_t rp_size = 0; __u32 supported = 0; + __u32 enabled = 0; __u16 num_handles = 0; __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES]; @@ -4192,12 +4193,11 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR) + if (msft_monitor_supported(hdev)) supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS; - idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) { + idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) handles[num_handles++] = monitor->handle; - } hci_dev_unlock(hdev); @@ -4206,11 +4206,11 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, if (!rp) return -ENOMEM; - /* Once controller-based monitoring is in place, the enabled_features - * should reflect the use. - */ + /* All supported features are currently enabled */ + enabled = supported; + rp->supported_features = cpu_to_le32(supported); - rp->enabled_features = 0; + rp->enabled_features = cpu_to_le32(enabled); rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES); rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS; rp->num_handles = cpu_to_le16(num_handles); @@ -4226,44 +4226,105 @@ static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, return err; } +int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) +{ + struct mgmt_rp_add_adv_patterns_monitor rp; + struct mgmt_pending_cmd *cmd; + struct adv_monitor *monitor; + int err = 0; + + hci_dev_lock(hdev); + + cmd = pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev); + if (!cmd) { + cmd = pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev); + if (!cmd) + goto done; + } + + monitor = cmd->user_data; + rp.monitor_handle = cpu_to_le16(monitor->handle); + + if (!status) { + mgmt_adv_monitor_added(cmd->sk, hdev, monitor->handle); + hdev->adv_monitors_cnt++; + if (monitor->state == ADV_MONITOR_STATE_NOT_REGISTERED) + monitor->state = ADV_MONITOR_STATE_REGISTERED; + hci_update_background_scan(hdev); + } + + err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), &rp, sizeof(rp)); + mgmt_pending_remove(cmd); + +done: + hci_dev_unlock(hdev); + bt_dev_dbg(hdev, "add monitor %d complete, status %d", + rp.monitor_handle, status); + + return err; +} + static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, - struct adv_monitor *m, u8 status, u16 op) + struct adv_monitor *m, u8 status, + void *data, u16 len, u16 op) { struct mgmt_rp_add_adv_patterns_monitor rp; - unsigned int prev_adv_monitors_cnt; + struct mgmt_pending_cmd *cmd; int err; + bool pending; + + hci_dev_lock(hdev); if (status) - goto failed; + goto unlock; - hci_dev_lock(hdev); + if (pending_find(MGMT_OP_SET_LE, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev) || + pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) { + status = MGMT_STATUS_BUSY; + goto unlock; + } - prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + cmd = mgmt_pending_add(sk, op, hdev, data, len); + if (!cmd) { + status = MGMT_STATUS_NO_RESOURCES; + goto unlock; + } - err = hci_add_adv_monitor(hdev, m); + pending = hci_add_adv_monitor(hdev, m, &err); if (err) { - if (err == -ENOSPC) + if (err == -ENOSPC || err == -ENOMEM) status = MGMT_STATUS_NO_RESOURCES; + else if (err == -EINVAL) + status = MGMT_STATUS_INVALID_PARAMS; else status = MGMT_STATUS_FAILED; + mgmt_pending_remove(cmd); goto unlock; } - if (hdev->adv_monitors_cnt > prev_adv_monitors_cnt) + if (!pending) { + mgmt_pending_remove(cmd); + rp.monitor_handle = cpu_to_le16(m->handle); mgmt_adv_monitor_added(sk, hdev, m->handle); + m->state = ADV_MONITOR_STATE_REGISTERED; + hdev->adv_monitors_cnt++; - hci_dev_unlock(hdev); + hci_dev_unlock(hdev); + return mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_SUCCESS, + &rp, sizeof(rp)); + } - rp.monitor_handle = cpu_to_le16(m->handle); + hci_dev_unlock(hdev); - return mgmt_cmd_complete(sk, hdev->id, op, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + cmd->user_data = m; + return 0; unlock: hci_dev_unlock(hdev); - -failed: hci_free_adv_monitor(m); return mgmt_cmd_status(sk, hdev->id, op, status); } @@ -4298,13 +4359,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, { u8 offset = 0, length = 0; struct adv_pattern *p = NULL; - unsigned int mp_cnt = 0; int i; for (i = 0; i < pattern_count; i++) { - if (++mp_cnt > HCI_MAX_ADV_MONITOR_NUM_PATTERNS) - return MGMT_STATUS_INVALID_PARAMS; - offset = patterns[i].offset; length = patterns[i].length; if (offset >= HCI_MAX_AD_LENGTH || @@ -4325,9 +4382,6 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, list_add(&p->list, &m->patterns); } - if (mp_cnt != pattern_count) - return MGMT_STATUS_INVALID_PARAMS; - return MGMT_STATUS_SUCCESS; } @@ -4364,7 +4418,7 @@ static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns); done: - return __add_adv_patterns_monitor(sk, hdev, m, status, + return __add_adv_patterns_monitor(sk, hdev, m, status, data, len, MGMT_OP_ADD_ADV_PATTERNS_MONITOR); } @@ -4401,7 +4455,7 @@ static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev, status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns); done: - return __add_adv_patterns_monitor(sk, hdev, m, status, + return __add_adv_patterns_monitor(sk, hdev, m, status, data, len, MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI); } diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 4b39534a14a1..e4b8fe71b9c3 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -5,9 +5,16 @@ #include #include +#include +#include "hci_request.h" +#include "mgmt_util.h" #include "msft.h" +#define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 +#define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 +#define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C + #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; @@ -21,12 +28,55 @@ struct msft_rp_read_supported_features { __u8 evt_prefix[]; } __packed; +#define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 +#define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 +struct msft_le_monitor_advertisement_pattern { + __u8 length; + __u8 data_type; + __u8 start_byte; + __u8 pattern[0]; +}; + +struct msft_le_monitor_advertisement_pattern_data { + __u8 count; + __u8 data[0]; +}; + +struct msft_cp_le_monitor_advertisement { + __u8 sub_opcode; + __s8 rssi_high; + __s8 rssi_low; + __u8 rssi_low_interval; + __u8 rssi_sampling_period; + __u8 cond_type; + __u8 data[0]; +} __packed; + +struct msft_rp_le_monitor_advertisement { + __u8 status; + __u8 sub_opcode; + __u8 handle; +} __packed; + +struct msft_monitor_advertisement_handle_data { + __u8 msft_handle; + __u16 mgmt_handle; + struct list_head list; +}; + struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; + struct list_head handle_map; + __u16 pending_add_handle; }; +bool msft_monitor_supported(struct hci_dev *hdev) +{ + return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); +} + static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { @@ -90,12 +140,14 @@ void msft_do_open(struct hci_dev *hdev) return; } + INIT_LIST_HEAD(&msft->handle_map); hdev->msft_data = msft; } void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; + struct msft_monitor_advertisement_handle_data *handle_data, *tmp; if (!msft) return; @@ -104,6 +156,11 @@ void msft_do_close(struct hci_dev *hdev) hdev->msft_data = NULL; + list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { + list_del(&handle_data->list); + kfree(handle_data); + } + kfree(msft->evt_prefix); kfree(msft); } @@ -145,5 +202,147 @@ __u64 msft_get_features(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; - return msft ? msft->features : 0; + return msft ? msft->features : 0; +} + +static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, + u8 status, u16 opcode, + struct sk_buff *skb) +{ + struct msft_rp_le_monitor_advertisement *rp; + struct adv_monitor *monitor; + struct msft_monitor_advertisement_handle_data *handle_data; + struct msft_data *msft = hdev->msft_data; + + hci_dev_lock(hdev); + + monitor = idr_find(&hdev->adv_monitors_idr, msft->pending_add_handle); + if (!monitor) { + bt_dev_err(hdev, "msft add advmon: monitor %d is not found!", + msft->pending_add_handle); + status = HCI_ERROR_UNSPECIFIED; + goto unlock; + } + + if (status) + goto unlock; + + rp = (struct msft_rp_le_monitor_advertisement *)skb->data; + if (skb->len < sizeof(*rp)) { + status = HCI_ERROR_UNSPECIFIED; + goto unlock; + } + + handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); + if (!handle_data) { + status = HCI_ERROR_UNSPECIFIED; + goto unlock; + } + + handle_data->mgmt_handle = monitor->handle; + handle_data->msft_handle = rp->handle; + INIT_LIST_HEAD(&handle_data->list); + list_add(&handle_data->list, &msft->handle_map); + + monitor->state = ADV_MONITOR_STATE_OFFLOADED; + +unlock: + if (status && monitor) { + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + hci_free_adv_monitor(monitor); + } + + hci_dev_unlock(hdev); + + hci_add_adv_patterns_monitor_complete(hdev, status); +} + +static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) +{ + struct adv_rssi_thresholds *r = &monitor->rssi; + + if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || + r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || + r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || + r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) + return false; + + /* High_threshold_timeout is not supported, + * once high_threshold is reached, events are immediately reported. + */ + if (r->high_threshold_timeout != 0) + return false; + + if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) + return false; + + /* Sampling period from 0x00 to 0xFF are all allowed */ + return true; +} + +static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) +{ + return msft_monitor_rssi_valid(monitor); + /* No additional check needed for pattern-based monitor */ +} + +/* This function requires the caller holds hdev->lock */ +int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) +{ + struct msft_cp_le_monitor_advertisement *cp; + struct msft_le_monitor_advertisement_pattern_data *pattern_data; + struct msft_le_monitor_advertisement_pattern *pattern; + struct adv_pattern *entry; + struct hci_request req; + struct msft_data *msft = hdev->msft_data; + size_t total_size = sizeof(*cp) + sizeof(*pattern_data); + ptrdiff_t offset = 0; + u8 pattern_count = 0; + int err = 0; + + if (!msft) + return -EOPNOTSUPP; + + if (!msft_monitor_pattern_valid(monitor)) + return -EINVAL; + + list_for_each_entry(entry, &monitor->patterns, list) { + pattern_count++; + total_size += sizeof(*pattern) + entry->length; + } + + cp = kmalloc(total_size, GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; + cp->rssi_high = monitor->rssi.high_threshold; + cp->rssi_low = monitor->rssi.low_threshold; + cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; + cp->rssi_sampling_period = monitor->rssi.sampling_period; + + cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; + + pattern_data = (void *)cp->data; + pattern_data->count = pattern_count; + + list_for_each_entry(entry, &monitor->patterns, list) { + pattern = (void *)(pattern_data->data + offset); + /* the length also includes data_type and offset */ + pattern->length = entry->length + 2; + pattern->data_type = entry->ad_type; + pattern->start_byte = entry->offset; + memcpy(pattern->pattern, entry->value, entry->length); + offset += sizeof(*pattern) + entry->length; + } + + hci_req_init(&req, hdev); + hci_req_add(&req, hdev->msft_opcode, total_size, cp); + err = hci_req_run_skb(&req, msft_le_monitor_advertisement_cb); + kfree(cp); + + if (!err) + msft->pending_add_handle = monitor->handle; + + return err; } diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index e9c478e890b8..0ac9b15322b1 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -12,16 +12,28 @@ #if IS_ENABLED(CONFIG_BT_MSFTEXT) +bool msft_monitor_supported(struct hci_dev *hdev); void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); __u64 msft_get_features(struct hci_dev *hdev); +int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor); #else +static inline bool msft_monitor_supported(struct hci_dev *hdev) +{ + return false; +} + static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} static inline __u64 msft_get_features(struct hci_dev *hdev) { return 0; } +static inline int msft_add_monitor_pattern(struct hci_dev *hdev, + struct adv_monitor *monitor) +{ + return -EOPNOTSUPP; +} #endif -- cgit v1.2.3 From 66bd095ab5d408af106808cce302406542f70f65 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Fri, 22 Jan 2021 16:36:13 +0800 Subject: Bluetooth: advmon offload MSFT remove monitor Implements the monitor removal functionality for advertising monitor offloading to MSFT controllers. Supply handle = 0 to remove all monitors. Signed-off-by: Archie Pusaka Reviewed-by: Miao-chen Chou Reviewed-by: Yun-Hao Chung Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 8 ++- net/bluetooth/hci_core.c | 119 +++++++++++++++++++++++++++++------- net/bluetooth/mgmt.c | 110 ++++++++++++++++++++++++++------- net/bluetooth/msft.c | 127 +++++++++++++++++++++++++++++++++++++-- net/bluetooth/msft.h | 9 +++ 5 files changed, 323 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 879d1e38ce96..29cfc6a2d689 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1332,11 +1332,13 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); void hci_adv_monitors_clear(struct hci_dev *hdev); -void hci_free_adv_monitor(struct adv_monitor *monitor); +void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor); int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status); +int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status); bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, int *err); -int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle); +bool hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle, int *err); +bool hci_remove_all_adv_monitor(struct hci_dev *hdev, int *err); bool hci_is_adv_monitoring(struct hci_dev *hdev); int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev); @@ -1813,8 +1815,10 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); +void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status); +int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 625298f64a20..b0a63f643a07 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3051,12 +3051,15 @@ void hci_adv_monitors_clear(struct hci_dev *hdev) int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) - hci_free_adv_monitor(monitor); + hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } -void hci_free_adv_monitor(struct adv_monitor *monitor) +/* Frees the monitor structure and do some bookkeepings. + * This function requires the caller holds hdev->lock. + */ +void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; @@ -3064,8 +3067,18 @@ void hci_free_adv_monitor(struct adv_monitor *monitor) if (!monitor) return; - list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) + list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { + list_del(&pattern->list); kfree(pattern); + } + + if (monitor->handle) + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + + if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) { + hdev->adv_monitors_cnt--; + mgmt_adv_monitor_removed(hdev, monitor->handle); + } kfree(monitor); } @@ -3075,6 +3088,11 @@ int hci_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) return mgmt_add_adv_patterns_monitor_complete(hdev, status); } +int hci_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status) +{ + return mgmt_remove_adv_monitor_complete(hdev, status); +} + /* Assigns handle to a monitor, and if offloading is supported and power is on, * also attempts to forward the request to the controller. * Returns true if request is forwarded (result is pending), false otherwise. @@ -3122,39 +3140,94 @@ bool hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, return (*err == 0); } -static int free_adv_monitor(int id, void *ptr, void *data) +/* Attempts to tell the controller and free the monitor. If somehow the + * controller doesn't have a corresponding handle, remove anyway. + * Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. + */ +static bool hci_remove_adv_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, + u16 handle, int *err) { - struct hci_dev *hdev = data; - struct adv_monitor *monitor = ptr; + *err = 0; - idr_remove(&hdev->adv_monitors_idr, monitor->handle); - hci_free_adv_monitor(monitor); - hdev->adv_monitors_cnt--; + switch (hci_get_adv_monitor_offload_ext(hdev)) { + case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ + goto free_monitor; + case HCI_ADV_MONITOR_EXT_MSFT: + *err = msft_remove_monitor(hdev, monitor, handle); + break; + } - return 0; + /* In case no matching handle registered, just free the monitor */ + if (*err == -ENOENT) + goto free_monitor; + + return (*err == 0); + +free_monitor: + if (*err == -ENOENT) + bt_dev_warn(hdev, "Removing monitor with no matching handle %d", + monitor->handle); + hci_free_adv_monitor(hdev, monitor); + + *err = 0; + return false; } -/* This function requires the caller holds hdev->lock */ -int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle) +/* Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. + */ +bool hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle, int *err) +{ + struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); + bool pending; + + if (!monitor) { + *err = -EINVAL; + return false; + } + + pending = hci_remove_adv_monitor(hdev, monitor, handle, err); + if (!*err && !pending) + hci_update_background_scan(hdev); + + bt_dev_dbg(hdev, "%s remove monitor handle %d, status %d, %spending", + hdev->name, handle, *err, pending ? "" : "not "); + + return pending; +} + +/* Returns true if request is forwarded (result is pending), false otherwise. + * This function requires the caller holds hdev->lock. + */ +bool hci_remove_all_adv_monitor(struct hci_dev *hdev, int *err) { struct adv_monitor *monitor; + int idr_next_id = 0; + bool pending = false; + bool update = false; + + *err = 0; - if (handle) { - monitor = idr_find(&hdev->adv_monitors_idr, handle); + while (!*err && !pending) { + monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) - return -ENOENT; + break; - idr_remove(&hdev->adv_monitors_idr, monitor->handle); - hci_free_adv_monitor(monitor); - hdev->adv_monitors_cnt--; - } else { - /* Remove all monitors if handle is 0. */ - idr_for_each(&hdev->adv_monitors_idr, &free_adv_monitor, hdev); + pending = hci_remove_adv_monitor(hdev, monitor, 0, err); + + if (!*err && !pending) + update = true; } - hci_update_background_scan(hdev); + if (update) + hci_update_background_scan(hdev); - return 0; + bt_dev_dbg(hdev, "%s remove all monitors status %d, %spending", + hdev->name, *err, pending ? "" : "not "); + + return pending; } /* This function requires the caller holds hdev->lock */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index fea5e9763b72..8ff9c4bb43d1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4167,14 +4167,24 @@ static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk); } -static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev, - u16 handle) +void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle) { - struct mgmt_ev_adv_monitor_added ev; + struct mgmt_ev_adv_monitor_removed ev; + struct mgmt_pending_cmd *cmd; + struct sock *sk_skip = NULL; + struct mgmt_cp_remove_adv_monitor *cp; + + cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev); + if (cmd) { + cp = cmd->param; + + if (cp->monitor_handle) + sk_skip = cmd->sk; + } ev.monitor_handle = cpu_to_le16(handle); - mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk); + mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk_skip); } static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, @@ -4324,8 +4334,8 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, return 0; unlock: + hci_free_adv_monitor(hdev, m); hci_dev_unlock(hdev); - hci_free_adv_monitor(m); return mgmt_cmd_status(sk, hdev->id, op, status); } @@ -4459,42 +4469,100 @@ done: MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI); } +int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status) +{ + struct mgmt_rp_remove_adv_monitor rp; + struct mgmt_cp_remove_adv_monitor *cp; + struct mgmt_pending_cmd *cmd; + int err = 0; + + hci_dev_lock(hdev); + + cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev); + if (!cmd) + goto done; + + cp = cmd->param; + rp.monitor_handle = cp->monitor_handle; + + if (!status) + hci_update_background_scan(hdev); + + err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), &rp, sizeof(rp)); + mgmt_pending_remove(cmd); + +done: + hci_dev_unlock(hdev); + bt_dev_dbg(hdev, "remove monitor %d complete, status %d", + rp.monitor_handle, status); + + return err; +} + static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_remove_adv_monitor *cp = data; struct mgmt_rp_remove_adv_monitor rp; - unsigned int prev_adv_monitors_cnt; - u16 handle; - int err; + struct mgmt_pending_cmd *cmd; + u16 handle = __le16_to_cpu(cp->monitor_handle); + int err, status; + bool pending; BT_DBG("request for %s", hdev->name); + rp.monitor_handle = cp->monitor_handle; hci_dev_lock(hdev); - handle = __le16_to_cpu(cp->monitor_handle); - prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + if (pending_find(MGMT_OP_SET_LE, hdev) || + pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) { + status = MGMT_STATUS_BUSY; + goto unlock; + } - err = hci_remove_adv_monitor(hdev, handle); - if (err == -ENOENT) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, - MGMT_STATUS_INVALID_INDEX); + cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len); + if (!cmd) { + status = MGMT_STATUS_NO_RESOURCES; goto unlock; } - if (hdev->adv_monitors_cnt < prev_adv_monitors_cnt) - mgmt_adv_monitor_removed(sk, hdev, handle); + if (handle) + pending = hci_remove_single_adv_monitor(hdev, handle, &err); + else + pending = hci_remove_all_adv_monitor(hdev, &err); - hci_dev_unlock(hdev); + if (err) { + mgmt_pending_remove(cmd); - rp.monitor_handle = cp->monitor_handle; + if (err == -ENOENT) + status = MGMT_STATUS_INVALID_INDEX; + else + status = MGMT_STATUS_FAILED; + + goto unlock; + } + + /* monitor can be removed without forwarding request to controller */ + if (!pending) { + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_STATUS_SUCCESS, + &rp, sizeof(rp)); + } - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + hci_dev_unlock(hdev); + return 0; unlock: hci_dev_unlock(hdev); - return err; + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, + status); } static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index e4b8fe71b9c3..f5aa0e3b1b9b 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -58,6 +58,17 @@ struct msft_rp_le_monitor_advertisement { __u8 handle; } __packed; +#define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 +struct msft_cp_le_cancel_monitor_advertisement { + __u8 sub_opcode; + __u8 handle; +} __packed; + +struct msft_rp_le_cancel_monitor_advertisement { + __u8 status; + __u8 sub_opcode; +} __packed; + struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; @@ -70,6 +81,7 @@ struct msft_data { __u8 *evt_prefix; struct list_head handle_map; __u16 pending_add_handle; + __u16 pending_remove_handle; }; bool msft_monitor_supported(struct hci_dev *hdev) @@ -205,6 +217,26 @@ __u64 msft_get_features(struct hci_dev *hdev) return msft ? msft->features : 0; } +/* is_mgmt = true matches the handle exposed to userspace via mgmt. + * is_mgmt = false matches the handle used by the msft controller. + * This function requires the caller holds hdev->lock + */ +static struct msft_monitor_advertisement_handle_data *msft_find_handle_data + (struct hci_dev *hdev, u16 handle, bool is_mgmt) +{ + struct msft_monitor_advertisement_handle_data *entry; + struct msft_data *msft = hdev->msft_data; + + list_for_each_entry(entry, &msft->handle_map, list) { + if (is_mgmt && entry->mgmt_handle == handle) + return entry; + if (!is_mgmt && entry->msft_handle == handle) + return entry; + } + + return NULL; +} + static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) @@ -247,16 +279,71 @@ static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, monitor->state = ADV_MONITOR_STATE_OFFLOADED; unlock: - if (status && monitor) { - idr_remove(&hdev->adv_monitors_idr, monitor->handle); - hci_free_adv_monitor(monitor); - } + if (status && monitor) + hci_free_adv_monitor(hdev, monitor); hci_dev_unlock(hdev); hci_add_adv_patterns_monitor_complete(hdev, status); } +static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, + u8 status, u16 opcode, + struct sk_buff *skb) +{ + struct msft_cp_le_cancel_monitor_advertisement *cp; + struct msft_rp_le_cancel_monitor_advertisement *rp; + struct adv_monitor *monitor; + struct msft_monitor_advertisement_handle_data *handle_data; + struct msft_data *msft = hdev->msft_data; + int err; + bool pending; + + if (status) + goto done; + + rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; + if (skb->len < sizeof(*rp)) { + status = HCI_ERROR_UNSPECIFIED; + goto done; + } + + hci_dev_lock(hdev); + + cp = hci_sent_cmd_data(hdev, hdev->msft_opcode); + handle_data = msft_find_handle_data(hdev, cp->handle, false); + + if (handle_data) { + monitor = idr_find(&hdev->adv_monitors_idr, + handle_data->mgmt_handle); + if (monitor) + hci_free_adv_monitor(hdev, monitor); + + list_del(&handle_data->list); + kfree(handle_data); + } + + /* If remove all monitors is required, we need to continue the process + * here because the earlier it was paused when waiting for the + * response from controller. + */ + if (msft->pending_remove_handle == 0) { + pending = hci_remove_all_adv_monitor(hdev, &err); + if (pending) { + hci_dev_unlock(hdev); + return; + } + + if (err) + status = HCI_ERROR_UNSPECIFIED; + } + + hci_dev_unlock(hdev); + +done: + hci_remove_adv_monitor_complete(hdev, status); +} + static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) { struct adv_rssi_thresholds *r = &monitor->rssi; @@ -346,3 +433,35 @@ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) return err; } + +/* This function requires the caller holds hdev->lock */ +int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + u16 handle) +{ + struct msft_cp_le_cancel_monitor_advertisement cp; + struct msft_monitor_advertisement_handle_data *handle_data; + struct hci_request req; + struct msft_data *msft = hdev->msft_data; + int err = 0; + + if (!msft) + return -EOPNOTSUPP; + + handle_data = msft_find_handle_data(hdev, monitor->handle, true); + + /* If no matched handle, just remove without telling controller */ + if (!handle_data) + return -ENOENT; + + cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; + cp.handle = handle_data->msft_handle; + + hci_req_init(&req, hdev); + hci_req_add(&req, hdev->msft_opcode, sizeof(cp), &cp); + err = hci_req_run_skb(&req, msft_le_cancel_monitor_advertisement_cb); + + if (!err) + msft->pending_remove_handle = handle; + + return err; +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 0ac9b15322b1..6f126a1f1688 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -18,6 +18,8 @@ void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); __u64 msft_get_features(struct hci_dev *hdev); int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor); +int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + u16 handle); #else @@ -36,4 +38,11 @@ static inline int msft_add_monitor_pattern(struct hci_dev *hdev, return -EOPNOTSUPP; } +static inline int msft_remove_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, + u16 handle) +{ + return -EOPNOTSUPP; +} + #endif -- cgit v1.2.3 From bf6a4e30ffbd9e9ef8934582feb937f6532f8b68 Mon Sep 17 00:00:00 2001 From: Howard Chung Date: Fri, 22 Jan 2021 16:36:17 +0800 Subject: Bluetooth: disable advertisement filters during suspend This adds logic to disable and reenable advertisement filters during suspend and resume. After this patch, we would only receive packets from devices in allow list during suspend. Signed-off-by: Howard Chung Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_request.c | 29 +++++++++++++++++++++++++++++ net/bluetooth/msft.c | 17 ++++++++++++----- net/bluetooth/msft.h | 3 +++ 4 files changed, 46 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 29cfc6a2d689..fd1d10fe2f11 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -105,6 +105,8 @@ enum suspend_tasks { SUSPEND_POWERING_DOWN, SUSPEND_PREPARE_NOTIFIER, + + SUSPEND_SET_ADV_FILTER, __SUSPEND_NUM_TASKS }; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index d29a44d77b4e..e55976db4403 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -29,6 +29,7 @@ #include "smp.h" #include "hci_request.h" +#include "msft.h" #define HCI_REQ_DONE 0 #define HCI_REQ_PEND 1 @@ -1242,6 +1243,29 @@ static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode) clear_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); wake_up(&hdev->suspend_wait_q); } + + if (test_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks)) { + clear_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks); + wake_up(&hdev->suspend_wait_q); + } +} + +static void hci_req_add_set_adv_filter_enable(struct hci_request *req, + bool enable) +{ + struct hci_dev *hdev = req->hdev; + + switch (hci_get_adv_monitor_offload_ext(hdev)) { + case HCI_ADV_MONITOR_EXT_MSFT: + msft_req_add_set_filter_enable(req, enable); + break; + default: + return; + } + + /* No need to block when enabling since it's on resume path */ + if (hdev->suspended && !enable) + set_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks); } /* Call with hci_dev_lock */ @@ -1301,6 +1325,9 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hci_req_add_le_scan_disable(&req, false); } + /* Disable advertisement filters */ + hci_req_add_set_adv_filter_enable(&req, false); + /* Mark task needing completion */ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); @@ -1340,6 +1367,8 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hci_req_clear_event_filter(&req); /* Reset passive/background scanning to normal */ __hci_update_background_scan(&req); + /* Enable all of the advertisement filters */ + hci_req_add_set_adv_filter_enable(&req, true); /* Unpause directed advertising */ hdev->advertising_paused = false; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index b2ef654b1d3d..47b104f318e9 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -579,9 +579,19 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, return err; } -int msft_set_filter_enable(struct hci_dev *hdev, bool enable) +void msft_req_add_set_filter_enable(struct hci_request *req, bool enable) { + struct hci_dev *hdev = req->hdev; struct msft_cp_le_set_advertisement_filter_enable cp; + + cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; + cp.enable = enable; + + hci_req_add(req, hdev->msft_opcode, sizeof(cp), &cp); +} + +int msft_set_filter_enable(struct hci_dev *hdev, bool enable) +{ struct hci_request req; struct msft_data *msft = hdev->msft_data; int err; @@ -589,11 +599,8 @@ int msft_set_filter_enable(struct hci_dev *hdev, bool enable) if (!msft) return -EOPNOTSUPP; - cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; - cp.enable = enable; - hci_req_init(&req, hdev); - hci_req_add(&req, hdev->msft_opcode, sizeof(cp), &cp); + msft_req_add_set_filter_enable(&req, enable); err = hci_req_run_skb(&req, msft_le_set_advertisement_filter_enable_cb); return err; diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index f8e4d3a6d641..88ed613dfa08 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -20,6 +20,7 @@ __u64 msft_get_features(struct hci_dev *hdev); int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor); int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, u16 handle); +void msft_req_add_set_filter_enable(struct hci_request *req, bool enable); int msft_set_filter_enable(struct hci_dev *hdev, bool enable); #else @@ -46,6 +47,8 @@ static inline int msft_remove_monitor(struct hci_dev *hdev, return -EOPNOTSUPP; } +static inline void msft_req_add_set_filter_enable(struct hci_request *req, + bool enable) {} static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable) { return -EOPNOTSUPP; -- cgit v1.2.3 From 4d7ea8ee90e42fc75995f6fb24032d3233314528 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Jan 2021 15:28:58 -0800 Subject: Bluetooth: L2CAP: Fix handling fragmented length Bluetooth Core Specification v5.2, Vol. 3, Part A, section 1.4, table 1.1: 'Start Fragments always either begin with the first octet of the Basic L2CAP header of a PDU or they have a length of zero (see [Vol 2] Part B, Section 6.6.2).' Apparently this was changed by the following errata: https://www.bluetooth.org/tse/errata_view.cfm?errata_id=10216 Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_core.c | 118 +++++++++++++++++++++++++++++++++--------- 2 files changed, 94 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 1d1232917de7..61800a7b6192 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -207,6 +207,7 @@ struct l2cap_hdr { __le16 len; __le16 cid; } __packed; +#define L2CAP_LEN_SIZE 2 #define L2CAP_HDR_SIZE 4 #define L2CAP_ENH_HDR_SIZE 6 #define L2CAP_EXT_HDR_SIZE 8 diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 17b87b57a175..a24183734bd9 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -8276,10 +8276,73 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) mutex_unlock(&conn->chan_lock); } +/* Append fragment into frame respecting the maximum len of rx_skb */ +static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb, + u16 len) +{ + if (!conn->rx_skb) { + /* Allocate skb for the complete frame (with header) */ + conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); + if (!conn->rx_skb) + return -ENOMEM; + /* Init rx_len */ + conn->rx_len = len; + } + + /* Copy as much as the rx_skb can hold */ + len = min_t(u16, len, skb->len); + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, len), len); + skb_pull(skb, len); + conn->rx_len -= len; + + return len; +} + +static int l2cap_recv_len(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct sk_buff *rx_skb; + int len; + + /* Append just enough to complete the header */ + len = l2cap_recv_frag(conn, skb, L2CAP_LEN_SIZE - conn->rx_skb->len); + + /* If header could not be read just continue */ + if (len < 0 || conn->rx_skb->len < L2CAP_LEN_SIZE) + return len; + + rx_skb = conn->rx_skb; + len = get_unaligned_le16(rx_skb->data); + + /* Check if rx_skb has enough space to received all fragments */ + if (len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE) <= skb_tailroom(rx_skb)) { + /* Update expected len */ + conn->rx_len = len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE); + return L2CAP_LEN_SIZE; + } + + /* Reset conn->rx_skb since it will need to be reallocated in order to + * fit all fragments. + */ + conn->rx_skb = NULL; + + /* Reallocates rx_skb using the exact expected length */ + len = l2cap_recv_frag(conn, rx_skb, + len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE)); + kfree_skb(rx_skb); + + return len; +} + +static void l2cap_recv_reset(struct l2cap_conn *conn) +{ + kfree_skb(conn->rx_skb); + conn->rx_skb = NULL; + conn->rx_len = 0; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct l2cap_conn *conn = hcon->l2cap_data; - struct l2cap_hdr *hdr; int len; /* For AMP controller do not create l2cap conn */ @@ -8298,23 +8361,23 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) case ACL_START: case ACL_START_NO_FLUSH: case ACL_COMPLETE: - if (conn->rx_len) { + if (conn->rx_skb) { BT_ERR("Unexpected start frame (len %d)", skb->len); - kfree_skb(conn->rx_skb); - conn->rx_skb = NULL; - conn->rx_len = 0; + l2cap_recv_reset(conn); l2cap_conn_unreliable(conn, ECOMM); } - /* Start fragment always begin with Basic L2CAP header */ - if (skb->len < L2CAP_HDR_SIZE) { - BT_ERR("Frame is too short (len %d)", skb->len); - l2cap_conn_unreliable(conn, ECOMM); - goto drop; + /* Start fragment may not contain the L2CAP length so just + * copy the initial byte when that happens and use conn->mtu as + * expected length. + */ + if (skb->len < L2CAP_LEN_SIZE) { + if (l2cap_recv_frag(conn, skb, conn->mtu) < 0) + goto drop; + return; } - hdr = (struct l2cap_hdr *) skb->data; - len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE; + len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE; if (len == skb->len) { /* Complete frame received */ @@ -8331,38 +8394,43 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; } - /* Allocate skb for the complete frame (with header) */ - conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); - if (!conn->rx_skb) + /* Append fragment into frame (with header) */ + if (l2cap_recv_frag(conn, skb, len) < 0) goto drop; - skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); - conn->rx_len = len - skb->len; break; case ACL_CONT: BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); - if (!conn->rx_len) { + if (!conn->rx_skb) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); l2cap_conn_unreliable(conn, ECOMM); goto drop; } + /* Complete the L2CAP length if it has not been read */ + if (conn->rx_skb->len < L2CAP_LEN_SIZE) { + if (l2cap_recv_len(conn, skb) < 0) { + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + /* Header still could not be read just continue */ + if (conn->rx_skb->len < L2CAP_LEN_SIZE) + return; + } + if (skb->len > conn->rx_len) { BT_ERR("Fragment is too long (len %d, expected %d)", skb->len, conn->rx_len); - kfree_skb(conn->rx_skb); - conn->rx_skb = NULL; - conn->rx_len = 0; + l2cap_recv_reset(conn); l2cap_conn_unreliable(conn, ECOMM); goto drop; } - skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); - conn->rx_len -= skb->len; + /* Append fragment into frame (with header) */ + l2cap_recv_frag(conn, skb, skb->len); if (!conn->rx_len) { /* Complete frame received. l2cap_recv_frame -- cgit v1.2.3 From 83ace77f51175023c3757e2d08a92565f9b1c7f3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Jan 2021 16:30:03 +0100 Subject: netfilter: ctnetlink: remove get_ct indirection Use nf_ct_get() directly, its a small inline helper without dependencies. Add CONFIG_NF_CONNTRACK guards to elide the relevant part when conntrack isn't available at all. v2: add ifdef guard around nf_ct_get call (kernel test robot) Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 -- net/netfilter/nf_conntrack_netlink.c | 7 ------- net/netfilter/nfnetlink_log.c | 8 +++++++- net/netfilter/nfnetlink_queue.c | 10 ++++++++-- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 0101747de549..f0f3a8354c3c 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -463,8 +463,6 @@ extern struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { - struct nf_conn *(*get_ct)(const struct sk_buff *skb, - enum ip_conntrack_info *ctinfo); size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 84caf3316946..1469365bac7e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2686,12 +2686,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) ; } -static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb, - enum ip_conntrack_info *ctinfo) -{ - return nf_ct_get(skb, ctinfo); -} - static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; @@ -2925,7 +2919,6 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, } static struct nfnl_ct_hook ctnetlink_glue_hook = { - .get_ct = ctnetlink_glue_get_ct, .build_size = ctnetlink_glue_build_size, .build = ctnetlink_glue_build, .parse = ctnetlink_glue_parse, diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b35e8d9a5b37..26776b88a539 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -43,6 +43,10 @@ #include "../bridge/br_private.h" #endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + #define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ @@ -733,14 +737,16 @@ nfulnl_log_packet(struct net *net, size += nla_total_size(sizeof(u_int32_t)); if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) size += nla_total_size(sizeof(u_int32_t)); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (inst->flags & NFULNL_CFG_F_CONNTRACK) { nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { - ct = nfnl_ct->get_ct(skb, &ctinfo); + ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } +#endif if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) size += nfulnl_get_bridge_size(skb); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d1d8bca03b4f..48a07914fd94 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -444,13 +444,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfnl_ct = rcu_dereference(nfnl_ct_hook); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { - ct = nfnl_ct->get_ct(entskb, &ctinfo); + ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } +#endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ @@ -1104,9 +1106,10 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; - ct = nfnl_ct->get_ct(entry->skb, ctinfo); + ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; @@ -1118,6 +1121,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; +#else + return NULL; +#endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, -- cgit v1.2.3 From dbc859d96f1a90bafe9c3ba2e437aae5d5677318 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Jan 2021 18:56:01 +0100 Subject: netfilter: flowtable: add hash offset field to tuple Add a placeholder field to calculate hash tuple offset. Similar to 2c407aca6497 ("netfilter: conntrack: avoid gcc-10 zero-length-bounds warning"). Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 4 ++++ net/netfilter/nf_flow_table_core.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 16e8b2f8d006..54c4d5c908a5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -107,6 +107,10 @@ struct flow_offload_tuple { u8 l3proto; u8 l4proto; + + /* All members above are keys for lookups, see flow_offload_hash(). */ + struct { } __hash; + u8 dir; u16 mtu; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 513f78db3cb2..55fca71ace26 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -191,14 +191,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); + return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, @@ -207,7 +207,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; -- cgit v1.2.3 From a05829a7222e9d10c416dd2dbbf3929fe6646b89 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 Jan 2021 16:19:43 +0100 Subject: cfg80211: avoid holding the RTNL when calling the driver Currently, _everything_ in cfg80211 holds the RTNL, and if you have a slow USB device (or a few) you can get some bad lock contention on that. Fix that by re-adding a mutex to each wiphy/rdev as we had at some point, so we have locking for the wireless_dev lists and all the other things in there, and also so that drivers still don't have to worry too much about it (they still won't get parallel calls for a single device). Then, we can restrict the RTNL to a few cases where we add or remove interfaces and really need the added protection. Some of the global list management still also uses the RTNL, since we need to have it anyway for netdev management, but we only hold the RTNL for very short periods of time here. Link: https://lore.kernel.org/r/20210122161942.81df9f5e047a.I4a8e1a60b18863ea8c5e6d3a0faeafb2d45b2f40@changeid Tested-by: Marek Szyprowski [marvell driver issues] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/reg.c | 4 +- drivers/net/wireless/ath/ath6kl/core.c | 2 + drivers/net/wireless/ath/ath6kl/init.c | 2 + drivers/net/wireless/ath/wil6210/cfg80211.c | 2 + drivers/net/wireless/ath/wil6210/netdev.c | 7 +- drivers/net/wireless/ath/wil6210/pcie_bus.c | 2 + .../wireless/broadcom/brcm80211/brcmfmac/core.c | 18 +- .../wireless/broadcom/brcm80211/brcmfmac/core.h | 6 +- .../net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 12 +- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 4 +- drivers/net/wireless/intel/iwlwifi/mvm/nvm.c | 2 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 6 +- drivers/net/wireless/marvell/mwifiex/main.c | 7 + drivers/net/wireless/quantenna/qtnfmac/core.c | 3 +- drivers/net/wireless/virt_wifi.c | 8 + include/net/cfg80211.h | 104 +++- include/net/mac80211.h | 10 +- net/mac80211/iface.c | 14 +- net/mac80211/key.c | 4 +- net/mac80211/main.c | 5 + net/mac80211/pm.c | 6 +- net/mac80211/tdls.c | 6 +- net/mac80211/util.c | 14 +- net/wireless/chan.c | 5 +- net/wireless/core.c | 46 +- net/wireless/core.h | 2 +- net/wireless/debugfs.c | 4 - net/wireless/ibss.c | 3 +- net/wireless/mlme.c | 6 +- net/wireless/nl80211.c | 657 +++++++++++---------- net/wireless/reg.c | 91 ++- net/wireless/reg.h | 1 - net/wireless/scan.c | 35 +- net/wireless/sme.c | 5 +- net/wireless/sysfs.c | 5 + net/wireless/util.c | 4 +- net/wireless/wext-compat.c | 271 ++++++--- net/wireless/wext-sme.c | 4 +- 39 files changed, 880 insertions(+), 509 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/reg.c b/drivers/net/wireless/ath/ath11k/reg.c index b876fec7fa1b..e1a1df169034 100644 --- a/drivers/net/wireless/ath/ath11k/reg.c +++ b/drivers/net/wireless/ath/ath11k/reg.c @@ -247,7 +247,9 @@ int ath11k_regd_update(struct ath11k *ar, bool init) } rtnl_lock(); - ret = regulatory_set_wiphy_regd_sync_rtnl(ar->hw->wiphy, regd_copy); + wiphy_lock(ar->hw->wiphy); + ret = regulatory_set_wiphy_regd_sync(ar->hw->wiphy, regd_copy); + wiphy_unlock(ar->hw->wiphy); rtnl_unlock(); kfree(regd_copy); diff --git a/drivers/net/wireless/ath/ath6kl/core.c b/drivers/net/wireless/ath/ath6kl/core.c index ebb9f163710f..4f0a7a185fc9 100644 --- a/drivers/net/wireless/ath/ath6kl/core.c +++ b/drivers/net/wireless/ath/ath6kl/core.c @@ -212,11 +212,13 @@ int ath6kl_core_init(struct ath6kl *ar, enum ath6kl_htc_type htc_type) ar->avail_idx_map |= BIT(i); rtnl_lock(); + wiphy_lock(ar->wiphy); /* Add an initial station interface */ wdev = ath6kl_interface_add(ar, "wlan%d", NET_NAME_ENUM, NL80211_IFTYPE_STATION, 0, INFRA_NETWORK); + wiphy_unlock(ar->wiphy); rtnl_unlock(); if (!wdev) { diff --git a/drivers/net/wireless/ath/ath6kl/init.c b/drivers/net/wireless/ath/ath6kl/init.c index 39bf19686175..9b5c7d8f2b95 100644 --- a/drivers/net/wireless/ath/ath6kl/init.c +++ b/drivers/net/wireless/ath/ath6kl/init.c @@ -1904,7 +1904,9 @@ void ath6kl_stop_txrx(struct ath6kl *ar) spin_unlock_bh(&ar->list_lock); ath6kl_cfg80211_vif_stop(vif, test_bit(WMI_READY, &ar->flag)); rtnl_lock(); + wiphy_lock(ar->wiphy); ath6kl_cfg80211_vif_cleanup(vif); + wiphy_unlock(ar->wiphy); rtnl_unlock(); spin_lock_bh(&ar->list_lock); } diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 1c42410d68e1..60bba5b491e0 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -2820,7 +2820,9 @@ void wil_p2p_wdev_free(struct wil6210_priv *wil) wil->radio_wdev = wil->main_ndev->ieee80211_ptr; mutex_unlock(&wil->vif_mutex); if (p2p_wdev) { + wiphy_lock(wil->wiphy); cfg80211_unregister_wdev(p2p_wdev); + wiphy_unlock(wil->wiphy); kfree(p2p_wdev); } } diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c index 472fe804203d..0913f0bf60e7 100644 --- a/drivers/net/wireless/ath/wil6210/netdev.c +++ b/drivers/net/wireless/ath/wil6210/netdev.c @@ -473,7 +473,9 @@ int wil_if_add(struct wil6210_priv *wil) wil_update_net_queues_bh(wil, vif, NULL, true); rtnl_lock(); + wiphy_lock(wiphy); rc = wil_vif_add(wil, vif); + wiphy_unlock(wiphy); rtnl_unlock(); if (rc < 0) goto out_wiphy; @@ -543,15 +545,18 @@ void wil_if_remove(struct wil6210_priv *wil) { struct net_device *ndev = wil->main_ndev; struct wireless_dev *wdev = ndev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; wil_dbg_misc(wil, "if_remove\n"); rtnl_lock(); + wiphy_lock(wiphy); wil_vif_remove(wil, 0); + wiphy_unlock(wiphy); rtnl_unlock(); netif_napi_del(&wil->napi_tx); netif_napi_del(&wil->napi_rx); - wiphy_unregister(wdev->wiphy); + wiphy_unregister(wiphy); } diff --git a/drivers/net/wireless/ath/wil6210/pcie_bus.c b/drivers/net/wireless/ath/wil6210/pcie_bus.c index c174323c5c0b..ce40d94909ad 100644 --- a/drivers/net/wireless/ath/wil6210/pcie_bus.c +++ b/drivers/net/wireless/ath/wil6210/pcie_bus.c @@ -473,8 +473,10 @@ static void wil_pcie_remove(struct pci_dev *pdev) wil6210_debugfs_remove(wil); rtnl_lock(); + wiphy_lock(wil->wiphy); wil_p2p_wdev_free(wil); wil_remove_all_additional_vifs(wil); + wiphy_unlock(wil->wiphy); rtnl_unlock(); wil_if_remove(wil); wil_if_pcie_disable(wil); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c index 6cf308d5934c..ea78fe527c5d 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c @@ -633,7 +633,7 @@ static const struct net_device_ops brcmf_netdev_ops_pri = { .ndo_set_rx_mode = brcmf_netdev_set_multicast_list }; -int brcmf_net_attach(struct brcmf_if *ifp, bool rtnl_locked) +int brcmf_net_attach(struct brcmf_if *ifp, bool locked) { struct brcmf_pub *drvr = ifp->drvr; struct net_device *ndev; @@ -656,7 +656,7 @@ int brcmf_net_attach(struct brcmf_if *ifp, bool rtnl_locked) INIT_WORK(&ifp->multicast_work, _brcmf_set_multicast_list); INIT_WORK(&ifp->ndoffload_work, _brcmf_update_ndtable); - if (rtnl_locked) + if (locked) err = cfg80211_register_netdevice(ndev); else err = register_netdev(ndev); @@ -677,10 +677,10 @@ fail: return -EBADE; } -void brcmf_net_detach(struct net_device *ndev, bool rtnl_locked) +void brcmf_net_detach(struct net_device *ndev, bool locked) { if (ndev->reg_state == NETREG_REGISTERED) { - if (rtnl_locked) + if (locked) cfg80211_unregister_netdevice(ndev); else unregister_netdev(ndev); @@ -909,7 +909,7 @@ struct brcmf_if *brcmf_add_if(struct brcmf_pub *drvr, s32 bsscfgidx, s32 ifidx, } static void brcmf_del_if(struct brcmf_pub *drvr, s32 bsscfgidx, - bool rtnl_locked) + bool locked) { struct brcmf_if *ifp; int ifidx; @@ -938,7 +938,7 @@ static void brcmf_del_if(struct brcmf_pub *drvr, s32 bsscfgidx, cancel_work_sync(&ifp->multicast_work); cancel_work_sync(&ifp->ndoffload_work); } - brcmf_net_detach(ifp->ndev, rtnl_locked); + brcmf_net_detach(ifp->ndev, locked); } else { /* Only p2p device interfaces which get dynamically created * end up here. In this case the p2p module should be informed @@ -947,7 +947,7 @@ static void brcmf_del_if(struct brcmf_pub *drvr, s32 bsscfgidx, * serious troublesome side effects. The p2p module will clean * up the ifp if needed. */ - brcmf_p2p_ifp_removed(ifp, rtnl_locked); + brcmf_p2p_ifp_removed(ifp, locked); kfree(ifp); } @@ -956,14 +956,14 @@ static void brcmf_del_if(struct brcmf_pub *drvr, s32 bsscfgidx, drvr->if2bss[ifidx] = BRCMF_BSSIDX_INVALID; } -void brcmf_remove_interface(struct brcmf_if *ifp, bool rtnl_locked) +void brcmf_remove_interface(struct brcmf_if *ifp, bool locked) { if (!ifp || WARN_ON(ifp->drvr->iflist[ifp->bsscfgidx] != ifp)) return; brcmf_dbg(TRACE, "Enter, bsscfgidx=%d, ifidx=%d\n", ifp->bsscfgidx, ifp->ifidx); brcmf_proto_del_if(ifp->drvr, ifp); - brcmf_del_if(ifp->drvr, ifp->bsscfgidx, rtnl_locked); + brcmf_del_if(ifp->drvr, ifp->bsscfgidx, locked); } static int brcmf_psm_watchdog_notify(struct brcmf_if *ifp, diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.h index 5767d665cee5..8212c9de14f1 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.h @@ -201,16 +201,16 @@ int brcmf_netdev_wait_pend8021x(struct brcmf_if *ifp); char *brcmf_ifname(struct brcmf_if *ifp); struct brcmf_if *brcmf_get_ifp(struct brcmf_pub *drvr, int ifidx); void brcmf_configure_arp_nd_offload(struct brcmf_if *ifp, bool enable); -int brcmf_net_attach(struct brcmf_if *ifp, bool rtnl_locked); +int brcmf_net_attach(struct brcmf_if *ifp, bool locked); struct brcmf_if *brcmf_add_if(struct brcmf_pub *drvr, s32 bsscfgidx, s32 ifidx, bool is_p2pdev, const char *name, u8 *mac_addr); -void brcmf_remove_interface(struct brcmf_if *ifp, bool rtnl_locked); +void brcmf_remove_interface(struct brcmf_if *ifp, bool locked); void brcmf_txflowblock_if(struct brcmf_if *ifp, enum brcmf_netif_stop_reason reason, bool state); void brcmf_txfinalize(struct brcmf_if *ifp, struct sk_buff *txp, bool success); void brcmf_netif_rx(struct brcmf_if *ifp, struct sk_buff *skb, bool inirq); void brcmf_netif_mon_rx(struct brcmf_if *ifp, struct sk_buff *skb); -void brcmf_net_detach(struct net_device *ndev, bool rtnl_locked); +void brcmf_net_detach(struct net_device *ndev, bool locked); int brcmf_net_mon_attach(struct brcmf_if *ifp); void brcmf_net_setcarrier(struct brcmf_if *ifp, bool on); int __init brcmf_core_init(void); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c index ec6fc7a150a6..6d30a0fcecea 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c @@ -2430,7 +2430,7 @@ int brcmf_p2p_del_vif(struct wiphy *wiphy, struct wireless_dev *wdev) return err; } -void brcmf_p2p_ifp_removed(struct brcmf_if *ifp, bool rtnl_locked) +void brcmf_p2p_ifp_removed(struct brcmf_if *ifp, bool locked) { struct brcmf_cfg80211_info *cfg; struct brcmf_cfg80211_vif *vif; @@ -2439,11 +2439,15 @@ void brcmf_p2p_ifp_removed(struct brcmf_if *ifp, bool rtnl_locked) vif = ifp->vif; cfg = wdev_to_cfg(&vif->wdev); cfg->p2p.bss_idx[P2PAPI_BSSCFG_DEVICE].vif = NULL; - if (!rtnl_locked) + if (locked) { rtnl_lock(); - cfg80211_unregister_wdev(&vif->wdev); - if (!rtnl_locked) + wiphy_lock(cfg->wiphy); + cfg80211_unregister_wdev(&vif->wdev); + wiphy_unlock(cfg->wiphy); rtnl_unlock(); + } else { + cfg80211_unregister_wdev(&vif->wdev); + } brcmf_free_vif(vif); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index c025188fa9bc..a0b7331cab31 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2143,7 +2143,7 @@ err: out_iterate: if (!test) - ieee80211_iterate_active_interfaces_rtnl(mvm->hw, + ieee80211_iterate_active_interfaces_mtx(mvm->hw, IEEE80211_IFACE_ITER_NORMAL, iwl_mvm_d3_disconnect_iter, keep ? vif : NULL); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index da32937ba9a7..6cce72b0685b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -260,7 +260,7 @@ int iwl_mvm_init_fw_regd(struct iwl_mvm *mvm) int ret; bool changed; const struct ieee80211_regdomain *r = - rtnl_dereference(mvm->hw->wiphy->regd); + wiphy_dereference(mvm->hw->wiphy, mvm->hw->wiphy->regd); if (!r) return -ENOENT; @@ -282,7 +282,7 @@ int iwl_mvm_init_fw_regd(struct iwl_mvm *mvm) /* update cfg80211 if the regdomain was changed */ if (changed) - ret = regulatory_set_wiphy_regd_sync_rtnl(mvm->hw->wiphy, regd); + ret = regulatory_set_wiphy_regd_sync(mvm->hw->wiphy, regd); else ret = 0; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c index abb8c1088c2f..7fb4e618f76e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c @@ -545,7 +545,7 @@ int iwl_mvm_init_mcc(struct iwl_mvm *mvm) return -EIO; } - retval = regulatory_set_wiphy_regd_sync_rtnl(mvm->hw->wiphy, regd); + retval = regulatory_set_wiphy_regd_sync(mvm->hw->wiphy, regd); kfree(regd); return retval; } diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 15e1cee7f465..5553df913290 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -2097,7 +2097,7 @@ mwifiex_cfg80211_disconnect(struct wiphy *wiphy, struct net_device *dev, struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); if (!mwifiex_stop_bg_scan(priv)) - cfg80211_sched_scan_stopped_rtnl(priv->wdev.wiphy, 0); + cfg80211_sched_scan_stopped_locked(priv->wdev.wiphy, 0); if (mwifiex_deauthenticate(priv, NULL)) return -EFAULT; @@ -2366,7 +2366,7 @@ mwifiex_cfg80211_connect(struct wiphy *wiphy, struct net_device *dev, (int)sme->ssid_len, (char *)sme->ssid, sme->bssid); if (!mwifiex_stop_bg_scan(priv)) - cfg80211_sched_scan_stopped_rtnl(priv->wdev.wiphy, 0); + cfg80211_sched_scan_stopped_locked(priv->wdev.wiphy, 0); ret = mwifiex_cfg80211_assoc(priv, sme->ssid_len, sme->ssid, sme->bssid, priv->bss_mode, sme->channel, sme, 0); @@ -2576,7 +2576,7 @@ mwifiex_cfg80211_scan(struct wiphy *wiphy, priv->scan_block = false; if (!mwifiex_stop_bg_scan(priv)) - cfg80211_sched_scan_stopped_rtnl(priv->wdev.wiphy, 0); + cfg80211_sched_scan_stopped_locked(priv->wdev.wiphy, 0); user_scan_cfg = kzalloc(sizeof(*user_scan_cfg), GFP_KERNEL); if (!user_scan_cfg) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index ee52fb839ef7..529dfd8b7ae8 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -598,12 +598,14 @@ static int _mwifiex_fw_dpc(const struct firmware *firmware, void *context) } rtnl_lock(); + wiphy_lock(adapter->wiphy); /* Create station interface by default */ wdev = mwifiex_add_virtual_intf(adapter->wiphy, "mlan%d", NET_NAME_ENUM, NL80211_IFTYPE_STATION, NULL); if (IS_ERR(wdev)) { mwifiex_dbg(adapter, ERROR, "cannot create default STA interface\n"); + wiphy_unlock(adapter->wiphy); rtnl_unlock(); goto err_add_intf; } @@ -614,6 +616,7 @@ static int _mwifiex_fw_dpc(const struct firmware *firmware, void *context) if (IS_ERR(wdev)) { mwifiex_dbg(adapter, ERROR, "cannot create AP interface\n"); + wiphy_unlock(adapter->wiphy); rtnl_unlock(); goto err_add_intf; } @@ -625,10 +628,12 @@ static int _mwifiex_fw_dpc(const struct firmware *firmware, void *context) if (IS_ERR(wdev)) { mwifiex_dbg(adapter, ERROR, "cannot create p2p client interface\n"); + wiphy_unlock(adapter->wiphy); rtnl_unlock(); goto err_add_intf; } } + wiphy_unlock(adapter->wiphy); rtnl_unlock(); mwifiex_drv_get_driver_version(adapter, fmt, sizeof(fmt) - 1); @@ -1440,9 +1445,11 @@ static void mwifiex_uninit_sw(struct mwifiex_adapter *adapter) if (!priv) continue; rtnl_lock(); + wiphy_lock(adapter->wiphy); if (priv->netdev && priv->wdev.iftype != NL80211_IFTYPE_UNSPECIFIED) mwifiex_del_virtual_intf(adapter->wiphy, &priv->wdev); + wiphy_unlock(adapter->wiphy); rtnl_unlock(); } diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c index 18964e2a9f28..b4dd60b2ebc9 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.c +++ b/drivers/net/wireless/quantenna/qtnfmac/core.c @@ -611,8 +611,9 @@ static int qtnf_core_mac_attach(struct qtnf_bus *bus, unsigned int macid) mac->wiphy_registered = 1; rtnl_lock(); - + wiphy_lock(priv_to_wiphy(mac)); ret = qtnf_core_net_attach(mac, vif, "wlan%d", NET_NAME_ENUM); + wiphy_unlock(priv_to_wiphy(mac)); rtnl_unlock(); if (ret) { diff --git a/drivers/net/wireless/virt_wifi.c b/drivers/net/wireless/virt_wifi.c index c878097f0dda..4b455a4ae15b 100644 --- a/drivers/net/wireless/virt_wifi.c +++ b/drivers/net/wireless/virt_wifi.c @@ -537,7 +537,9 @@ static int virt_wifi_newlink(struct net *src_net, struct net_device *dev, dev->ieee80211_ptr->iftype = NL80211_IFTYPE_STATION; dev->ieee80211_ptr->wiphy = common_wiphy; + wiphy_lock(common_wiphy); err = register_netdevice(dev); + wiphy_unlock(common_wiphy); if (err) { dev_err(&priv->lowerdev->dev, "can't register_netdevice: %d\n", err); @@ -560,7 +562,9 @@ static int virt_wifi_newlink(struct net *src_net, struct net_device *dev, return 0; unregister_netdev: + wiphy_lock(common_wiphy); unregister_netdevice(dev); + wiphy_unlock(common_wiphy); free_wireless_dev: kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; @@ -586,7 +590,9 @@ static void virt_wifi_dellink(struct net_device *dev, netdev_rx_handler_unregister(priv->lowerdev); netdev_upper_dev_unlink(priv->lowerdev, dev); + wiphy_lock(common_wiphy); unregister_netdevice_queue(dev, head); + wiphy_unlock(common_wiphy); module_put(THIS_MODULE); /* Deleting the wiphy is handled in the module destructor. */ @@ -625,7 +631,9 @@ static int virt_wifi_event(struct notifier_block *this, unsigned long event, upper_dev = priv->upperdev; upper_dev->rtnl_link_ops->dellink(upper_dev, &list_kill); + wiphy_lock(common_wiphy); unregister_netdevice_many(&list_kill); + wiphy_unlock(common_wiphy); break; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e7703fdbac8d..4741d71ead21 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3632,9 +3632,10 @@ struct mgmt_frame_regs { * All callbacks except where otherwise noted should return 0 * on success or a negative error code. * - * All operations are currently invoked under rtnl for consistency with the - * wireless extensions but this is subject to reevaluation as soon as this - * code is used more widely and we have a first user without wext. + * All operations are invoked with the wiphy mutex held. The RTNL may be + * held in addition (due to wireless extensions) but this cannot be relied + * upon except in cases where documented below. Note that due to ordering, + * the RTNL also cannot be acquired in any handlers. * * @suspend: wiphy device needs to be suspended. The variable @wow will * be %NULL or contain the enabled Wake-on-Wireless triggers that are @@ -3649,11 +3650,14 @@ struct mgmt_frame_regs { * the new netdev in the wiphy's network namespace! Returns the struct * wireless_dev, or an ERR_PTR. For P2P device wdevs, the driver must * also set the address member in the wdev. + * This additionally holds the RTNL to be able to do netdev changes. * * @del_virtual_intf: remove the virtual interface + * This additionally holds the RTNL to be able to do netdev changes. * * @change_virtual_intf: change type/configuration of virtual interface, * keep the struct wireless_dev's iftype updated. + * This additionally holds the RTNL to be able to do netdev changes. * * @add_key: add a key with the given parameters. @mac_addr will be %NULL * when adding a group key. @@ -4743,6 +4747,7 @@ struct wiphy_iftype_akm_suites { /** * struct wiphy - wireless hardware description + * @mtx: mutex for the data (structures) of this device * @reg_notifier: the driver's regulatory notification callback, * note that if your driver uses wiphy_apply_custom_regulatory() * the reg_notifier's request can be passed as NULL @@ -4936,6 +4941,8 @@ struct wiphy_iftype_akm_suites { * @sar_capa: SAR control capabilities */ struct wiphy { + struct mutex mtx; + /* assign these fields before you register the wiphy */ u8 perm_addr[ETH_ALEN]; @@ -5188,6 +5195,37 @@ static inline struct wiphy *wiphy_new(const struct cfg80211_ops *ops, */ int wiphy_register(struct wiphy *wiphy); +/* this is a define for better error reporting (file/line) */ +#define lockdep_assert_wiphy(wiphy) lockdep_assert_held(&(wiphy)->mtx) + +/** + * rcu_dereference_wiphy - rcu_dereference with debug checking + * @wiphy: the wiphy to check the locking on + * @p: The pointer to read, prior to dereferencing + * + * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() + * or RTNL. Note: Please prefer wiphy_dereference() or rcu_dereference(). + */ +#define rcu_dereference_wiphy(wiphy, p) \ + rcu_dereference_check(p, lockdep_is_held(&wiphy->mtx)) + +/** + * wiphy_dereference - fetch RCU pointer when updates are prevented by wiphy mtx + * @wiphy: the wiphy to check the locking on + * @p: The pointer to read, prior to dereferencing + * + * Return the value of the specified RCU-protected pointer, but omit the + * READ_ONCE(), because caller holds the wiphy mutex used for updates. + */ +#define wiphy_dereference(wiphy, p) \ + rcu_dereference_protected(p, lockdep_is_held(&wiphy->mtx)) + +/** + * get_wiphy_regdom - get custom regdomain for the given wiphy + * @wiphy: the wiphy to get the regdomain from + */ +const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy); + /** * wiphy_unregister - deregister a wiphy from cfg80211 * @@ -5212,6 +5250,35 @@ struct cfg80211_internal_bss; struct cfg80211_cached_keys; struct cfg80211_cqm_config; +/** + * wiphy_lock - lock the wiphy + * @wiphy: the wiphy to lock + * + * This is mostly exposed so it can be done around registering and + * unregistering netdevs that aren't created through cfg80211 calls, + * since that requires locking in cfg80211 when the notifiers is + * called, but that cannot differentiate which way it's called. + * + * When cfg80211 ops are called, the wiphy is already locked. + */ +static inline void wiphy_lock(struct wiphy *wiphy) + __acquires(&wiphy->mtx) +{ + mutex_lock(&wiphy->mtx); + __acquire(&wiphy->mtx); +} + +/** + * wiphy_unlock - unlock the wiphy again + * @wiphy: the wiphy to unlock + */ +static inline void wiphy_unlock(struct wiphy *wiphy) + __releases(&wiphy->mtx) +{ + __release(&wiphy->mtx); + mutex_unlock(&wiphy->mtx); +} + /** * struct wireless_dev - wireless device state * @@ -5219,7 +5286,10 @@ struct cfg80211_cqm_config; * that uses the ieee80211_ptr field in struct net_device (this * is intentional so it can be allocated along with the netdev.) * It need not be registered then as netdev registration will - * be intercepted by cfg80211 to see the new wireless device. + * be intercepted by cfg80211 to see the new wireless device, + * however, drivers must lock the wiphy before registering or + * unregistering netdevs if they pre-create any netdevs (in ops + * called from cfg80211, the wiphy is already locked.) * * For non-netdev uses, it must also be allocated by the driver * in response to the cfg80211 callbacks that require it, as @@ -5981,18 +6051,18 @@ int regulatory_set_wiphy_regd(struct wiphy *wiphy, struct ieee80211_regdomain *rd); /** - * regulatory_set_wiphy_regd_sync_rtnl - set regdom for self-managed drivers + * regulatory_set_wiphy_regd_sync - set regdom for self-managed drivers * @wiphy: the wireless device we want to process the regulatory domain on * @rd: the regulatory domain information to use for this wiphy * - * This functions requires the RTNL to be held and applies the new regdomain - * synchronously to this wiphy. For more details see - * regulatory_set_wiphy_regd(). + * This functions requires the RTNL and the wiphy mutex to be held and + * applies the new regdomain synchronously to this wiphy. For more details + * see regulatory_set_wiphy_regd(). * * Return: 0 on success. -EINVAL, -EPERM */ -int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy, - struct ieee80211_regdomain *rd); +int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy, + struct ieee80211_regdomain *rd); /** * wiphy_apply_custom_regulatory - apply a custom driver regulatory domain @@ -6110,7 +6180,7 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid); /** - * cfg80211_sched_scan_stopped_rtnl - notify that the scheduled scan has stopped + * cfg80211_sched_scan_stopped_locked - notify that the scheduled scan has stopped * * @wiphy: the wiphy on which the scheduled scan stopped * @reqid: identifier for the related scheduled scan request @@ -6118,9 +6188,9 @@ void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid); * The driver can call this function to inform cfg80211 that the * scheduled scan had to be stopped, for whatever reason. The driver * is then called back via the sched_scan_stop operation when done. - * This function should be called with rtnl locked. + * This function should be called with the wiphy mutex held. */ -void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy, u64 reqid); +void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid); /** * cfg80211_inform_bss_frame_data - inform cfg80211 of a received BSS frame @@ -7557,7 +7627,7 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, * also checks if IR-relaxation conditions apply, to allow beaconing under * more permissive conditions. * - * Requires the RTNL to be held. + * Requires the wiphy mutex to be held. */ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, @@ -7661,7 +7731,7 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate); * when the driver wishes to unregister the wdev, e.g. when the hardware device * is unbound from the driver. * - * Requires the RTNL to be held. + * Requires the RTNL and wiphy mutex to be held. */ void cfg80211_unregister_wdev(struct wireless_dev *wdev); @@ -7673,6 +7743,8 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev); * than register_netdevice(), unregister_netdev() is impossible as the RTNL is * held. Otherwise, both register_netdevice() and register_netdev() are usable * instead as well. + * + * Requires the RTNL and wiphy mutex to be held. */ int cfg80211_register_netdevice(struct net_device *dev); @@ -7684,6 +7756,8 @@ int cfg80211_register_netdevice(struct net_device *dev); * than unregister_netdevice(), unregister_netdev() is impossible as the RTNL * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are * usable instead as well. + * + * Requires the RTNL and wiphy mutex to be held. */ static inline void cfg80211_unregister_netdevice(struct net_device *dev) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 176fe0d9f67f..2d1d629e5d14 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5529,7 +5529,7 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, void *data); /** - * ieee80211_iterate_active_interfaces_rtnl - iterate active interfaces + * ieee80211_iterate_active_interfaces_mtx - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. @@ -5540,12 +5540,12 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ -void ieee80211_iterate_active_interfaces_rtnl(struct ieee80211_hw *hw, - u32 iter_flags, - void (*iterator)(void *data, +void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, + u32 iter_flags, + void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), - void *data); + void *data); /** * ieee80211_iterate_stations_atomic - iterate stations diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index fcaf4d20cabf..4bc350cf4eee 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -357,11 +357,14 @@ static int ieee80211_open(struct net_device *dev) if (err) return err; - return ieee80211_do_open(&sdata->wdev, true); + wiphy_lock(sdata->local->hw.wiphy); + err = ieee80211_do_open(&sdata->wdev, true); + wiphy_unlock(sdata->local->hw.wiphy); + + return err; } -static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, - bool going_down) +static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { struct ieee80211_local *local = sdata->local; unsigned long flags; @@ -637,7 +640,9 @@ static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + wiphy_lock(sdata->local->hw.wiphy); ieee80211_do_stop(sdata, true); + wiphy_unlock(sdata->local->hw.wiphy); return 0; } @@ -2057,13 +2062,16 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) list_add(&sdata->list, &wdev_list); } mutex_unlock(&local->iflist_mtx); + unregister_netdevice_many(&unreg_list); + wiphy_lock(local->hw.wiphy); list_for_each_entry_safe(sdata, tmp, &wdev_list, list) { list_del(&sdata->list); cfg80211_unregister_wdev(&sdata->wdev); kfree(sdata); } + wiphy_unlock(local->hw.wiphy); } static int netdev_notify(struct notifier_block *nb, diff --git a/net/mac80211/key.c b/net/mac80211/key.c index a4817aa4b171..56c068cb49c4 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -887,7 +887,7 @@ void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata) struct ieee80211_key *key; struct ieee80211_sub_if_data *vlan; - ASSERT_RTNL(); + lockdep_assert_wiphy(sdata->local->hw.wiphy); mutex_lock(&sdata->local->key_mtx); @@ -924,7 +924,7 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_key *key, *tmp; struct ieee80211_sub_if_data *sdata; - ASSERT_RTNL(); + lockdep_assert_wiphy(hw->wiphy); mutex_lock(&local->key_mtx); if (vif) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index dee88ec566ad..4f3f8bb58e76 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -261,7 +261,9 @@ static void ieee80211_restart_work(struct work_struct *work) "%s called with hardware scan in progress\n", __func__); flush_work(&local->radar_detected_work); + /* we might do interface manipulations, so need both */ rtnl_lock(); + wiphy_lock(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { /* * XXX: there may be more work for other vif types and even @@ -293,6 +295,7 @@ static void ieee80211_restart_work(struct work_struct *work) synchronize_net(); ieee80211_reconfig(local); + wiphy_unlock(local->hw.wiphy); rtnl_unlock(); } @@ -1272,6 +1275,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rate_control_add_debugfs(local); rtnl_lock(); + wiphy_lock(hw->wiphy); /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && @@ -1285,6 +1289,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) "Failed to add default virtual iface\n"); } + wiphy_unlock(hw->wiphy); rtnl_unlock(); #ifdef CONFIG_INET diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index ae378a41c927..7809a906d7fe 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -1,4 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Portions + * Copyright (C) 2020-2021 Intel Corporation + */ #include #include @@ -11,7 +15,7 @@ static void ieee80211_sched_scan_cancel(struct ieee80211_local *local) { if (ieee80211_request_sched_scan_stop(local)) return; - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); + cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index e01e4daeb8cd..f91d02b81b92 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1927,7 +1927,7 @@ ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_data *tf = (void *)skb->data; struct wiphy *wiphy = sdata->local->hw.wiphy; - ASSERT_RTNL(); + lockdep_assert_wiphy(wiphy); /* make sure the driver supports it */ if (!(wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH)) @@ -1979,7 +1979,7 @@ void ieee80211_tdls_chsw_work(struct work_struct *wk) struct sk_buff *skb; struct ieee80211_tdls_data *tf; - rtnl_lock(); + wiphy_lock(local->hw.wiphy); while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) { tf = (struct ieee80211_tdls_data *)skb->data; list_for_each_entry(sdata, &local->interfaces, list) { @@ -1994,7 +1994,7 @@ void ieee80211_tdls_chsw_work(struct work_struct *wk) kfree_skb(skb); } - rtnl_unlock(); + wiphy_unlock(local->hw.wiphy); } void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 8d3ae6b2f95f..f080fcf60e45 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -832,7 +832,7 @@ void ieee80211_iterate_active_interfaces_atomic( } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); -void ieee80211_iterate_active_interfaces_rtnl( +void ieee80211_iterate_active_interfaces_mtx( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), @@ -840,12 +840,12 @@ void ieee80211_iterate_active_interfaces_rtnl( { struct ieee80211_local *local = hw_to_local(hw); - ASSERT_RTNL(); + lockdep_assert_wiphy(hw->wiphy); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } -EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_rtnl); +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, @@ -2595,7 +2595,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->mtx); if (sched_scan_stopped) - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); + cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: @@ -3811,7 +3811,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) struct cfg80211_chan_def chandef; /* for interface list, to avoid linking iflist_mtx and chanctx_mtx */ - ASSERT_RTNL(); + lockdep_assert_wiphy(local->hw.wiphy); mutex_lock(&local->mtx); list_for_each_entry(sdata, &local->interfaces, list) { @@ -3851,9 +3851,9 @@ void ieee80211_dfs_radar_detected_work(struct work_struct *work) } mutex_unlock(&local->chanctx_mtx); - rtnl_lock(); + wiphy_lock(local->hw.wiphy); ieee80211_dfs_cac_cancel(local); - rtnl_unlock(); + wiphy_unlock(local->hw.wiphy); if (num_chanctx > 1) /* XXX: multi-channel is not supported yet */ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index e4030f1fbc60..285b8076054b 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -1093,7 +1093,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, struct wireless_dev *wdev; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) || !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) @@ -1216,9 +1216,10 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_iftype iftype) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); bool check_no_ir; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); /* * Under certain conditions suggested by some regulatory bodies a diff --git a/net/wireless/core.c b/net/wireless/core.c index 9e7d1f9620bd..200cd9f5fd5f 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -222,7 +222,7 @@ static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) return; @@ -247,7 +247,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) return; @@ -273,7 +273,11 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) dev_close(wdev->netdev); continue; } + /* otherwise, check iftype */ + + wiphy_lock(wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); @@ -284,6 +288,8 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) default: break; } + + wiphy_unlock(wiphy); } } EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); @@ -318,9 +324,9 @@ static void cfg80211_event_work(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, event_work); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); cfg80211_process_rdev_events(rdev); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) @@ -475,6 +481,7 @@ use_default_name: } } + mutex_init(&rdev->wiphy.mtx); INIT_LIST_HEAD(&rdev->wiphy.wdev_list); INIT_LIST_HEAD(&rdev->beacon_registrations); spin_lock_init(&rdev->beacon_registrations_lock); @@ -1007,15 +1014,16 @@ void wiphy_unregister(struct wiphy *wiphy) wait_event(rdev->dev_wait, ({ int __count; - rtnl_lock(); + wiphy_lock(&rdev->wiphy); __count = rdev->opencount; - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); __count == 0; })); if (rdev->rfkill) rfkill_unregister(rdev->rfkill); rtnl_lock(); + wiphy_lock(&rdev->wiphy); nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; @@ -1038,6 +1046,7 @@ void wiphy_unregister(struct wiphy *wiphy) cfg80211_rdev_list_generation++; device_del(&rdev->wiphy.dev); + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); flush_work(&rdev->scan_done_wk); @@ -1070,6 +1079,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) } list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) cfg80211_put_bss(&rdev->wiphy, &scan->pub); + mutex_destroy(&rdev->wiphy.mtx); kfree(rdev); } @@ -1100,6 +1110,7 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); flush_work(&wdev->pmsr_free_wk); @@ -1166,7 +1177,7 @@ static const struct device_type wiphy_type = { void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); rdev->num_running_ifaces += num; if (iftype == NL80211_IFTYPE_MONITOR) @@ -1179,7 +1190,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); cfg80211_pmsr_wdev_down(wdev); @@ -1296,6 +1307,9 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { + ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); + /* * We get here also when the interface changes network namespaces, * as it's registered into the new one, but we don't want it to @@ -1375,21 +1389,30 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: - if (!wdev->registered) + if (!wdev->registered) { + wiphy_lock(&rdev->wiphy); cfg80211_register_wdev(rdev, wdev); + wiphy_unlock(&rdev->wiphy); + } break; case NETDEV_UNREGISTER: /* * It is possible to get NETDEV_UNREGISTER multiple times, * so check wdev->registered. */ - if (wdev->registered) + if (wdev->registered) { + wiphy_lock(&rdev->wiphy); _cfg80211_unregister_wdev(wdev, false); + wiphy_unlock(&rdev->wiphy); + } break; case NETDEV_GOING_DOWN: + wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); + wiphy_unlock(&rdev->wiphy); break; case NETDEV_DOWN: + wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && @@ -1406,9 +1429,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, } rdev->opencount--; + wiphy_unlock(&rdev->wiphy); wake_up(&rdev->dev_wait); break; case NETDEV_UP: + wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, 1); wdev_lock(wdev); switch (wdev->iftype) { @@ -1455,6 +1480,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, /* assume this means it's off */ wdev->ps = false; } + wiphy_unlock(&rdev->wiphy); break; case NETDEV_PRE_UP: if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, diff --git a/net/wireless/core.h b/net/wireless/core.h index 7df91f940212..a7d19b4b40ac 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -231,7 +231,7 @@ static inline void wdev_unlock(struct wireless_dev *wdev) static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces && rdev->num_running_ifaces > 0; diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 76b845f68ac8..aab43469a2f0 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -73,8 +73,6 @@ static ssize_t ht40allow_map_read(struct file *file, if (!buf) return -ENOMEM; - rtnl_lock(); - for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) @@ -84,8 +82,6 @@ static ssize_t ht40allow_map_read(struct file *file, buf, buf_size, offset); } - rtnl_unlock(); - r = simple_read_from_buffer(user_buf, count, ppos, buf, offset); kfree(buf); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index a0621bb76d8e..8f98e546becf 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -3,6 +3,7 @@ * Some IBSS support code for cfg80211. * * Copyright 2009 Johannes Berg + * Copyright (C) 2020-2021 Intel Corporation */ #include @@ -92,7 +93,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = dev->ieee80211_ptr; int err; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); if (wdev->ssid_len) diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e1e90761dc00..3aa69b375a10 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -450,7 +450,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) struct cfg80211_mgmt_registration *reg; struct mgmt_frame_regs upd = {}; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_bh(&wdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { @@ -492,10 +492,10 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 775d0c4d86c3..e5e9d889f00f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -64,9 +64,9 @@ static const struct genl_multicast_group nl80211_mcgrps[] = { /* returns ERR_PTR values */ static struct wireless_dev * -__cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) +__cfg80211_wdev_from_attrs(struct cfg80211_registered_device *rdev, + struct net *netns, struct nlattr **attrs) { - struct cfg80211_registered_device *rdev; struct wireless_dev *result = NULL; bool have_ifidx = attrs[NL80211_ATTR_IFINDEX]; bool have_wdev_id = attrs[NL80211_ATTR_WDEV]; @@ -74,8 +74,6 @@ __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) int wiphy_idx = -1; int ifidx = -1; - ASSERT_RTNL(); - if (!have_ifidx && !have_wdev_id) return ERR_PTR(-EINVAL); @@ -86,6 +84,28 @@ __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) wiphy_idx = wdev_id >> 32; } + if (rdev) { + struct wireless_dev *wdev; + + lockdep_assert_held(&rdev->wiphy.mtx); + + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + if (have_ifidx && wdev->netdev && + wdev->netdev->ifindex == ifidx) { + result = wdev; + break; + } + if (have_wdev_id && wdev->identifier == (u32)wdev_id) { + result = wdev; + break; + } + } + + return result ?: ERR_PTR(-ENODEV); + } + + ASSERT_RTNL(); + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { struct wireless_dev *wdev; @@ -914,22 +934,31 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb, return err; } - *wdev = __cfg80211_wdev_from_attrs(sock_net(cb->skb->sk), + rtnl_lock(); + *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(cb->skb->sk), attrbuf); kfree(attrbuf); - if (IS_ERR(*wdev)) + if (IS_ERR(*wdev)) { + rtnl_unlock(); return PTR_ERR(*wdev); + } *rdev = wiphy_to_rdev((*wdev)->wiphy); + mutex_lock(&(*rdev)->wiphy.mtx); + rtnl_unlock(); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wiphy_idx + 1; cb->args[1] = (*wdev)->identifier; } else { /* subtract the 1 again here */ - struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); + struct wiphy *wiphy; struct wireless_dev *tmp; - if (!wiphy) + rtnl_lock(); + wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1); + if (!wiphy) { + rtnl_unlock(); return -ENODEV; + } *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -940,8 +969,12 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb, } } - if (!*wdev) + if (!*wdev) { + rtnl_unlock(); return -ENODEV; + } + mutex_lock(&(*rdev)->wiphy.mtx); + rtnl_unlock(); } return 0; @@ -3141,7 +3174,7 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = NULL; struct net_device *netdev = NULL; struct wireless_dev *wdev; int result = 0, rem_txq_params = 0; @@ -3152,8 +3185,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u8 coverage_class = 0; u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0; - ASSERT_RTNL(); - + rtnl_lock(); /* * Try to find the wiphy and netdev. Normally this * function shouldn't need the netdev, but this is @@ -3177,14 +3209,19 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (!netdev) { rdev = __cfg80211_rdev_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(rdev)) + if (IS_ERR(rdev)) { + rtnl_unlock(); return PTR_ERR(rdev); + } wdev = NULL; netdev = NULL; result = 0; } else wdev = netdev->ieee80211_ptr; + wiphy_lock(&rdev->wiphy); + rtnl_unlock(); + /* * end workaround code, by now the rdev is available * and locked, and wdev may or may not be NULL. @@ -3195,24 +3232,32 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME])); if (result) - return result; + goto out; if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; - if (!rdev->ops->set_txq_params) - return -EOPNOTSUPP; + if (!rdev->ops->set_txq_params) { + result = -EOPNOTSUPP; + goto out; + } - if (!netdev) - return -EINVAL; + if (!netdev) { + result = -EINVAL; + goto out; + } if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) - return -EINVAL; + netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + result = -EINVAL; + goto out; + } - if (!netif_running(netdev)) - return -ENETDOWN; + if (!netif_running(netdev)) { + result = -ENETDOWN; + goto out; + } nla_for_each_nested(nl_txq_params, info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], @@ -3223,15 +3268,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) txq_params_policy, info->extack); if (result) - return result; + goto out; result = parse_txq_params(tb, &txq_params); if (result) - return result; + goto out; result = rdev_set_txq_params(rdev, netdev, &txq_params); if (result) - return result; + goto out; } } @@ -3241,7 +3286,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) nl80211_can_set_dev_channel(wdev) ? netdev : NULL, info); if (result) - return result; + goto out; } if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) { @@ -3252,15 +3297,19 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER)) txp_wdev = NULL; - if (!rdev->ops->set_tx_power) - return -EOPNOTSUPP; + if (!rdev->ops->set_tx_power) { + result = -EOPNOTSUPP; + goto out; + } idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING; type = nla_get_u32(info->attrs[idx]); if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] && - (type != NL80211_TX_POWER_AUTOMATIC)) - return -EINVAL; + (type != NL80211_TX_POWER_AUTOMATIC)) { + result = -EINVAL; + goto out; + } if (type != NL80211_TX_POWER_AUTOMATIC) { idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL; @@ -3269,7 +3318,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); if (result) - return result; + goto out; } if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && @@ -3278,8 +3327,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if ((!rdev->wiphy.available_antennas_tx && !rdev->wiphy.available_antennas_rx) || - !rdev->ops->set_antenna) - return -EOPNOTSUPP; + !rdev->ops->set_antenna) { + result = -EOPNOTSUPP; + goto out; + } tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]); rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]); @@ -3287,15 +3338,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) /* reject antenna configurations which don't match the * available antenna masks, except for the "all" mask */ if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) || - (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) - return -EINVAL; + (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) { + result = -EINVAL; + goto out; + } tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; result = rdev_set_antenna(rdev, tx_ant, rx_ant); if (result) - return result; + goto out; } changed = 0; @@ -3317,8 +3370,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) { frag_threshold = nla_get_u32( info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]); - if (frag_threshold < 256) - return -EINVAL; + if (frag_threshold < 256) { + result = -EINVAL; + goto out; + } if (frag_threshold != (u32) -1) { /* @@ -3339,8 +3394,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { - if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) - return -EINVAL; + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { + result = -EINVAL; + goto out; + } coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); @@ -3348,16 +3405,20 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { - if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) - return -EOPNOTSUPP; + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) { + result = -EOPNOTSUPP; + goto out; + } changed |= WIPHY_PARAM_DYN_ACK; } if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) - return -EOPNOTSUPP; + NL80211_EXT_FEATURE_TXQS)) { + result = -EOPNOTSUPP; + goto out; + } txq_limit = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_LIMIT]); changed |= WIPHY_PARAM_TXQ_LIMIT; @@ -3365,8 +3426,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) - return -EOPNOTSUPP; + NL80211_EXT_FEATURE_TXQS)) { + result = -EOPNOTSUPP; + goto out; + } txq_memory_limit = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]); changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT; @@ -3374,8 +3437,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) - return -EOPNOTSUPP; + NL80211_EXT_FEATURE_TXQS)) { + result = -EOPNOTSUPP; + goto out; + } txq_quantum = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_QUANTUM]); changed |= WIPHY_PARAM_TXQ_QUANTUM; @@ -3387,8 +3452,10 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u8 old_coverage_class; u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum; - if (!rdev->ops->set_wiphy_params) - return -EOPNOTSUPP; + if (!rdev->ops->set_wiphy_params) { + result = -EOPNOTSUPP; + goto out; + } old_retry_short = rdev->wiphy.retry_short; old_retry_long = rdev->wiphy.retry_long; @@ -3426,10 +3493,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.txq_limit = old_txq_limit; rdev->wiphy.txq_memory_limit = old_txq_memory_limit; rdev->wiphy.txq_quantum = old_txq_quantum; - return result; + goto out; } } - return 0; + + result = 0; + +out: + wiphy_unlock(&rdev->wiphy); + return result; } static int nl80211_send_chandef(struct sk_buff *msg, @@ -3959,6 +4031,17 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_virtual_intf) return -EOPNOTSUPP; + /* + * We hold RTNL, so this is safe, without RTNL opencount cannot + * reach 0, and thus the rdev cannot be deleted. + * + * We need to do it for the dev_close(), since that will call + * the netdev notifiers, and we need to acquire the mutex there + * but don't know if we get there from here or from some other + * place (e.g. "ip link set ... down"). + */ + mutex_unlock(&rdev->wiphy.mtx); + /* * If we remove a wireless device without a netdev then clear * user_ptr[1] so that nl80211_post_doit won't dereference it @@ -3968,6 +4051,10 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) */ if (!wdev->netdev) info->user_ptr[1] = NULL; + else + dev_close(wdev->netdev); + + mutex_lock(&rdev->wiphy.mtx); return rdev_del_virtual_intf(rdev, wdev); } @@ -5884,10 +5971,11 @@ static int nl80211_dump_station(struct sk_buff *skb, int sta_idx = cb->args[2]; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) - goto out_err; + return err; + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); if (!wdev->netdev) { err = -EINVAL; @@ -5922,7 +6010,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return err; } @@ -6780,10 +6868,11 @@ static int nl80211_dump_mpath(struct sk_buff *skb, int path_idx = cb->args[2]; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) - goto out_err; + return err; + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); if (!rdev->ops->dump_mpath) { err = -EOPNOTSUPP; @@ -6816,7 +6905,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return err; } @@ -6979,10 +7068,11 @@ static int nl80211_dump_mpp(struct sk_buff *skb, int path_idx = cb->args[2]; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); if (err) - goto out_err; + return err; + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); if (!rdev->ops->dump_mpp) { err = -EOPNOTSUPP; @@ -7015,7 +7105,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb, cb->args[2] = path_idx; err = skb->len; out_err: - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return err; } @@ -7634,12 +7724,15 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) if (!hdr) goto put_failure; + rtnl_lock(); + if (info->attrs[NL80211_ATTR_WIPHY]) { bool self_managed; rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { nlmsg_free(msg); + rtnl_unlock(); return PTR_ERR(rdev); } @@ -7651,6 +7744,7 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) /* a self-managed-reg device must have a private regdom */ if (WARN_ON(!regdom && self_managed)) { nlmsg_free(msg); + rtnl_unlock(); return -EINVAL; } @@ -7675,11 +7769,13 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); genlmsg_end(msg, hdr); + rtnl_unlock(); return genlmsg_reply(msg, info); nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: + rtnl_unlock(); put_failure: nlmsg_free(msg); return -EMSGSIZE; @@ -7842,12 +7938,17 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!reg_is_valid_request(alpha2)) - return -EINVAL; + rtnl_lock(); + if (!reg_is_valid_request(alpha2)) { + r = -EINVAL; + goto out; + } rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); - if (!rd) - return -ENOMEM; + if (!rd) { + r = -ENOMEM; + goto out; + } rd->n_reg_rules = num_rules; rd->alpha2[0] = alpha2[0]; @@ -7879,10 +7980,13 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) } } + r = set_regdom(rd, REGD_SOURCE_CRDA); /* set_regdom takes ownership of rd */ - return set_regdom(rd, REGD_SOURCE_CRDA); + rd = NULL; bad_reg: kfree(rd); + out: + rtnl_unlock(); return r; } #endif /* CONFIG_CFG80211_CRDA_SUPPORT */ @@ -9050,10 +9154,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_csa_settings params; - /* csa_attrs is defined static to avoid waste of stack size - this - * function is called under RTNL lock, so this should not be a problem. - */ - static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1]; + struct nlattr **csa_attrs = NULL; int err; bool need_new_beacon = false; bool need_handle_dfs_flag = true; @@ -9118,28 +9219,39 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; + csa_attrs = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*csa_attrs), + GFP_KERNEL); + if (!csa_attrs) + return -ENOMEM; + err = nla_parse_nested_deprecated(csa_attrs, NL80211_ATTR_MAX, info->attrs[NL80211_ATTR_CSA_IES], nl80211_policy, info->extack); if (err) - return err; + goto free; err = nl80211_parse_beacon(rdev, csa_attrs, ¶ms.beacon_csa); if (err) - return err; + goto free; - if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) - return -EINVAL; + if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) { + err = -EINVAL; + goto free; + } len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); - if (!len || (len % sizeof(u16))) - return -EINVAL; + if (!len || (len % sizeof(u16))) { + err = -EINVAL; + goto free; + } params.n_counter_offsets_beacon = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && (params.n_counter_offsets_beacon > - rdev->wiphy.max_num_csa_counters)) - return -EINVAL; + rdev->wiphy.max_num_csa_counters)) { + err = -EINVAL; + goto free; + } params.counter_offsets_beacon = nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); @@ -9148,23 +9260,31 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < params.n_counter_offsets_beacon; i++) { u16 offset = params.counter_offsets_beacon[i]; - if (offset >= params.beacon_csa.tail_len) - return -EINVAL; + if (offset >= params.beacon_csa.tail_len) { + err = -EINVAL; + goto free; + } - if (params.beacon_csa.tail[offset] != params.count) - return -EINVAL; + if (params.beacon_csa.tail[offset] != params.count) { + err = -EINVAL; + goto free; + } } if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); - if (!len || (len % sizeof(u16))) - return -EINVAL; + if (!len || (len % sizeof(u16))) { + err = -EINVAL; + goto free; + } params.n_counter_offsets_presp = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && (params.n_counter_offsets_presp > - rdev->wiphy.max_num_csa_counters)) - return -EINVAL; + rdev->wiphy.max_num_csa_counters)) { + err = -EINVAL; + goto free; + } params.counter_offsets_presp = nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); @@ -9173,35 +9293,42 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < params.n_counter_offsets_presp; i++) { u16 offset = params.counter_offsets_presp[i]; - if (offset >= params.beacon_csa.probe_resp_len) - return -EINVAL; + if (offset >= params.beacon_csa.probe_resp_len) { + err = -EINVAL; + goto free; + } if (params.beacon_csa.probe_resp[offset] != - params.count) - return -EINVAL; + params.count) { + err = -EINVAL; + goto free; + } } } skip_beacons: err = nl80211_parse_chandef(rdev, info, ¶ms.chandef); if (err) - return err; + goto free; if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms.chandef, - wdev->iftype)) - return -EINVAL; + wdev->iftype)) { + err = -EINVAL; + goto free; + } err = cfg80211_chandef_dfs_required(wdev->wiphy, ¶ms.chandef, wdev->iftype); if (err < 0) - return err; + goto free; if (err > 0) { params.radar_required = true; if (need_handle_dfs_flag && !nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS])) { - return -EINVAL; + err = -EINVAL; + goto free; } } @@ -9212,6 +9339,8 @@ skip_beacons: err = rdev_channel_switch(rdev, dev, ¶ms); wdev_unlock(wdev); +free: + kfree(csa_attrs); return err; } @@ -9362,12 +9491,11 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[2], idx = 0; int err; - rtnl_lock(); err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); - if (err) { - rtnl_unlock(); + if (err) return err; - } + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); wdev_lock(wdev); spin_lock_bh(&rdev->bss_lock); @@ -9398,7 +9526,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) wdev_unlock(wdev); cb->args[2] = idx; - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return skb->len; } @@ -9496,10 +9624,13 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) if (!attrbuf) return -ENOMEM; - rtnl_lock(); res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); - if (res) - goto out_err; + if (res) { + kfree(attrbuf); + return res; + } + /* nl80211_prepare_wdev_dump acquired it in the successful case */ + __acquire(&rdev->wiphy.mtx); /* prepare_wdev_dump parsed the attributes */ radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS]; @@ -9541,7 +9672,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) res = skb->len; out_err: kfree(attrbuf); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); return res; } @@ -10403,10 +10534,14 @@ EXPORT_SYMBOL(__cfg80211_send_event_skb); static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct wireless_dev *wdev = - __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs); + struct wireless_dev *wdev; int err; + lockdep_assert_held(&rdev->wiphy.mtx); + + wdev = __cfg80211_wdev_from_attrs(rdev, genl_info_net(info), + info->attrs); + if (!rdev->ops->testmode_cmd) return -EOPNOTSUPP; @@ -13591,7 +13726,8 @@ static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = - __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs); + __cfg80211_wdev_from_attrs(rdev, genl_info_net(info), + info->attrs); int i, err; u32 vid, subcmd; @@ -13715,7 +13851,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, goto out; } - *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf); + *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(skb->sk), attrbuf); if (IS_ERR(*wdev)) *wdev = NULL; @@ -14650,31 +14786,24 @@ bad_tid_conf: static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = NULL; struct wireless_dev *wdev; struct net_device *dev; - bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL; - - if (rtnl) - rtnl_lock(); + rtnl_lock(); if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) { rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return PTR_ERR(rdev); } info->user_ptr[0] = rdev; } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV || ops->internal_flags & NL80211_FLAG_NEED_WDEV) { - ASSERT_RTNL(); - - wdev = __cfg80211_wdev_from_attrs(genl_info_net(info), + wdev = __cfg80211_wdev_from_attrs(NULL, genl_info_net(info), info->attrs); if (IS_ERR(wdev)) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return PTR_ERR(wdev); } @@ -14683,8 +14812,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) { if (!dev) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return -EINVAL; } @@ -14695,8 +14823,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && !wdev_running(wdev)) { - if (rtnl) - rtnl_unlock(); + rtnl_unlock(); return -ENETDOWN; } @@ -14706,6 +14833,14 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, info->user_ptr[0] = rdev; } + if (rdev) { + wiphy_lock(&rdev->wiphy); + /* we keep the mutex locked until post_doit */ + __release(&rdev->wiphy.mtx); + } + if (!(ops->internal_flags & NL80211_FLAG_NEED_RTNL)) + rtnl_unlock(); + return 0; } @@ -14723,6 +14858,14 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, } } + if (info->user_ptr[0]) { + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + + /* we kept the mutex locked since pre_doit */ + __acquire(&rdev->wiphy.mtx); + wiphy_unlock(&rdev->wiphy); + } + if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) rtnl_unlock(); @@ -14851,8 +14994,7 @@ static const struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_wiphy, .done = nl80211_dump_wiphy_done, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, }; @@ -14862,7 +15004,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wiphy, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_INTERFACE, @@ -14870,8 +15011,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_interface, .dumpit = nl80211_dump_interface, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV, }, { .cmd = NL80211_CMD_SET_INTERFACE, @@ -14902,8 +15042,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_KEY, @@ -14911,7 +15050,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_set_key, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, }, { @@ -14920,7 +15058,6 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_new_key, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, }, { @@ -14928,64 +15065,56 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_BEACON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_set_beacon, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_START_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_STOP_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_station, .dumpit = nl80211_dump_station, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_NEW_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DEL_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_MPATH, @@ -14993,8 +15122,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_mpath, .dumpit = nl80211_dump_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_MPP, @@ -15002,47 +15130,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_get_mpp, .dumpit = nl80211_dump_mpp, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_NEW_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DEL_MPATH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_mpath, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_BSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_bss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_REG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_reg_do, .dumpit = nl80211_get_reg_dump, - .internal_flags = NL80211_FLAG_NEED_RTNL, + .internal_flags = 0, /* can be retrieved by unprivileged users */ }, #ifdef CONFIG_CFG80211_CRDA_SUPPORT @@ -15051,7 +15174,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_reg, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_RTNL, + .internal_flags = 0, }, #endif { @@ -15071,32 +15194,28 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_mesh_config, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_MESH_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_mesh_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TRIGGER_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_trigger_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_ABORT_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_abort_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_GET_SCAN, @@ -15108,16 +15227,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_sched_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_STOP_SCHED_SCAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_sched_scan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_AUTHENTICATE, @@ -15125,7 +15242,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_authenticate, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15134,7 +15251,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_associate, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15142,32 +15259,28 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_deauthenticate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DISASSOCIATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disassociate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_JOIN_IBSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ibss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_LEAVE_IBSS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ibss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, #ifdef CONFIG_NL80211_TESTMODE { @@ -15176,8 +15289,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_testmode_do, .dumpit = nl80211_testmode_dump, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, #endif { @@ -15186,7 +15298,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_connect, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15195,7 +15307,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_update_connect_params, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15203,16 +15315,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disconnect, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_WIPHY_NETNS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_wiphy_netns, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_GET_SURVEY, @@ -15225,7 +15335,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15233,128 +15343,112 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_setdel_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_FLUSH_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_flush_pmksa, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_cancel_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_tx_bitrate_mask, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_REGISTER_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV, }, { .cmd = NL80211_CMD_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt_cancel_wait, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_SET_POWER_SAVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_power_save, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_GET_POWER_SAVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_power_save, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_CQM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_cqm, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_JOIN_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_mesh, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_LEAVE_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_mesh, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_JOIN_OCB, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ocb, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_LEAVE_OCB, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ocb, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, #ifdef CONFIG_PM { @@ -15362,16 +15456,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_wowlan, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_WOWLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wowlan, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, #endif { @@ -15380,7 +15472,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_set_rekey_data, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15388,48 +15480,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_mgmt, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TDLS_OPER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_oper, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_UNEXPECTED_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_unexpected_frame, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_PROBE_CLIENT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_probe_client, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_REGISTER_BEACONS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_beacons, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_NOACK_MAP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_noack_map, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_START_P2P_DEVICE, @@ -15468,48 +15554,42 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_add_func, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_DEL_NAN_FUNCTION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_del_func, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_CHANGE_NAN_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_change_config, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_SET_MCAST_RATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mcast_rate, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_MAC_ACL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mac_acl, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_RADAR_DETECT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_radar_detection, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES, @@ -15521,47 +15601,41 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_ft_ies, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_start, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_stop, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_GET_COALESCE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_coalesce, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_COALESCE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_coalesce, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_VENDOR, @@ -15570,7 +15644,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .dumpit = nl80211_vendor_cmd_dump, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { @@ -15578,123 +15652,108 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_qos_map, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_ADD_TX_TS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_add_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_DEL_TX_TS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_cancel_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_multicast_to_unicast, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL | + 0 | NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_EXTERNAL_AUTH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_external_auth, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_CONTROL_PORT_FRAME, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_control_port, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_ftm_responder_stats, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_PEER_MEASUREMENT_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_pmsr_start, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_WDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP, }, { .cmd = NL80211_CMD_NOTIFY_RADAR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_notify_radar_detection, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_UPDATE_OWE_INFO, .doit = nl80211_update_owe_info, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_PROBE_MESH_LINK, .doit = nl80211_probe_mesh_link, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, }, { .cmd = NL80211_CMD_SET_TID_CONFIG, .doit = nl80211_set_tid_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | - NL80211_FLAG_NEED_RTNL, + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_SAR_SPECS, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 8114bba8556c..452b698f42be 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -142,12 +142,15 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void) /* * Returns the regulatory domain associated with the wiphy. * - * Requires either RTNL or RCU protection + * Requires any of RTNL, wiphy mutex or RCU protection. */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { - return rcu_dereference_rtnl(wiphy->regd); + return rcu_dereference_check(wiphy->regd, + lockdep_is_held(&wiphy->mtx) || + lockdep_rtnl_is_held()); } +EXPORT_SYMBOL(get_wiphy_regdom); static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) { @@ -169,7 +172,9 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; + rcu_read_lock(); regd = get_cfg80211_regdom(); + if (!wiphy) goto out; @@ -186,6 +191,8 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) reg_dfs_region_str(regd->dfs_region)); out: + rcu_read_unlock(); + return regd->dfs_region; } @@ -2577,11 +2584,13 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, return; rtnl_lock(); + wiphy_lock(wiphy); tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); + wiphy_unlock(wiphy); rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); @@ -2744,7 +2753,10 @@ reg_process_hint_driver(struct wiphy *wiphy, return REG_REQ_IGNORE; tmp = get_wiphy_regdom(wiphy); + ASSERT_RTNL(); + wiphy_lock(wiphy); rcu_assign_pointer(wiphy->regd, regd); + wiphy_unlock(wiphy); rcu_free_regdom(tmp); } @@ -3076,41 +3088,52 @@ static void reg_process_pending_beacon_hints(void) spin_unlock_bh(®_pending_beacons_lock); } -static void reg_process_self_managed_hints(void) +static void reg_process_self_managed_hint(struct wiphy *wiphy) { - struct cfg80211_registered_device *rdev; - struct wiphy *wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const struct ieee80211_regdomain *tmp; const struct ieee80211_regdomain *regd; enum nl80211_band band; struct regulatory_request request = {}; - list_for_each_entry(rdev, &cfg80211_rdev_list, list) { - wiphy = &rdev->wiphy; + ASSERT_RTNL(); + lockdep_assert_wiphy(wiphy); - spin_lock(®_requests_lock); - regd = rdev->requested_regd; - rdev->requested_regd = NULL; - spin_unlock(®_requests_lock); + spin_lock(®_requests_lock); + regd = rdev->requested_regd; + rdev->requested_regd = NULL; + spin_unlock(®_requests_lock); - if (regd == NULL) - continue; + if (!regd) + return; - tmp = get_wiphy_regdom(wiphy); - rcu_assign_pointer(wiphy->regd, regd); - rcu_free_regdom(tmp); + tmp = get_wiphy_regdom(wiphy); + rcu_assign_pointer(wiphy->regd, regd); + rcu_free_regdom(tmp); + + for (band = 0; band < NUM_NL80211_BANDS; band++) + handle_band_custom(wiphy, wiphy->bands[band], regd); - for (band = 0; band < NUM_NL80211_BANDS; band++) - handle_band_custom(wiphy, wiphy->bands[band], regd); + reg_process_ht_flags(wiphy); - reg_process_ht_flags(wiphy); + request.wiphy_idx = get_wiphy_idx(wiphy); + request.alpha2[0] = regd->alpha2[0]; + request.alpha2[1] = regd->alpha2[1]; + request.initiator = NL80211_REGDOM_SET_BY_DRIVER; - request.wiphy_idx = get_wiphy_idx(wiphy); - request.alpha2[0] = regd->alpha2[0]; - request.alpha2[1] = regd->alpha2[1]; - request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + nl80211_send_wiphy_reg_change_event(&request); +} - nl80211_send_wiphy_reg_change_event(&request); +static void reg_process_self_managed_hints(void) +{ + struct cfg80211_registered_device *rdev; + + ASSERT_RTNL(); + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + wiphy_lock(&rdev->wiphy); + reg_process_self_managed_hint(&rdev->wiphy); + wiphy_unlock(&rdev->wiphy); } reg_check_channels(); @@ -3789,14 +3812,21 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, return -ENODEV; if (!driver_request->intersect) { - if (request_wiphy->regd) + ASSERT_RTNL(); + wiphy_lock(request_wiphy); + if (request_wiphy->regd) { + wiphy_unlock(request_wiphy); return -EALREADY; + } regd = reg_copy_regd(rd); - if (IS_ERR(regd)) + if (IS_ERR(regd)) { + wiphy_unlock(request_wiphy); return PTR_ERR(regd); + } rcu_assign_pointer(request_wiphy->regd, regd); + wiphy_unlock(request_wiphy); reset_regdomains(false, rd); return 0; } @@ -3978,8 +4008,8 @@ int regulatory_set_wiphy_regd(struct wiphy *wiphy, } EXPORT_SYMBOL(regulatory_set_wiphy_regd); -int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy, - struct ieee80211_regdomain *rd) +int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy, + struct ieee80211_regdomain *rd) { int ret; @@ -3990,10 +4020,11 @@ int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy, return ret; /* process the request immediately */ - reg_process_self_managed_hints(); + reg_process_self_managed_hint(wiphy); + reg_check_channels(); return 0; } -EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl); +EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync); void wiphy_regulatory_register(struct wiphy *wiphy) { diff --git a/net/wireless/reg.h b/net/wireless/reg.h index f9e83031a40a..f3707f729024 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -63,7 +63,6 @@ unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule); bool reg_last_request_cell_base(void); -const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy); /** * regulatory_hint_found_beacon - hints a beacon was found on a channel diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 1b7fec3b53cd..019952d4fc7d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -918,7 +918,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, union iwreq_data wrqu; #endif - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (rdev->scan_msg) { nl80211_send_scan_msg(rdev, rdev->scan_msg); @@ -987,9 +987,9 @@ void __cfg80211_scan_done(struct work_struct *wk) rdev = container_of(wk, struct cfg80211_registered_device, scan_done_wk); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); ___cfg80211_scan_done(rdev, true); - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } void cfg80211_scan_done(struct cfg80211_scan_request *request, @@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(cfg80211_scan_done); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); list_add_rcu(&req->list, &rdev->sched_scan_req_list); } @@ -1030,7 +1030,7 @@ void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); list_del_rcu(&req->list); kfree_rcu(req, rcu_head); @@ -1042,7 +1042,7 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) struct cfg80211_sched_scan_request *pos; list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list, - lockdep_rtnl_is_held()) { + lockdep_is_held(&rdev->wiphy.mtx)) { if (pos->reqid == reqid) return pos; } @@ -1090,7 +1090,7 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); - rtnl_lock(); + wiphy_lock(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; @@ -1105,7 +1105,7 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) NL80211_CMD_SCHED_SCAN_RESULTS); } } - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) @@ -1126,23 +1126,23 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) } EXPORT_SYMBOL(cfg80211_sched_scan_results); -void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy, u64 reqid) +void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - ASSERT_RTNL(); + lockdep_assert_held(&wiphy->mtx); trace_cfg80211_sched_scan_stopped(wiphy, reqid); __cfg80211_stop_sched_scan(rdev, reqid, true); } -EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl); +EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { - rtnl_lock(); - cfg80211_sched_scan_stopped_rtnl(wiphy, reqid); - rtnl_unlock(); + wiphy_lock(wiphy); + cfg80211_sched_scan_stopped_locked(wiphy, reqid); + wiphy_unlock(wiphy); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); @@ -1150,7 +1150,7 @@ int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated) { - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); if (!driver_initiated) { int err = rdev_sched_scan_stop(rdev, req->dev, req->reqid); @@ -1170,7 +1170,7 @@ int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, { struct cfg80211_sched_scan_request *sched_scan_req; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); sched_scan_req = cfg80211_find_sched_scan_req(rdev, reqid); if (!sched_scan_req) @@ -2774,6 +2774,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, eth_broadcast_addr(creq->bssid); + wiphy_lock(&rdev->wiphy); + rdev->scan_req = creq; err = rdev_scan(rdev, creq); if (err) { @@ -2785,6 +2787,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, creq = NULL; dev_hold(dev); } + wiphy_unlock(&rdev->wiphy); out: kfree(creq); return err; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 38df713f2e2e..07756ca5e3b5 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -67,7 +67,6 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) struct cfg80211_scan_request *request; int n_channels, err; - ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); if (rdev->scan_req || rdev->scan_msg) @@ -233,7 +232,7 @@ void cfg80211_conn_work(struct work_struct *work) u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; - rtnl_lock(); + wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) @@ -266,7 +265,7 @@ void cfg80211_conn_work(struct work_struct *work) wdev_unlock(wdev); } - rtnl_unlock(); + wiphy_unlock(&rdev->wiphy); } /* Returned bss is reference counted and must be cleaned up appropriately. */ diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 3ac1f48195d2..043762354a66 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -5,6 +5,7 @@ * * Copyright 2005-2006 Jiri Benc * Copyright 2006 Johannes Berg + * Copyright (C) 2020-2021 Intel Corporation */ #include @@ -104,6 +105,7 @@ static int wiphy_suspend(struct device *dev) rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); + wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); @@ -118,6 +120,7 @@ static int wiphy_suspend(struct device *dev) ret = rdev_suspend(rdev, NULL); } } + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; @@ -132,8 +135,10 @@ static int wiphy_resume(struct device *dev) cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); + wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); + wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; diff --git a/net/wireless/util.c b/net/wireless/util.c index eab928002cd8..1bf0200f562a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -997,7 +997,7 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); @@ -1010,7 +1010,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; - ASSERT_RTNL(); + lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index fd9ad74972fb..2e35cb78221e 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -7,7 +7,7 @@ * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation */ #include @@ -253,17 +253,23 @@ int cfg80211_wext_siwrts(struct net_device *dev, u32 orts = wdev->wiphy->rts_threshold; int err; - if (rts->disabled || !rts->fixed) + wiphy_lock(&rdev->wiphy); + if (rts->disabled || !rts->fixed) { wdev->wiphy->rts_threshold = (u32) -1; - else if (rts->value < 0) - return -EINVAL; - else + } else if (rts->value < 0) { + err = -EINVAL; + goto out; + } else { wdev->wiphy->rts_threshold = rts->value; + } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); + if (err) wdev->wiphy->rts_threshold = orts; +out: + wiphy_unlock(&rdev->wiphy); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); @@ -291,11 +297,13 @@ int cfg80211_wext_siwfrag(struct net_device *dev, u32 ofrag = wdev->wiphy->frag_threshold; int err; - if (frag->disabled || !frag->fixed) + wiphy_lock(&rdev->wiphy); + if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; - else if (frag->value < 256) - return -EINVAL; - else { + } else if (frag->value < 256) { + err = -EINVAL; + goto out; + } else { /* Fragment length must be even, so strip LSB. */ wdev->wiphy->frag_threshold = frag->value & ~0x1; } @@ -303,6 +311,8 @@ int cfg80211_wext_siwfrag(struct net_device *dev, err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; +out: + wiphy_unlock(&rdev->wiphy); return err; } @@ -337,6 +347,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; + wiphy_lock(&rdev->wiphy); if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; @@ -355,6 +366,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } + wiphy_unlock(&rdev->wiphy); return err; } @@ -577,15 +589,18 @@ static int cfg80211_wext_siwencode(struct net_device *dev, !rdev->ops->set_default_key) return -EOPNOTSUPP; + wiphy_lock(&rdev->wiphy); idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; - } else if (idx < 1 || idx > 4) - return -EINVAL; - else + } else if (idx < 1 || idx > 4) { + err = -EINVAL; + goto out; + } else { idx--; + } if (erq->flags & IW_ENCODE_DISABLED) remove = true; @@ -599,22 +614,28 @@ static int cfg80211_wext_siwencode(struct net_device *dev, if (!err) wdev->wext.default_key = idx; wdev_unlock(wdev); - return err; + goto out; } memset(¶ms, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; - if (erq->length == 5) + if (erq->length == 5) { params.cipher = WLAN_CIPHER_SUITE_WEP40; - else if (erq->length == 13) + } else if (erq->length == 13) { params.cipher = WLAN_CIPHER_SUITE_WEP104; - else if (!remove) - return -EINVAL; + } else if (!remove) { + err = -EINVAL; + goto out; + } + + err = cfg80211_set_encryption(rdev, dev, false, NULL, remove, + wdev->wext.default_key == -1, + idx, ¶ms); +out: + wiphy_unlock(&rdev->wiphy); - return cfg80211_set_encryption(rdev, dev, false, NULL, remove, - wdev->wext.default_key == -1, - idx, ¶ms); + return err; } static int cfg80211_wext_siwencodeext(struct net_device *dev, @@ -754,38 +775,61 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; - int freq; + int freq, ret; + + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); + ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); + break; case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); + ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); + break; case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) - return freq; - if (freq == 0) - return -EINVAL; + if (freq < 0) { + ret = freq; + break; + } + if (freq == 0) { + ret = -EINVAL; + break; + } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) - return -EINVAL; - return cfg80211_set_monitor_channel(rdev, &chandef); + if (!chandef.chan) { + ret = -EINVAL; + break; + } + ret = cfg80211_set_monitor_channel(rdev, &chandef); + break; case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) - return freq; - if (freq == 0) - return -EINVAL; + if (freq < 0) { + ret = freq; + break; + } + if (freq == 0) { + ret = -EINVAL; + break; + } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) - return -EINVAL; - return cfg80211_set_mesh_channel(rdev, wdev, &chandef); + if (!chandef.chan) { + ret = -EINVAL; + break; + } + ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwfreq(struct net_device *dev, @@ -797,24 +841,35 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = {}; int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); + ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); + break; case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); + ret = cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); + break; case NL80211_IFTYPE_MONITOR: - if (!rdev->ops->get_channel) - return -EINVAL; + if (!rdev->ops->get_channel) { + ret = -EINVAL; + break; + } ret = rdev_get_channel(rdev, wdev, &chandef); if (ret) - return ret; + break; freq->m = chandef.chan->center_freq; freq->e = 6; - return 0; + ret = 0; + break; default: - return -EINVAL; + ret = -EINVAL; + break; } + + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_siwtxpower(struct net_device *dev, @@ -825,6 +880,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; + int ret; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; @@ -866,7 +922,11 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, return 0; } - return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + wiphy_lock(&rdev->wiphy); + ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -885,7 +945,9 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; + wiphy_lock(&rdev->wiphy); err = rdev_get_tx_power(rdev, wdev, &val); + wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1125,7 +1187,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev, timeout = wrq->value / 1000; } + wiphy_lock(&rdev->wiphy); err = rdev_set_power_mgmt(rdev, dev, ps, timeout); + wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1156,7 +1220,7 @@ static int cfg80211_wext_siwrate(struct net_device *dev, struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; - int band, ridx; + int band, ridx, ret; bool match = false; if (!rdev->ops->set_bitrate_mask) @@ -1195,7 +1259,11 @@ static int cfg80211_wext_siwrate(struct net_device *dev, if (!match) return -EINVAL; - return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + wiphy_lock(&rdev->wiphy); + ret = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwrate(struct net_device *dev, @@ -1224,7 +1292,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; + wiphy_lock(&rdev->wiphy); err = rdev_get_station(rdev, dev, addr, &sinfo); + wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1249,6 +1319,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) static struct iw_statistics wstats; static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; + int ret; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) return NULL; @@ -1267,7 +1338,11 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) memset(&sinfo, 0, sizeof(sinfo)); - if (rdev_get_station(rdev, dev, bssid, &sinfo)) + wiphy_lock(&rdev->wiphy); + ret = rdev_get_station(rdev, dev, bssid, &sinfo); + wiphy_unlock(&rdev->wiphy); + + if (ret) return NULL; memset(&wstats, 0, sizeof(wstats)); @@ -1318,15 +1393,24 @@ static int cfg80211_wext_siwap(struct net_device *dev, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); + ret = cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); + ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwap(struct net_device *dev, @@ -1334,15 +1418,24 @@ static int cfg80211_wext_giwap(struct net_device *dev, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); + ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); + ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_siwessid(struct net_device *dev, @@ -1350,15 +1443,24 @@ static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_siwessid(dev, info, data, ssid); + ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_siwessid(dev, info, data, ssid); + ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_giwessid(struct net_device *dev, @@ -1366,18 +1468,27 @@ static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ret; data->flags = 0; data->length = 0; + wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); + ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid); + break; case NL80211_IFTYPE_STATION: - return cfg80211_mgd_wext_giwessid(dev, info, data, ssid); + ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } static int cfg80211_wext_siwpmksa(struct net_device *dev, @@ -1388,6 +1499,7 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; + int ret; memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); @@ -1397,28 +1509,39 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, cfg_pmksa.bssid = pmksa->bssid.sa_data; cfg_pmksa.pmkid = pmksa->pmkid; + wiphy_lock(&rdev->wiphy); switch (pmksa->cmd) { case IW_PMKSA_ADD: - if (!rdev->ops->set_pmksa) - return -EOPNOTSUPP; - - return rdev_set_pmksa(rdev, dev, &cfg_pmksa); + if (!rdev->ops->set_pmksa) { + ret = -EOPNOTSUPP; + break; + } + ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa); + break; case IW_PMKSA_REMOVE: - if (!rdev->ops->del_pmksa) - return -EOPNOTSUPP; - - return rdev_del_pmksa(rdev, dev, &cfg_pmksa); + if (!rdev->ops->del_pmksa) { + ret = -EOPNOTSUPP; + break; + } + ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa); + break; case IW_PMKSA_FLUSH: - if (!rdev->ops->flush_pmksa) - return -EOPNOTSUPP; - - return rdev_flush_pmksa(rdev, dev); + if (!rdev->ops->flush_pmksa) { + ret = -EOPNOTSUPP; + break; + } + ret = rdev_flush_pmksa(rdev, dev); + break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + break; } + wiphy_unlock(&rdev->wiphy); + + return ret; } #define DEFINE_WEXT_COMPAT_STUB(func, type) \ diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 73df23570d43..193a18a53142 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -3,7 +3,7 @@ * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg - * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020-2021 Intel Corporation. */ #include @@ -379,6 +379,7 @@ int cfg80211_wext_siwmlme(struct net_device *dev, if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; + wiphy_lock(&rdev->wiphy); wdev_lock(wdev); switch (mlme->cmd) { case IW_MLME_DEAUTH: @@ -390,6 +391,7 @@ int cfg80211_wext_siwmlme(struct net_device *dev, break; } wdev_unlock(wdev); + wiphy_unlock(&rdev->wiphy); return err; } -- cgit v1.2.3 From 6b2e04bc240fe9be9e690059f710e9f95346d34d Mon Sep 17 00:00:00 2001 From: Praveen Chaudhary Date: Mon, 25 Jan 2021 13:44:30 -0800 Subject: net: allow user to set metric on default route learned via Router Advertisement For IPv4, default route is learned via DHCPv4 and user is allowed to change metric using config etc/network/interfaces. But for IPv6, default route can be learned via RA, for which, currently a fixed metric value 1024 is used. Ideally, user should be able to configure metric on default route for IPv6 similar to IPv4. This patch adds sysctl for the same. Logs: For IPv4: Config in etc/network/interfaces: auto eth0 iface eth0 inet dhcp metric 4261413864 IPv4 Kernel Route Table: $ ip route list default via 172.21.47.1 dev eth0 metric 4261413864 FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over DHCPv4 default route.] Codes: K - kernel route, C - connected, S - static, R - RIP, O - OSPF, I - IS-IS, B - BGP, P - PIM, E - EIGRP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* 0.0.0.0/0 [20/0] is directly connected, eth0, 00:00:03 K 0.0.0.0/0 [254/1000] via 172.21.47.1, eth0, 6d08h51m i.e. User can prefer Default Router learned via Routing Protocol in IPv4. Similar behavior is not possible for IPv6, without this fix. After fix [for IPv6]: sudo sysctl -w net.ipv6.conf.eth0.net.ipv6.conf.eth0.ra_defrtr_metric=1996489705 IP monitor: [When IPv6 RA is received] default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 pref high Kernel IPv6 routing table $ ip -6 route list default via fe80::be16:65ff:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 21sec hoplimit 64 pref high FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over IPv6 RA default route.] Codes: K - kernel route, C - connected, S - static, R - RIPng, O - OSPFv3, I - IS-IS, B - BGP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* ::/0 [20/0] is directly connected, eth0, 00:00:06 K ::/0 [119/1001] via fe80::xx16:xxxx:feb3:ce8e, eth0, 6d07h43m If the metric is changed later, the effect will be seen only when next IPv6 RA is received, because the default route must be fully controlled by RA msg. Below metric is changed from 1996489705 to 1996489704. $ sudo sysctl -w net.ipv6.conf.eth0.ra_defrtr_metric=1996489704 net.ipv6.conf.eth0.ra_defrtr_metric = 1996489704 IP monitor: [On next IPv6 RA msg, Kernel deletes prev route and installs new route with updated metric] Deleted default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 3sec hoplimit 64 pref high default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489704 pref high Signed-off-by: Praveen Chaudhary Signed-off-by: Zhenggen Xu Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210125214430.24079-1-pchaudhary@linkedin.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 10 ++++++++++ include/linux/ipv6.h | 1 + include/net/ip6_route.h | 3 ++- include/uapi/linux/ipv6.h | 1 + include/uapi/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 11 +++++++++++ net/ipv6/ndisc.c | 12 ++++++++---- net/ipv6/route.c | 5 +++-- 8 files changed, 37 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index dd2b12a32b73..0e51ddd9a2f1 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1871,6 +1871,16 @@ accept_ra_defrtr - BOOLEAN - enabled if accept_ra is enabled. - disabled if accept_ra is disabled. +ra_defrtr_metric - UNSIGNED INTEGER + Route metric for default route learned in Router Advertisement. This value + will be assigned as metric for the default route learned via IPv6 Router + Advertisement. Takes affect only if accept_ra_defrtr is enabled. + + Possible values: + 1 to 0xFFFFFFFF + + Default: IP6_RT_PRIO_USER i.e. 1024. + accept_ra_from_local - BOOLEAN Accept RA with source-address that is found on local machine if the RA is otherwise proper and able to be accepted. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index dda61d150a13..9d1f29f0c512 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -31,6 +31,7 @@ struct ipv6_devconf { __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; + __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 2a5277758379..f51a118bfce8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -174,7 +174,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct net_device *dev); struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, - struct net_device *dev, unsigned int pref); + struct net_device *dev, unsigned int pref, + u32 defrtr_usr_metric); void rt6_purge_dflt_routers(struct net *net); diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 13e8751bf24a..70603775fe91 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -189,6 +189,7 @@ enum { DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_NDISC_TCLASS, DEVCONF_RPL_SEG_ENABLED, + DEVCONF_RA_DEFRTR_METRIC, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 458179df9b27..1e05d3caa712 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -571,6 +571,7 @@ enum { NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, + NET_IPV6_RA_DEFRTR_METRIC=28, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9edc5bb2d531..f2337fb756ac 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -205,6 +205,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, @@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, @@ -5476,6 +5478,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_RA_DEFRTR_METRIC] = cnf->ra_defrtr_metric; array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF @@ -6668,6 +6671,14 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ra_defrtr_metric", + .data = &ipv6_devconf.ra_defrtr_metric, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)SYSCTL_ONE, + }, { .procname = "accept_ra_min_hop_limit", .data = &ipv6_devconf.accept_ra_min_hop_limit, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 76717478f173..c467c6419893 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1173,6 +1173,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; struct fib6_info *rt = NULL; + u32 defrtr_usr_metric; struct net *net; int lifetime; struct ndisc_options ndopts; @@ -1303,18 +1304,21 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } } - if (rt && lifetime == 0) { + /* Set default route metric as specified by user */ + defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric; + /* delete the route if lifetime is 0 or if metric needs change */ + if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) { ip6_del_rt(net, rt, false); rt = NULL; } - ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", - rt, lifetime, skb->dev->name); + ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", + rt, lifetime, defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, - skb->dev, pref); + skb->dev, pref, defrtr_usr_metric); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 188e114b29b4..41d8f801b75f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4252,11 +4252,12 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, - unsigned int pref) + unsigned int pref, + u32 defrtr_usr_metric) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, - .fc_metric = IP6_RT_PRIO_USER, + .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), -- cgit v1.2.3 From 6fe27d68b45666381691b6a841a2adbda8c8d153 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 20 Jan 2021 02:03:55 +0900 Subject: can: dev: export can_get_state_str() function The can_get_state_str() function is also relevant to the drivers. Export the symbol and make it visible in the can/dev.h header. Link: https://lore.kernel.org/r/20210119170355.12040-1-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 3 ++- include/linux/can/dev.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 01e4a194f187..d9281ae853f8 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -74,7 +74,7 @@ static int can_rx_state_to_frame(struct net_device *dev, enum can_state state) } } -static const char *can_get_state_str(const enum can_state state) +const char *can_get_state_str(const enum can_state state) { switch (state) { case CAN_STATE_ERROR_ACTIVE: @@ -95,6 +95,7 @@ static const char *can_get_state_str(const enum can_state state) return ""; } +EXPORT_SYMBOL_GPL(can_get_state_str); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state) diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 7faf6a37d5b2..ac4d83a1ab81 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -123,6 +123,7 @@ void unregister_candev(struct net_device *dev); int can_restart_now(struct net_device *dev); void can_bus_off(struct net_device *dev); +const char *can_get_state_str(const enum can_state state); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state); -- cgit v1.2.3 From 87baa23e0236bc1c41ce744f524bf12dbe7fde66 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 11 Jan 2021 19:07:40 +0100 Subject: bus: mhi: core: Add helper API to return number of free TREs Introduce mhi_get_free_desc_count() API to return number of TREs available to queue buffer. MHI clients can use this API to know before hand if ring is full without calling queue API. Signed-off-by: Hemant Kumar Reviewed-by: Jeffrey Hugo Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/1610388462-16322-1-git-send-email-loic.poulain@linaro.org Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/core/main.c | 12 ++++++++++++ include/linux/mhi.h | 9 +++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index d34d7e90e38d..1202433ecf98 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -260,6 +260,18 @@ int mhi_destroy_device(struct device *dev, void *data) return 0; } +int mhi_get_free_desc_count(struct mhi_device *mhi_dev, + enum dma_data_direction dir) +{ + struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl; + struct mhi_chan *mhi_chan = (dir == DMA_TO_DEVICE) ? + mhi_dev->ul_chan : mhi_dev->dl_chan; + struct mhi_ring *tre_ring = &mhi_chan->tre_ring; + + return get_nr_avail_ring_elements(mhi_cntrl, tre_ring); +} +EXPORT_SYMBOL_GPL(mhi_get_free_desc_count); + void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason) { struct mhi_driver *mhi_drv; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 562862ff819c..ece53a252217 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -598,6 +598,15 @@ void mhi_set_mhi_state(struct mhi_controller *mhi_cntrl, */ void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason); +/** + * mhi_get_free_desc_count - Get transfer ring length + * Get # of TD available to queue buffers + * @mhi_dev: Device associated with the channels + * @dir: Direction of the channel + */ +int mhi_get_free_desc_count(struct mhi_device *mhi_dev, + enum dma_data_direction dir); + /** * mhi_prepare_for_power_up - Do pre-initialization before power up. * This is optional, call this before power up if -- cgit v1.2.3 From 4f16d25c68ec844299a4df6ecbb0234eaf88a935 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Jan 2021 17:28:18 +0100 Subject: netfilter: nftables: add nft_parse_register_load() and use it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This new function combines the netlink register attribute parser and the load validation function. This update requires to replace: enum nft_registers sreg:8; in many of the expression private areas otherwise compiler complains with: error: cannot take address of bit-field ‘sreg’ when passing the register field as reference. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_core.h | 6 +++--- include/net/netfilter/nft_meta.h | 2 +- net/ipv4/netfilter/nft_dup_ipv4.c | 18 ++++++++--------- net/ipv6/netfilter/nft_dup_ipv6.c | 18 ++++++++--------- net/netfilter/nf_tables_api.c | 18 +++++++++++++++-- net/netfilter/nft_bitwise.c | 10 +++++----- net/netfilter/nft_byteorder.c | 6 +++--- net/netfilter/nft_cmp.c | 8 +++----- net/netfilter/nft_ct.c | 5 ++--- net/netfilter/nft_dup_netdev.c | 6 +++--- net/netfilter/nft_dynset.c | 12 ++++++------ net/netfilter/nft_exthdr.c | 6 +++--- net/netfilter/nft_fwd_netdev.c | 18 ++++++++--------- net/netfilter/nft_hash.c | 10 ++++++---- net/netfilter/nft_lookup.c | 6 +++--- net/netfilter/nft_masq.c | 18 +++++++---------- net/netfilter/nft_meta.c | 3 +-- net/netfilter/nft_nat.c | 35 ++++++++++++++-------------------- net/netfilter/nft_objref.c | 6 +++--- net/netfilter/nft_payload.c | 4 ++-- net/netfilter/nft_queue.c | 12 ++++++------ net/netfilter/nft_range.c | 6 +++--- net/netfilter/nft_redir.c | 18 +++++++---------- net/netfilter/nft_tproxy.c | 14 +++++++------- 25 files changed, 132 insertions(+), 135 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f4af8362d234..033b31aa38cc 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -203,7 +203,7 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); unsigned int nft_parse_register(const struct nlattr *attr); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); -int nft_validate_register_load(enum nft_registers reg, unsigned int len); +int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 8657e6815b07..b7aff03a3f0f 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -26,14 +26,14 @@ void nf_tables_core_module_exit(void); struct nft_bitwise_fast_expr { u32 mask; u32 xor; - enum nft_registers sreg:8; + u8 sreg; enum nft_registers dreg:8; }; struct nft_cmp_fast_expr { u32 data; u32 mask; - enum nft_registers sreg:8; + u8 sreg; u8 len; bool inv; }; @@ -67,7 +67,7 @@ struct nft_payload_set { enum nft_payload_bases base:8; u8 offset; u8 len; - enum nft_registers sreg:8; + u8 sreg; u8 csum_type; u8 csum_offset; u8 csum_flags; diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 07e2fd507963..946fa8c83798 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -8,7 +8,7 @@ struct nft_meta { enum nft_meta_keys key:8; union { enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 sreg; }; }; diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bcdb37f86a94..aeb631760eb9 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -13,8 +13,8 @@ #include struct nft_dup_ipv4 { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv4_eval(const struct nft_expr *expr, @@ -40,16 +40,16 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 8b5193efb1f1..3a00d95e964e 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -13,8 +13,8 @@ #include struct nft_dup_ipv6 { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; + u8 sreg_addr; + u8 sreg_dev; }; static void nft_dup_ipv6_eval(const struct nft_expr *expr, @@ -38,16 +38,16 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr)); + err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + sizeof(struct in6_addr)); if (err < 0) return err; - if (tb[NFTA_DUP_SREG_DEV] != NULL) { - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); - } - return 0; + if (tb[NFTA_DUP_SREG_DEV]) + err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + &priv->sreg_dev, sizeof(int)); + + return err; } static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 15c467f1a9dd..1e82ebba230d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8634,7 +8634,7 @@ EXPORT_SYMBOL_GPL(nft_dump_register); * Validate that the input register is one of the general purpose * registers and that the length of the load is within the bounds. */ -int nft_validate_register_load(enum nft_registers reg, unsigned int len) +static int nft_validate_register_load(enum nft_registers reg, unsigned int len) { if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; @@ -8645,7 +8645,21 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len) return 0; } -EXPORT_SYMBOL_GPL(nft_validate_register_load); + +int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) +{ + u32 reg; + int err; + + reg = nft_parse_register(attr); + err = nft_validate_register_load(reg, len); + if (err < 0) + return err; + + *sreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_load); /** * nft_validate_register_store - validate an expressions' register store diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index bbd773d74377..2157970b3cd3 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -16,7 +16,7 @@ #include struct nft_bitwise { - enum nft_registers sreg:8; + u8 sreg; enum nft_registers dreg:8; enum nft_bitwise_ops op:8; u8 len; @@ -169,8 +169,8 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + priv->len); if (err < 0) return err; @@ -315,8 +315,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); int err; - priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); - err = nft_validate_register_load(priv->sreg, sizeof(u32)); + err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 12bed3f7bbc6..0960563cd5a1 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -16,7 +16,7 @@ #include struct nft_byteorder { - enum nft_registers sreg:8; + u8 sreg; enum nft_registers dreg:8; enum nft_byteorder_ops op:8; u8 len; @@ -131,14 +131,14 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; - err = nft_validate_register_load(priv->sreg, priv->len); + err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, + priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 00e563a72d3d..3640eea8a87b 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -18,7 +18,7 @@ struct nft_cmp_expr { struct nft_data data; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_cmp_ops op:8; }; @@ -87,8 +87,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return err; } - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -174,8 +173,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); - err = nft_validate_register_load(priv->sreg, desc.len); + err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 8bcd49f14797..c1d7a3c615d8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -28,7 +28,7 @@ struct nft_ct { enum ip_conntrack_dir dir:8; union { enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 sreg; }; }; @@ -600,8 +600,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, } } - priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); - err = nft_validate_register_load(priv->sreg, len); + err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 40788b3f1071..bbf3fcba3df4 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -14,7 +14,7 @@ #include struct nft_dup_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_dup_netdev_eval(const struct nft_expr *expr, @@ -40,8 +40,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 0b053f75cd60..b20c4c7ea57b 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -16,8 +16,8 @@ struct nft_dynset { struct nft_set *set; struct nft_set_ext_tmpl tmpl; enum nft_dynset_ops op:8; - enum nft_registers sreg_key:8; - enum nft_registers sreg_data:8; + u8 sreg_key; + u8 sreg_data; bool invert; bool expr; u8 num_exprs; @@ -219,8 +219,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return err; } - priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); - err = nft_validate_register_load(priv->sreg_key, set->klen); + err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, + set->klen); if (err < 0) return err; @@ -230,8 +230,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; - priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]); - err = nft_validate_register_load(priv->sreg_data, set->dlen); + err = nft_parse_register_load(tb[NFTA_DYNSET_SREG_DATA], + &priv->sreg_data, set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 3c48cdc8935d..9c2b3bde1ac9 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -20,7 +20,7 @@ struct nft_exthdr { u8 len; u8 op; enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 sreg; u8 flags; }; @@ -400,11 +400,11 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg, + priv->len); } static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index b77985986b24..cd59afde5b2f 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -18,7 +18,7 @@ #include struct nft_fwd_netdev { - enum nft_registers sreg_dev:8; + u8 sreg_dev; }; static void nft_fwd_netdev_eval(const struct nft_expr *expr, @@ -50,8 +50,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); } static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -78,8 +78,8 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, } struct nft_fwd_neigh { - enum nft_registers sreg_dev:8; - enum nft_registers sreg_addr:8; + u8 sreg_dev; + u8 sreg_addr; u8 nfproto; }; @@ -157,8 +157,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, !tb[NFTA_FWD_NFPROTO]) return -EINVAL; - priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); - priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]); priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); switch (priv->nfproto) { @@ -172,11 +170,13 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nft_validate_register_load(priv->sreg_dev, sizeof(int)); + err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, + sizeof(int)); if (err < 0) return err; - return nft_validate_register_load(priv->sreg_addr, addr_len); + return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, + addr_len); } static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 96371d878e7e..7ee6c6da50ae 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -14,7 +14,7 @@ #include struct nft_jhash { - enum nft_registers sreg:8; + u8 sreg; enum nft_registers dreg:8; u8 len; bool autogen_seed:1; @@ -83,7 +83,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); @@ -94,6 +93,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; + err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len); + if (err < 0) + return err; + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -108,8 +111,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx, get_random_bytes(&priv->seed, sizeof(priv->seed)); } - return nft_validate_register_load(priv->sreg, len) && - nft_validate_register_store(ctx, priv->dreg, NULL, + return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index f1363b8aabba..9e87b6d39f51 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -17,7 +17,7 @@ struct nft_lookup { struct nft_set *set; - enum nft_registers sreg:8; + u8 sreg; enum nft_registers dreg:8; bool invert; struct nft_set_binding binding; @@ -76,8 +76,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg, + set->klen); if (err < 0) return err; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 71390b727040..9953e8053753 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -15,8 +15,8 @@ struct nft_masq { u32 flags; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { @@ -54,19 +54,15 @@ static int nft_masq_init(const struct nft_ctx *ctx, } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_MASQ_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index bf4b3ad5314c..65e231ec1884 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -661,8 +661,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); - err = nft_validate_register_load(priv->sreg, len); + err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 4bcf33b049c4..0840c635b752 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -21,10 +21,10 @@ #include struct nft_nat { - enum nft_registers sreg_addr_min:8; - enum nft_registers sreg_addr_max:8; - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_addr_min; + u8 sreg_addr_max; + u8 sreg_proto_min; + u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; @@ -206,18 +206,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { - priv->sreg_addr_min = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]); - err = nft_validate_register_load(priv->sreg_addr_min, alen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], + &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { - priv->sreg_addr_max = - nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]); - - err = nft_validate_register_load(priv->sreg_addr_max, - alen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], + &priv->sreg_addr_max, + alen); if (err < 0) return err; } else { @@ -229,19 +226,15 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 5f9207a9f485..bc104d36d3bb 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -95,7 +95,7 @@ static const struct nft_expr_ops nft_objref_ops = { struct nft_objref_map { struct nft_set *set; - enum nft_registers sreg:8; + u8 sreg; struct nft_set_binding binding; }; @@ -137,8 +137,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; - priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]); - err = nft_validate_register_load(priv->sreg, set->klen); + err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg, + set->klen); if (err < 0) return err; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 47d4e0e21651..d4e88db4116d 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -658,7 +658,6 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]); if (tb[NFTA_PAYLOAD_CSUM_TYPE]) priv->csum_type = @@ -691,7 +690,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - return nft_validate_register_load(priv->sreg, priv->len); + return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, + priv->len); } static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 23265d757acb..9ba1de51ac07 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -19,10 +19,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - enum nft_registers sreg_qnum:8; - u16 queuenum; - u16 queues_total; - u16 flags; + u8 sreg_qnum; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -111,8 +111,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, struct nft_queue *priv = nft_expr_priv(expr); int err; - priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); - err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM], + &priv->sreg_qnum, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 89efcc5a533d..e4a1c44d7f51 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -15,7 +15,7 @@ struct nft_range_expr { struct nft_data data_from; struct nft_data data_to; - enum nft_registers sreg:8; + u8 sreg; u8 len; enum nft_range_ops op:8; }; @@ -86,8 +86,8 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr goto err2; } - priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); - err = nft_validate_register_load(priv->sreg, desc_from.len); + err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg, + desc_from.len); if (err < 0) goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 2056051c0af0..ba09890dddb5 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -14,8 +14,8 @@ #include struct nft_redir { - enum nft_registers sreg_proto_min:8; - enum nft_registers sreg_proto_max:8; + u8 sreg_proto_min; + u8 sreg_proto_max; u16 flags; }; @@ -50,19 +50,15 @@ static int nft_redir_init(const struct nft_ctx *ctx, plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { - priv->sreg_proto_min = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); - - err = nft_validate_register_load(priv->sreg_proto_min, plen); + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], + &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { - priv->sreg_proto_max = - nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]); - - err = nft_validate_register_load(priv->sreg_proto_max, - plen); + err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX], + &priv->sreg_proto_max, + plen); if (err < 0) return err; } else { diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index d67f83a0958d..43a5a780a6d3 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -13,9 +13,9 @@ #endif struct nft_tproxy { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_port:8; - u8 family; + u8 sreg_addr; + u8 sreg_port; + u8 family; }; static void nft_tproxy_eval_v4(const struct nft_expr *expr, @@ -247,15 +247,15 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, } if (tb[NFTA_TPROXY_REG_ADDR]) { - priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]); - err = nft_validate_register_load(priv->sreg_addr, alen); + err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR], + &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { - priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]); - err = nft_validate_register_load(priv->sreg_port, sizeof(u16)); + err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT], + &priv->sreg_port, sizeof(u16)); if (err < 0) return err; } -- cgit v1.2.3 From 345023b0db315648ccc3c1a36aee88304a8b4d91 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Jan 2021 18:27:22 +0100 Subject: netfilter: nftables: add nft_parse_register_store() and use it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This new function combines the netlink register attribute parser and the store validation function. This update requires to replace: enum nft_registers dreg:8; in many of the expression private areas otherwise compiler complains with: error: cannot take address of bit-field ‘dreg’ when passing the register field as reference. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++---- include/net/netfilter/nf_tables_core.h | 6 +++--- include/net/netfilter/nft_fib.h | 2 +- include/net/netfilter/nft_meta.h | 2 +- net/bridge/netfilter/nft_meta_bridge.c | 5 ++--- net/netfilter/nf_tables_api.c | 34 +++++++++++++++++++++++++++++----- net/netfilter/nft_bitwise.c | 13 ++++++------- net/netfilter/nft_byteorder.c | 8 ++++---- net/netfilter/nft_ct.c | 7 +++---- net/netfilter/nft_exthdr.c | 8 ++++---- net/netfilter/nft_fib.c | 5 ++--- net/netfilter/nft_hash.c | 17 +++++++---------- net/netfilter/nft_immediate.c | 6 +++--- net/netfilter/nft_lookup.c | 8 ++++---- net/netfilter/nft_meta.c | 5 ++--- net/netfilter/nft_numgen.c | 15 ++++++--------- net/netfilter/nft_osf.c | 8 ++++---- net/netfilter/nft_payload.c | 6 +++--- net/netfilter/nft_rt.c | 7 +++---- net/netfilter/nft_socket.c | 7 +++---- net/netfilter/nft_tunnel.c | 8 +++----- net/netfilter/nft_xfrm.c | 7 +++---- 22 files changed, 100 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 033b31aa38cc..78d174517749 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -204,10 +204,10 @@ unsigned int nft_parse_register(const struct nlattr *attr); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); -int nft_validate_register_store(const struct nft_ctx *ctx, - enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type, unsigned int len); +int nft_parse_register_store(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *dreg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index b7aff03a3f0f..fd10a7862fdc 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -27,7 +27,7 @@ struct nft_bitwise_fast_expr { u32 mask; u32 xor; u8 sreg; - enum nft_registers dreg:8; + u8 dreg; }; struct nft_cmp_fast_expr { @@ -40,7 +40,7 @@ struct nft_cmp_fast_expr { struct nft_immediate_expr { struct nft_data data; - enum nft_registers dreg:8; + u8 dreg; u8 dlen; }; @@ -60,7 +60,7 @@ struct nft_payload { enum nft_payload_bases base:8; u8 offset; u8 len; - enum nft_registers dreg:8; + u8 dreg; }; struct nft_payload_set { diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 628b6fa579cd..237f3757637e 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -5,7 +5,7 @@ #include struct nft_fib { - enum nft_registers dreg:8; + u8 dreg; u8 result; u32 flags; }; diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 946fa8c83798..2dce55c736f4 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -7,7 +7,7 @@ struct nft_meta { enum nft_meta_keys key:8; union { - enum nft_registers dreg:8; + u8 dreg; u8 sreg; }; }; diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 8e8ffac037cd..97805ec424c1 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -87,9 +87,8 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, return nft_meta_get_init(ctx, expr, tb); } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static struct nft_expr_type nft_meta_bridge_type; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1e82ebba230d..c23163ffb5a1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4438,6 +4438,12 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return nft_delset(&ctx, set); } +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len); + static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -8675,10 +8681,11 @@ EXPORT_SYMBOL_GPL(nft_parse_register_load); * A value of NULL for the data means that its runtime gathered * data. */ -int nft_validate_register_store(const struct nft_ctx *ctx, - enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type, unsigned int len) +static int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, + unsigned int len) { int err; @@ -8710,7 +8717,24 @@ int nft_validate_register_store(const struct nft_ctx *ctx, return 0; } } -EXPORT_SYMBOL_GPL(nft_validate_register_store); + +int nft_parse_register_store(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *dreg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len) +{ + int err; + u32 reg; + + reg = nft_parse_register(attr); + err = nft_validate_register_store(ctx, reg, data, type, len); + if (err < 0) + return err; + + *dreg = reg; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_register_store); static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 2157970b3cd3..47b0dba95054 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -17,7 +17,7 @@ struct nft_bitwise { u8 sreg; - enum nft_registers dreg:8; + u8 dreg; enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; @@ -174,9 +174,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); if (err < 0) return err; @@ -320,9 +320,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 0960563cd5a1..9d5947ab8d4e 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -17,7 +17,7 @@ struct nft_byteorder { u8 sreg; - enum nft_registers dreg:8; + u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; @@ -142,9 +142,9 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index c1d7a3c615d8..882fe8648653 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -27,7 +27,7 @@ struct nft_ct { enum nft_ct_keys key:8; enum ip_conntrack_dir dir:8; union { - enum nft_registers dreg:8; + u8 dreg; u8 sreg; }; }; @@ -498,9 +498,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, } } - priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, + NFT_DATA_VALUE, len); if (err < 0) return err; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 9c2b3bde1ac9..f64f0017e9a5 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -19,7 +19,7 @@ struct nft_exthdr { u8 offset; u8 len; u8 op; - enum nft_registers dreg:8; + u8 dreg; u8 sreg; u8 flags; }; @@ -350,12 +350,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; - priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); priv->flags = flags; priv->op = op; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 4dfdaeaf09a5..b10ce732b337 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -86,7 +86,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); - priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]); switch (priv->result) { case NFT_FIB_RESULT_OIF: @@ -106,8 +105,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + err = nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); if (err < 0) return err; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 7ee6c6da50ae..f829f5289e16 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -15,7 +15,7 @@ struct nft_jhash { u8 sreg; - enum nft_registers dreg:8; + u8 dreg; u8 len; bool autogen_seed:1; u32 modulus; @@ -38,7 +38,7 @@ static void nft_jhash_eval(const struct nft_expr *expr, } struct nft_symhash { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -83,8 +83,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); if (err < 0) return err; @@ -111,8 +109,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx, get_random_bytes(&priv->seed, sizeof(priv->seed)); } - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_symhash_init(const struct nft_ctx *ctx, @@ -128,8 +126,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; @@ -137,8 +133,9 @@ static int nft_symhash_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + sizeof(u32)); } static int nft_jhash_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c63eb3b17178..90c64d27ae53 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -48,9 +48,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, priv->dlen = desc.len; - priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, &priv->data, - desc.type, desc.len); + err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], + &priv->dreg, &priv->data, desc.type, + desc.len); if (err < 0) goto err1; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 9e87b6d39f51..b0f558b4fea5 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -18,7 +18,7 @@ struct nft_lookup { struct nft_set *set; u8 sreg; - enum nft_registers dreg:8; + u8 dreg; bool invert; struct nft_set_binding binding; }; @@ -100,9 +100,9 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_MAP)) return -EINVAL; - priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - set->dtype, set->dlen); + err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], + &priv->dreg, NULL, set->dtype, + set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 65e231ec1884..a7e01e9952f1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -535,9 +535,8 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } EXPORT_SYMBOL_GPL(nft_meta_get_init); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index f1fc824f9737..722cac1e90e0 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -16,7 +16,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); struct nft_ng_inc { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; atomic_t counter; u32 offset; @@ -66,11 +66,10 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); atomic_set(&priv->counter, priv->modulus - 1); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, @@ -100,7 +99,7 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) } struct nft_ng_random { - enum nft_registers dreg:8; + u8 dreg; u32 modulus; u32 offset; }; @@ -140,10 +139,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, prandom_init_once(&nft_numgen_prandom_state); - priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); - - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, sizeof(u32)); + return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index c261d57a666a..ac61f708b82d 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -6,7 +6,7 @@ #include struct nft_osf { - enum nft_registers dreg:8; + u8 dreg; u8 ttl; u32 flags; }; @@ -78,9 +78,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->flags = flags; } - priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); - err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); + err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, + NFT_OSF_MAXGENRELEN); if (err < 0) return err; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index d4e88db4116d..cb1c8c231880 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -144,10 +144,10 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, priv->len); + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); } static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 7cfcb0e2f7ee..bcd01a63e38f 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -15,7 +15,7 @@ struct nft_rt { enum nft_rt_keys key:8; - enum nft_registers dreg:8; + u8 dreg; }; static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) @@ -141,9 +141,8 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_rt_get_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index a28aca5124ce..c9b8a2b03b71 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -10,7 +10,7 @@ struct nft_socket { enum nft_socket_keys key:8; union { - enum nft_registers dreg:8; + u8 dreg; }; }; @@ -133,9 +133,8 @@ static int nft_socket_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_socket_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index d3eb953d0333..3b27926d5382 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -15,7 +15,7 @@ struct nft_tunnel { enum nft_tunnel_keys key:8; - enum nft_registers dreg:8; + u8 dreg; enum nft_tunnel_mode mode:8; }; @@ -93,8 +93,6 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]); - if (tb[NFTA_TUNNEL_MODE]) { priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE])); if (priv->mode > NFT_TUNNEL_MODE_MAX) @@ -103,8 +101,8 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, priv->mode = NFT_TUNNEL_MODE_NONE; } - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } static int nft_tunnel_get_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 06d5cabf1d7c..cbbbc4ecad3a 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -24,7 +24,7 @@ static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { struct nft_xfrm { enum nft_xfrm_keys key:8; - enum nft_registers dreg:8; + u8 dreg; u8 dir; u8 spnum; }; @@ -86,9 +86,8 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx, priv->spnum = spnum; - priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); + return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current -- cgit v1.2.3 From 08a01c11a5bb3de9b0a9c9b2685867e50eda9910 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Jan 2021 23:19:17 +0100 Subject: netfilter: nftables: statify nft_parse_register() This function is not used anymore by any extension, statify it. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 78d174517749..99d4571fef46 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -200,7 +200,6 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); -unsigned int nft_parse_register(const struct nlattr *attr); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c23163ffb5a1..1d76d07592bf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8595,7 +8595,7 @@ EXPORT_SYMBOL_GPL(nft_parse_u32_check); * Registers used to be 128 bit wide, these register numbers will be * mapped to the corresponding 32 bit register numbers. */ -unsigned int nft_parse_register(const struct nlattr *attr) +static unsigned int nft_parse_register(const struct nlattr *attr) { unsigned int reg; @@ -8607,7 +8607,6 @@ unsigned int nft_parse_register(const struct nlattr *attr) return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; } } -EXPORT_SYMBOL_GPL(nft_parse_register); /** * nft_dump_register - dump a register value to a netlink attribute -- cgit v1.2.3 From 8063e184e49011f6f3f34f6c358dc8a83890bb5b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 27 Jan 2021 14:15:01 -0800 Subject: skmsg: Make sk_psock_destroy() static sk_psock_destroy() is a RCU callback, I can't see any reason why it could be used outside. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Cc: John Fastabend Cc: Jakub Sitnicki Cc: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210127221501.46866-1-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 1 - net/core/skmsg.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index fec0c5ac1c4f..8edbbf5f2f93 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -390,7 +390,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk) } void sk_psock_stop(struct sock *sk, struct sk_psock *psock); -void sk_psock_destroy(struct rcu_head *rcu); void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 25cdbb20f3a0..1261512d6807 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -669,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -void sk_psock_destroy(struct rcu_head *rcu) +static void sk_psock_destroy(struct rcu_head *rcu) { struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); INIT_WORK(&psock->gc, sk_psock_destroy_deferred); schedule_work(&psock->gc); } -EXPORT_SYMBOL_GPL(sk_psock_destroy); void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { -- cgit v1.2.3 From 2dba407f994e5b0eb3b70a8cb280e014ec4a7ff3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 26 Jan 2021 11:35:33 +0200 Subject: net: bridge: multicast: make tracked EHT hosts limit configurable Add two new port attributes which make EHT hosts limit configurable and export the current number of tracked EHT hosts: - IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT: configure/retrieve current limit - IFLA_BRPORT_MCAST_EHT_HOSTS_CNT: current number of tracked hosts Setting IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT to 0 is currently not allowed. Note that we have to increase RTNL_SLAVE_MAX_TYPE to 38 minimum, I've increased it to 40 to have space for two more future entries. v2: move br_multicast_eht_set_hosts_limit() to br_multicast_eht.c, no functional change Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 2 ++ net/bridge/br_multicast_eht.c | 15 +++++++++++++++ net/bridge/br_netlink.c | 19 ++++++++++++++++++- net/bridge/br_private_mcast_eht.h | 2 ++ net/bridge/br_sysfs_if.c | 26 ++++++++++++++++++++++++++ net/core/rtnetlink.c | 2 +- 6 files changed, 64 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2bd0d8bbcdb2..eb8018c3a737 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -525,6 +525,8 @@ enum { IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index 445768c8495f..fea38b9a7268 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -861,3 +861,18 @@ bool br_multicast_eht_handle(struct net_bridge_port_group *pg, out: return changed; } + +int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p, + u32 eht_hosts_limit) +{ + struct net_bridge *br = p->br; + + if (!eht_hosts_limit) + return -EINVAL; + + spin_lock_bh(&br->multicast_lock); + p->multicast_eht_hosts_limit = eht_hosts_limit; + spin_unlock_bh(&br->multicast_lock); + + return 0; +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 762f273802cd..bd3962da345a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -18,6 +18,7 @@ #include "br_private_stp.h" #include "br_private_cfm.h" #include "br_private_tunnel.h" +#include "br_private_mcast_eht.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) @@ -199,6 +200,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + 0; } @@ -283,7 +286,11 @@ static int br_port_fill_attrs(struct sk_buff *skb, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, - p->multicast_router)) + p->multicast_router) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + p->multicast_eht_hosts_limit) || + nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + p->multicast_eht_hosts_cnt)) return -EMSGSIZE; #endif @@ -820,6 +827,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, + [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -955,6 +963,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; } + + if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) { + u32 hlimit; + + hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]); + err = br_multicast_eht_set_hosts_limit(p, hlimit); + if (err) + return err; + } #endif if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h index b2c8d988721f..f89049f4892c 100644 --- a/net/bridge/br_private_mcast_eht.h +++ b/net/bridge/br_private_mcast_eht.h @@ -57,6 +57,8 @@ bool br_multicast_eht_handle(struct net_bridge_port_group *pg, u32 nsrcs, size_t addr_size, int grec_type); +int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p, + u32 eht_hosts_limit); static inline bool br_multicast_eht_should_del_pg(const struct net_bridge_port_group *pg) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 7a59cdddd3ce..b66305fae26b 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -16,6 +16,7 @@ #include #include "br_private.h" +#include "br_private_mcast_eht.h" struct brport_attribute { struct attribute attr; @@ -245,6 +246,29 @@ static int store_multicast_router(struct net_bridge_port *p, static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); +static ssize_t show_multicast_eht_hosts_limit(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%u\n", p->multicast_eht_hosts_limit); +} + +static int store_multicast_eht_hosts_limit(struct net_bridge_port *p, + unsigned long v) +{ + return br_multicast_eht_set_hosts_limit(p, v); +} +static BRPORT_ATTR(multicast_eht_hosts_limit, 0644, + show_multicast_eht_hosts_limit, + store_multicast_eht_hosts_limit); + +static ssize_t show_multicast_eht_hosts_cnt(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%u\n", p->multicast_eht_hosts_cnt); +} +static BRPORT_ATTR(multicast_eht_hosts_cnt, 0444, show_multicast_eht_hosts_cnt, + NULL); + BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST); #endif @@ -274,6 +298,8 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, &brport_attr_multicast_to_unicast, + &brport_attr_multicast_eht_hosts_limit, + &brport_attr_multicast_eht_hosts_cnt, #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3d6ab194d0f5..c313aaf2bce1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -55,7 +55,7 @@ #include #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 36 +#define RTNL_SLAVE_MAX_TYPE 40 struct rtnl_link { rtnl_doit_func doit; -- cgit v1.2.3 From 772412176fb98493158929b220fe250127f611af Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 27 Jan 2021 11:31:39 -0800 Subject: bpf: Allow rewriting to ports under ip_unprivileged_port_start At the moment, BPF_CGROUP_INET{4,6}_BIND hooks can rewrite user_port to the privileged ones (< ip_unprivileged_port_start), but it will be rejected later on in the __inet_bind or __inet6_bind. Let's add another return value to indicate that CAP_NET_BIND_SERVICE check should be ignored. Use the same idea as we currently use in cgroup/egress where bit #1 indicates CN. Instead, for cgroup/bind{4,6}, bit #1 indicates that CAP_NET_BIND_SERVICE should be bypassed. v5: - rename flags to be less confusing (Andrey Ignatov) - rework BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY to work on flags and accept BPF_RET_SET_CN (no behavioral changes) v4: - Add missing IPv6 support (Martin KaFai Lau) v3: - Update description (Martin KaFai Lau) - Fix capability restore in selftest (Martin KaFai Lau) v2: - Switch to explicit return code (Martin KaFai Lau) Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Reviewed-by: Martin KaFai Lau Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20210127193140.3170382-1-sdf@google.com --- include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++++++--------- include/linux/bpf.h | 52 +++++++++++++++++++++++++++++----------------- include/net/inet_common.h | 2 ++ kernel/bpf/cgroup.c | 8 +++++-- kernel/bpf/verifier.c | 3 +++ net/ipv4/af_inet.c | 9 +++++--- net/ipv6/af_inet6.c | 9 +++++--- 7 files changed, 84 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0748fd87969e..c42e02b4d84b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -125,7 +125,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx); + void *t_ctx, + u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, @@ -231,30 +232,48 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ + u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - NULL); \ + NULL, \ + &__unused_flags); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ + u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - t_ctx); \ + t_ctx, \ + &__unused_flags); \ release_sock(sk); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) - -#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) +/* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags + * via upper bits of return code. The only flag that is supported + * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check + * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). + */ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +({ \ + u32 __flags = 0; \ + int __ret = 0; \ + if (cgroup_bpf_enabled(type)) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + NULL, &__flags); \ + release_sock(sk); \ + if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ + *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ + } \ + __ret; \ +}) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ @@ -453,8 +472,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1aac2af12fed..321966fc35db 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1073,6 +1073,34 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +/* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ +#define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) +/* BPF program asks to set CN on the packet. */ +#define BPF_RET_SET_CN (1 << 0) + +#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 _ret = 1; \ + u32 func_ret; \ + migrate_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + func_ret = func(_prog, ctx); \ + _ret &= (func_ret & 1); \ + *(ret_flags) |= (func_ret >> 1); \ + _item++; \ + } \ + rcu_read_unlock(); \ + migrate_enable(); \ + _ret; \ + }) + #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ struct bpf_prog_array_item *_item; \ @@ -1120,25 +1148,11 @@ _out: \ */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - u32 ret; \ - u32 _ret = 1; \ - u32 _cn = 0; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ - ret = func(_prog, ctx); \ - _ret &= (ret & 1); \ - _cn |= (ret & 2); \ - _item++; \ - } \ - rcu_read_unlock(); \ - migrate_enable(); \ + u32 _flags = 0; \ + bool _cn; \ + u32 _ret; \ + _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \ + _cn = _flags & BPF_RET_SET_CN; \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index cb2818862919..cad2a611efde 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -41,6 +41,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) +/* Skip CAP_NET_BIND_SERVICE check. */ +#define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da649f20d6b2..cdf3c7e611d9 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program + * return value (OR'ed together). * * socket is expected to be of type INET or INET6. * @@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx) + void *t_ctx, + u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, @@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN, flags); return ret == 1 ? 0 : -EPERM; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d0eae51b31e4..972fc38eb62d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7986,6 +7986,9 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6ba2930ff49b..aaa94bea19c3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET4_BIND, &flags); if (err) return err; - return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet_bind); @@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b9c654836b72..f091fe9b4da5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,7 +295,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -439,6 +440,7 @@ out_unlock: int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err = 0; /* If the socket has its own bind function then use it. */ @@ -451,11 +453,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET6_BIND, &flags); if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet6_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet6_bind); -- cgit v1.2.3 From e78ab164591ffd55d2771401ed0d9b083dad55fa Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 26 Jan 2021 15:24:06 -0800 Subject: devlink: Add DMAC filter generic packet trap Add packet trap that can report packets that were dropped due to destination MAC filtering. Signed-off-by: Aya Levin Reviewed-by: Ido Schimmel Reviewed-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-trap.rst | 5 +++++ include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index d875f3e1e9cf..935b6397e8cf 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -480,6 +480,11 @@ be added to the following table: - ``drop`` - Traps packets that the device decided to drop in case they hit a blackhole nexthop + * - ``dmac_filter`` + - ``drop`` + - Traps incoming packets that the device decided to drop because + the destination MAC is not configured in the MAC table and + the interface is not in promiscuous mode Driver-specific Packet Traps ============================ diff --git a/include/net/devlink.h b/include/net/devlink.h index d12ed2854c34..426b98e74b6e 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -838,6 +838,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_GTP_PARSING, DEVLINK_TRAP_GENERIC_ID_ESP_PARSING, DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_NEXTHOP, + DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -1063,6 +1064,8 @@ enum devlink_trap_group_generic_id { "esp_parsing" #define DEVLINK_TRAP_GENERIC_NAME_BLACKHOLE_NEXTHOP \ "blackhole_nexthop" +#define DEVLINK_TRAP_GENERIC_NAME_DMAC_FILTER \ + "dest_mac_filter" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" diff --git a/net/core/devlink.c b/net/core/devlink.c index 72ea79879762..f6e6445b9801 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9512,6 +9512,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), + DEVLINK_TRAP(DMAC_FILTER, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ -- cgit v1.2.3 From 3d347b1b19da20f973d1d3c6bb60c11185606afd Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 26 Jan 2021 15:24:07 -0800 Subject: net/mlx5: Add support for devlink traps in mlx5 core driver Add devlink traps infra-structure to mlx5 core driver. Add traps list to mlx5_priv and corresponding API: - mlx5_devlink_trap_report() to wrap trap reports to devlink - mlx5_devlink_trap_get_num_active() to decide whether to open/close trap resources. Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 84 +++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/devlink.h | 16 +++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 + include/linux/mlx5/driver.h | 1 + 4 files changed, 103 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 3261d0dc1104..f04afaf785cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -168,6 +168,56 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a return 0; } +static struct mlx5_devlink_trap *mlx5_find_trap_by_id(struct mlx5_core_dev *dev, int trap_id) +{ + struct mlx5_devlink_trap *dl_trap; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.id == trap_id) + return dl_trap; + + return NULL; +} + +static int mlx5_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = kzalloc(sizeof(*dl_trap), GFP_KERNEL); + if (!dl_trap) + return -ENOMEM; + + dl_trap->trap.id = trap->id; + dl_trap->trap.action = DEVLINK_TRAP_ACTION_DROP; + dl_trap->item = trap_ctx; + + if (mlx5_find_trap_by_id(dev, trap->id)) { + kfree(dl_trap); + mlx5_core_err(dev, "Devlink trap: Trap 0x%x already found", trap->id); + return -EEXIST; + } + + list_add_tail(&dl_trap->list, &dev->priv.traps); + return 0; +} + +static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Missing trap id 0x%x", trap->id); + return; + } + list_del(&dl_trap->list); + kfree(dl_trap); +} + static const struct devlink_ops mlx5_devlink_ops = { #ifdef CONFIG_MLX5_ESWITCH .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, @@ -186,8 +236,42 @@ static const struct devlink_ops mlx5_devlink_ops = { .reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET), .reload_down = mlx5_devlink_reload_down, .reload_up = mlx5_devlink_reload_up, + .trap_init = mlx5_devlink_trap_init, + .trap_fini = mlx5_devlink_trap_fini, }; +void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port) +{ + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Report on invalid trap id 0x%x", trap_id); + return; + } + + if (dl_trap->trap.action != DEVLINK_TRAP_ACTION_TRAP) { + mlx5_core_dbg(dev, "Devlink trap: Trap id %d has action %d", trap_id, + dl_trap->trap.action); + return; + } + devlink_trap_report(devlink, skb, dl_trap->item, dl_port, NULL); +} + +int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev) +{ + struct mlx5_devlink_trap *dl_trap; + int count = 0; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.action == DEVLINK_TRAP_ACTION_TRAP) + count++; + + return count; +} + struct devlink *mlx5_devlink_alloc(void) { return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h index f0de327a59be..a9829006fa78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -12,6 +12,22 @@ enum mlx5_devlink_param_id { MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, }; +struct mlx5_trap_ctx { + int id; + int action; +}; + +struct mlx5_devlink_trap { + struct mlx5_trap_ctx trap; + void *item; + struct list_head list; +}; + +struct mlx5_core_dev; +void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port); +int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev); + struct devlink *mlx5_devlink_alloc(void); void mlx5_devlink_free(struct devlink *devlink); int mlx5_devlink_register(struct devlink *devlink, struct device *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ca6f2fc39ea0..bfedf064db1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1305,6 +1305,8 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) priv->dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root); + INIT_LIST_HEAD(&priv->traps); + err = mlx5_health_init(dev); if (err) goto err_health_init; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f93bfe7473aa..c4615dc51b6f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -564,6 +564,7 @@ struct mlx5_priv { int host_pf_pages; struct mlx5_core_health health; + struct list_head traps; /* start: qp staff */ struct dentry *qp_debugfs; -- cgit v1.2.3 From 241dc159391fb9d351362d911a39dff84074cc92 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 26 Jan 2021 15:24:11 -0800 Subject: net/mlx5: Notify on trap action by blocking event In order to allow mlx5 core driver to trigger synchronous operations to its consumers, add a blocking events handler. Add wrappers to blocking_notifier_[call_chain/chain_register/chain_unregister]. Add trap callback for action set and notify about this change. Following patches in the set add a listener for this event. Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 36 +++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/events.c | 28 ++++++++++++++++++ include/linux/mlx5/device.h | 4 +++ include/linux/mlx5/driver.h | 15 ++++++++++ 4 files changed, 83 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index f081eff9be25..c47291467cb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -218,6 +218,41 @@ static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink kfree(dl_trap); } +static int mlx5_devlink_trap_action_set(struct devlink *devlink, + const struct devlink_trap *trap, + enum devlink_trap_action action, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum devlink_trap_action action_orig; + struct mlx5_devlink_trap *dl_trap; + int err = 0; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id); + err = -EINVAL; + goto out; + } + + if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) { + err = -EOPNOTSUPP; + goto out; + } + + if (action == dl_trap->trap.action) + goto out; + + action_orig = dl_trap->trap.action; + dl_trap->trap.action = action; + err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP, + &dl_trap->trap); + if (err) + dl_trap->trap.action = action_orig; +out: + return err; +} + static const struct devlink_ops mlx5_devlink_ops = { #ifdef CONFIG_MLX5_ESWITCH .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, @@ -238,6 +273,7 @@ static const struct devlink_ops mlx5_devlink_ops = { .reload_up = mlx5_devlink_reload_up, .trap_init = mlx5_devlink_trap_init, .trap_fini = mlx5_devlink_trap_fini, + .trap_action_set = mlx5_devlink_trap_action_set, }; void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c index 054c0bc36d24..670f25f5ffd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/events.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -61,6 +61,8 @@ struct mlx5_events { struct mlx5_pme_stats pme_stats; /*pcie_core*/ struct work_struct pcie_core_work; + /* driver notifier chain for sw events */ + struct blocking_notifier_head sw_nh; }; static const char *eqe_type_str(u8 type) @@ -351,6 +353,7 @@ int mlx5_events_init(struct mlx5_core_dev *dev) return -ENOMEM; } INIT_WORK(&events->pcie_core_work, mlx5_pcie_event); + BLOCKING_INIT_NOTIFIER_HEAD(&events->sw_nh); return 0; } @@ -406,3 +409,28 @@ int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, voi { return atomic_notifier_call_chain(&events->fw_nh, event, data); } + +/* This API is used only for processing and forwarding driver-specific + * events to mlx5 consumers. + */ +int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + struct mlx5_events *events = dev->priv.events; + + return blocking_notifier_chain_register(&events->sw_nh, nb); +} + +int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + struct mlx5_events *events = dev->priv.events; + + return blocking_notifier_chain_unregister(&events->sw_nh, nb); +} + +int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event, + void *data) +{ + struct mlx5_events *events = dev->priv.events; + + return blocking_notifier_call_chain(&events->sw_nh, event, data); +} diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f1de49d64a98..77ba54d38772 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -359,6 +359,10 @@ enum mlx5_event { MLX5_EVENT_TYPE_MAX = 0x100, }; +enum mlx5_driver_event { + MLX5_DRIVER_EVENT_TYPE_TRAP = 0, +}; + enum { MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE = 0x0, MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE = 0x1, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c4615dc51b6f..45df5b465ba8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1073,11 +1073,26 @@ enum { MAX_MR_CACHE_ENTRIES }; +/* Async-atomic event notifier used by mlx5 core to forward FW + * evetns recived from event queue to mlx5 consumers. + * Optimise event queue dipatching. + */ int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb); + +/* Async-atomic event notifier used for forwarding + * evetns from the event queue into the to mlx5 events dispatcher, + * eswitch, clock and others. + */ int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb); int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb); +/* Blocking event notifier used to forward SW events, used for slow path */ +int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); +int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb); +int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event, + void *data); + int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 5543e989fe5e2fe6a5829ee42c00152cac2bb8a0 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 26 Jan 2021 15:24:16 -0800 Subject: net/mlx5e: Add trap entity to ETH driver Introduce mlx5e_trap which includes a dedicated RQ and NAPI for trapped packets. Trap-RQ processes packets that were destined to be dropped, but for debug and visibility sake these packets are trapped and reported to devlink. Trap-RQ connects between the HW and the driver and is not a part of a channel. Open mlx5e_create_rq() and mlx5_core_destroy_rq() as API and add dedicate RQ handlers which report to devlink of trapped packets. Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 + drivers/net/ethernet/mellanox/mlx5/core/en/trap.c | 409 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en/trap.h | 35 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 46 +++ include/linux/mlx5/device.h | 5 + 7 files changed, 505 insertions(+), 4 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/trap.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/trap.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index fcfc0b114985..d44f5f6ee449 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -27,7 +27,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_selftest.o en/port.o en/monitor_stats.o en/health.o \ en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \ en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \ - en/qos.o + en/qos.o en/trap.o # # Netdev extra diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index dc4895a1fa9b..f439a977ad61 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -564,6 +564,7 @@ typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq); typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16); int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool xsk); +void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params); enum mlx5e_rq_flag { MLX5E_RQ_FLAG_XDP_XMIT, @@ -805,6 +806,8 @@ struct mlx5e_htb { u16 defcls; }; +struct mlx5e_trap; + struct mlx5e_priv { /* priv data path fields - start */ /* +1 for port ptp ts */ @@ -844,8 +847,10 @@ struct mlx5e_priv { struct mlx5_core_dev *mdev; struct net_device *netdev; + struct mlx5e_trap *en_trap; struct mlx5e_stats stats; struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS]; + struct mlx5e_channel_stats trap_stats; struct mlx5e_port_ptp_stats port_ptp_stats; u16 max_nch; u8 max_opened_tc; @@ -961,6 +966,8 @@ int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params, int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time); void mlx5e_deactivate_rq(struct mlx5e_rq *rq); void mlx5e_close_rq(struct mlx5e_rq *rq); +int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param); +void mlx5e_destroy_rq(struct mlx5e_rq *rq); struct mlx5e_sq_param; int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c new file mode 100644 index 000000000000..5507efacb9dc --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies */ + +#include +#include "en/txrx.h" +#include "en/params.h" +#include "en/trap.h" + +static int mlx5e_trap_napi_poll(struct napi_struct *napi, int budget) +{ + struct mlx5e_trap *trap_ctx = container_of(napi, struct mlx5e_trap, napi); + struct mlx5e_ch_stats *ch_stats = trap_ctx->stats; + struct mlx5e_rq *rq = &trap_ctx->rq; + bool busy = false; + int work_done = 0; + + ch_stats->poll++; + + work_done = mlx5e_poll_rx_cq(&rq->cq, budget); + busy |= work_done == budget; + busy |= rq->post_wqes(rq); + + if (busy) + return budget; + + if (unlikely(!napi_complete_done(napi, work_done))) + return work_done; + + mlx5e_cq_arm(&rq->cq); + return work_done; +} + +static int mlx5e_alloc_trap_rq(struct mlx5e_priv *priv, struct mlx5e_rq_param *rqp, + struct mlx5e_rq_stats *stats, struct mlx5e_params *params, + struct mlx5e_ch_stats *ch_stats, + struct mlx5e_rq *rq) +{ + void *rqc_wq = MLX5_ADDR_OF(rqc, rqp->rqc, wq); + struct mlx5_core_dev *mdev = priv->mdev; + struct page_pool_params pp_params = {}; + int node = dev_to_node(mdev->device); + u32 pool_size; + int wq_sz; + int err; + int i; + + rqp->wq.db_numa_node = node; + + rq->wq_type = params->rq_wq_type; + rq->pdev = mdev->device; + rq->netdev = priv->netdev; + rq->mdev = mdev; + rq->priv = priv; + rq->stats = stats; + rq->clock = &mdev->clock; + rq->tstamp = &priv->tstamp; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + + xdp_rxq_info_unused(&rq->xdp_rxq); + + rq->buff.map_dir = DMA_FROM_DEVICE; + rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, NULL); + pool_size = 1 << params->log_rq_mtu_frames; + + err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq, &rq->wq_ctrl); + if (err) + return err; + + rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR]; + + wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq); + + rq->wqe.info = rqp->frags_info; + rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; + rq->wqe.frags = kvzalloc_node(array_size(sizeof(*rq->wqe.frags), + (wq_sz << rq->wqe.info.log_num_frags)), + GFP_KERNEL, node); + if (!rq->wqe.frags) { + err = -ENOMEM; + goto err_wq_cyc_destroy; + } + + err = mlx5e_init_di_list(rq, wq_sz, node); + if (err) + goto err_free_frags; + + rq->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + + mlx5e_rq_set_trap_handlers(rq, params); + + /* Create a page_pool and register it with rxq */ + pp_params.order = 0; + pp_params.flags = 0; /* No-internal DMA mapping in page_pool */ + pp_params.pool_size = pool_size; + pp_params.nid = node; + pp_params.dev = mdev->device; + pp_params.dma_dir = rq->buff.map_dir; + + /* page_pool can be used even when there is no rq->xdp_prog, + * given page_pool does not handle DMA mapping there is no + * required state to clear. And page_pool gracefully handle + * elevated refcnt. + */ + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->page_pool)) { + err = PTR_ERR(rq->page_pool); + rq->page_pool = NULL; + goto err_free_di_list; + } + for (i = 0; i < wq_sz; i++) { + struct mlx5e_rx_wqe_cyc *wqe = + mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i); + int f; + + for (f = 0; f < rq->wqe.info.num_frags; f++) { + u32 frag_size = rq->wqe.info.arr[f].frag_size | + MLX5_HW_START_PADDING; + + wqe->data[f].byte_count = cpu_to_be32(frag_size); + wqe->data[f].lkey = rq->mkey_be; + } + /* check if num_frags is not a pow of two */ + if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) { + wqe->data[f].byte_count = 0; + wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + wqe->data[f].addr = 0; + } + } + return 0; + +err_free_di_list: + mlx5e_free_di_list(rq); +err_free_frags: + kvfree(rq->wqe.frags); +err_wq_cyc_destroy: + mlx5_wq_destroy(&rq->wq_ctrl); + + return err; +} + +static void mlx5e_free_trap_rq(struct mlx5e_rq *rq) +{ + page_pool_destroy(rq->page_pool); + mlx5e_free_di_list(rq); + kvfree(rq->wqe.frags); + mlx5_wq_destroy(&rq->wq_ctrl); +} + +static int mlx5e_open_trap_rq(struct mlx5e_priv *priv, struct napi_struct *napi, + struct mlx5e_rq_stats *stats, struct mlx5e_params *params, + struct mlx5e_rq_param *rq_param, + struct mlx5e_ch_stats *ch_stats, + struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_create_cq_param ccp = {}; + struct dim_cq_moder trap_moder = {}; + struct mlx5e_cq *cq = &rq->cq; + int err; + + ccp.node = dev_to_node(mdev->device); + ccp.ch_stats = ch_stats; + ccp.napi = napi; + ccp.ix = 0; + err = mlx5e_open_cq(priv, trap_moder, &rq_param->cqp, &ccp, cq); + if (err) + return err; + + err = mlx5e_alloc_trap_rq(priv, rq_param, stats, params, ch_stats, rq); + if (err) + goto err_destroy_cq; + + err = mlx5e_create_rq(rq, rq_param); + if (err) + goto err_free_rq; + + err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + mlx5e_destroy_rq(rq); + mlx5e_free_rx_descs(rq); +err_free_rq: + mlx5e_free_trap_rq(rq); +err_destroy_cq: + mlx5e_close_cq(cq); + + return err; +} + +static void mlx5e_close_trap_rq(struct mlx5e_rq *rq) +{ + mlx5e_destroy_rq(rq); + mlx5e_free_rx_descs(rq); + mlx5e_free_trap_rq(rq); + mlx5e_close_cq(&rq->cq); +} + +static int mlx5e_create_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, + u32 rqn) +{ + void *tirc; + int inlen; + u32 *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.td.tdn); + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_NONE); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rqn); + err = mlx5e_create_tir(mdev, tir, in); + kvfree(in); + + return err; +} + +static void mlx5e_destroy_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir) +{ + mlx5e_destroy_tir(mdev, tir); +} + +static void mlx5e_activate_trap_rq(struct mlx5e_rq *rq) +{ + set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); +} + +static void mlx5e_deactivate_trap_rq(struct mlx5e_rq *rq) +{ + clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); +} + +static void mlx5e_build_trap_params(struct mlx5e_priv *priv, struct mlx5e_trap *t) +{ + struct mlx5e_params *params = &t->params; + + params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC; + mlx5e_init_rq_type_params(priv->mdev, params); + params->sw_mtu = priv->netdev->max_mtu; + mlx5e_build_rq_param(priv, params, NULL, &t->rq_param); +} + +static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv) +{ + int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, 0)); + struct net_device *netdev = priv->netdev; + struct mlx5e_trap *t; + int err; + + t = kvzalloc_node(sizeof(*t), GFP_KERNEL, cpu_to_node(cpu)); + if (!t) + return ERR_PTR(-ENOMEM); + + mlx5e_build_trap_params(priv, t); + + t->priv = priv; + t->mdev = priv->mdev; + t->tstamp = &priv->tstamp; + t->pdev = mlx5_core_dma_dev(priv->mdev); + t->netdev = priv->netdev; + t->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + t->stats = &priv->trap_stats.ch; + + netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll, 64); + + err = mlx5e_open_trap_rq(priv, &t->napi, + &priv->trap_stats.rq, + &t->params, &t->rq_param, + &priv->trap_stats.ch, + &t->rq); + if (unlikely(err)) + goto err_napi_del; + + err = mlx5e_create_trap_direct_rq_tir(t->mdev, &t->tir, t->rq.rqn); + if (err) + goto err_close_trap_rq; + + return t; + +err_close_trap_rq: + mlx5e_close_trap_rq(&t->rq); +err_napi_del: + netif_napi_del(&t->napi); + kvfree(t); + return ERR_PTR(err); +} + +void mlx5e_close_trap(struct mlx5e_trap *trap) +{ + mlx5e_destroy_trap_direct_rq_tir(trap->mdev, &trap->tir); + mlx5e_close_trap_rq(&trap->rq); + netif_napi_del(&trap->napi); + kvfree(trap); +} + +static void mlx5e_activate_trap(struct mlx5e_trap *trap) +{ + napi_enable(&trap->napi); + mlx5e_activate_trap_rq(&trap->rq); + napi_schedule(&trap->napi); +} + +void mlx5e_deactivate_trap(struct mlx5e_priv *priv) +{ + struct mlx5e_trap *trap = priv->en_trap; + + mlx5e_deactivate_trap_rq(&trap->rq); + napi_disable(&trap->napi); +} + +static struct mlx5e_trap *mlx5e_add_trap_queue(struct mlx5e_priv *priv) +{ + struct mlx5e_trap *trap; + + trap = mlx5e_open_trap(priv); + if (IS_ERR(trap)) + goto out; + + mlx5e_activate_trap(trap); +out: + return trap; +} + +static void mlx5e_del_trap_queue(struct mlx5e_priv *priv) +{ + mlx5e_deactivate_trap(priv); + mlx5e_close_trap(priv->en_trap); + priv->en_trap = NULL; +} + +static int mlx5e_trap_get_tirn(struct mlx5e_trap *en_trap) +{ + return en_trap->tir.tirn; +} + +static int mlx5e_handle_action_trap(struct mlx5e_priv *priv, int trap_id) +{ + bool open_queue = !priv->en_trap; + struct mlx5e_trap *trap; + int err; + + if (open_queue) { + trap = mlx5e_add_trap_queue(priv); + if (IS_ERR(trap)) + return PTR_ERR(trap); + priv->en_trap = trap; + } + + switch (trap_id) { + case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER: + err = mlx5e_add_vlan_trap(priv, trap_id, mlx5e_trap_get_tirn(priv->en_trap)); + if (err) + goto err_out; + break; + default: + netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id); + err = -EINVAL; + goto err_out; + } + return 0; + +err_out: + if (open_queue) + mlx5e_del_trap_queue(priv); + return err; +} + +static int mlx5e_handle_action_drop(struct mlx5e_priv *priv, int trap_id) +{ + switch (trap_id) { + case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER: + mlx5e_remove_vlan_trap(priv); + break; + default: + netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id); + return -EINVAL; + } + if (priv->en_trap && !mlx5_devlink_trap_get_num_active(priv->mdev)) + mlx5e_del_trap_queue(priv); + + return 0; +} + +int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx) +{ + int err = 0; + + switch (trap_ctx->action) { + case DEVLINK_TRAP_ACTION_TRAP: + err = mlx5e_handle_action_trap(priv, trap_ctx->id); + break; + case DEVLINK_TRAP_ACTION_DROP: + err = mlx5e_handle_action_drop(priv, trap_ctx->id); + break; + default: + netdev_warn(priv->netdev, "%s: Unsupported action %d\n", __func__, + trap_ctx->action); + err = -EINVAL; + } + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h new file mode 100644 index 000000000000..cc1fa9f12c45 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies */ + +#ifndef __MLX5E_TRAP_H__ +#define __MLX5E_TRAP_H__ + +#include "../en.h" +#include "../devlink.h" + +struct mlx5e_trap { + /* data path */ + struct mlx5e_rq rq; + struct mlx5e_tir tir; + struct napi_struct napi; + struct device *pdev; + struct net_device *netdev; + __be32 mkey_be; + + /* data path - accessed per napi poll */ + struct mlx5e_ch_stats *stats; + + /* control */ + struct mlx5e_priv *priv; + struct mlx5_core_dev *mdev; + struct hwtstamp_config *tstamp; + DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES); + + struct mlx5e_params params; + struct mlx5e_rq_param rq_param; +}; + +void mlx5e_close_trap(struct mlx5e_trap *trap); +void mlx5e_deactivate_trap(struct mlx5e_priv *priv); +int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx); +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bed2f1a6d730..ec5bb48cb54a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -649,8 +649,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) mlx5_wq_destroy(&rq->wq_ctrl); } -static int mlx5e_create_rq(struct mlx5e_rq *rq, - struct mlx5e_rq_param *param) +int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5_core_dev *mdev = rq->mdev; @@ -773,7 +772,7 @@ static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) return err; } -static void mlx5e_destroy_rq(struct mlx5e_rq *rq) +void mlx5e_destroy_rq(struct mlx5e_rq *rq) { mlx5_core_destroy_rq(rq->mdev, rq->rqn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index dec93d57542f..98b56f495b32 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -52,6 +52,7 @@ #include "en/xsk/rx.h" #include "en/health.h" #include "en/params.h" +#include "devlink.h" static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, @@ -1815,3 +1816,48 @@ int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool return 0; } + +static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_priv *priv = netdev_priv(rq->netdev); + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 trap_id; + u16 ci; + + trap_id = get_cqe_flow_tag(cqe); + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + rq->stats->wqe_err++; + goto free_wqe; + } + + skb = mlx5e_skb_from_cqe_nonlinear(rq, cqe, wi, cqe_bcnt); + if (!skb) + goto free_wqe; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + skb_push(skb, ETH_HLEN); + + mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port); + dev_kfree_skb_any(skb); + +free_wqe: + mlx5e_free_rx_wqe(rq, wi, false); + mlx5_wq_cyc_pop(wq); +} + +void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params) +{ + rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params, NULL) ? + mlx5e_skb_from_cqe_linear : + mlx5e_skb_from_cqe_nonlinear; + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + rq->handle_rx_cqe = mlx5e_trap_handle_rx_cqe; +} diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 77ba54d38772..00057eae89ab 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -903,6 +903,11 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) return (u64)lo | ((u64)hi << 32); } +static inline u16 get_cqe_flow_tag(struct mlx5_cqe64 *cqe) +{ + return be32_to_cpu(cqe->sop_drop_qpn) & 0xFFF; +} + #define MLX5_MPWQE_LOG_NUM_STRIDES_BASE (9) #define MLX5_MPWQE_LOG_STRIDE_SZ_BASE (6) -- cgit v1.2.3 From 28af22c6c8dff6a16163e5b6a56211d5b535c97b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Jan 2021 18:39:39 +0100 Subject: net: adjust net_device layout for cacheline usage The current layout of net_device is not optimal for cacheline usage. The member adj_list.lower linked list is split between cacheline 2 and 3. The ifindex is placed together with stats (struct net_device_stats), although most modern drivers don't update this stats member. The members netdev_ops, mtu and hard_header_len are placed on three different cachelines. These members are accessed for XDP redirect into devmap, which were noticeably with perf tool. When not using the map redirect variant (like TC-BPF does), then ifindex is also used, which is placed on a separate fourth cacheline. These members are also accessed during forwarding with regular network stack. The members priv_flags and flags are on fast-path for network stack transmit path in __dev_queue_xmit (currently located together with mtu cacheline). This patch creates a read mostly cacheline, with the purpose of keeping the above mentioned members on the same cacheline. Some netdev_features_t members also becomes part of this cacheline, which is on purpose, as function netif_skb_features() is on fast-path via validate_xmit_skb(). Signed-off-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/161168277983.410784.12401225493601624417.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 53 ++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e8572533d8e..e9e7ada07ea1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1858,7 +1858,6 @@ struct net_device { unsigned long mem_end; unsigned long mem_start; unsigned long base_addr; - int irq; /* * Some hardware also needs these fields (state,dev_list, @@ -1880,6 +1879,23 @@ struct net_device { struct list_head lower; } adj_list; + /* Read-mostly cache-line for fast-path access */ + unsigned int flags; + unsigned int priv_flags; + const struct net_device_ops *netdev_ops; + int ifindex; + unsigned short gflags; + unsigned short hard_header_len; + + /* Note : dev->mtu is often read without holding a lock. + * Writers usually hold RTNL. + * It is recommended to use READ_ONCE() to annotate the reads, + * and to use WRITE_ONCE() to annotate the writes. + */ + unsigned int mtu; + unsigned short needed_headroom; + unsigned short needed_tailroom; + netdev_features_t features; netdev_features_t hw_features; netdev_features_t wanted_features; @@ -1888,10 +1904,15 @@ struct net_device { netdev_features_t mpls_features; netdev_features_t gso_partial_features; - int ifindex; + unsigned int min_mtu; + unsigned int max_mtu; + unsigned short type; + unsigned char min_header_len; + unsigned char name_assign_type; + int group; - struct net_device_stats stats; + struct net_device_stats stats; /* not used by modern drivers */ atomic_long_t rx_dropped; atomic_long_t tx_dropped; @@ -1905,7 +1926,6 @@ struct net_device { const struct iw_handler_def *wireless_handlers; struct iw_public_data *wireless_data; #endif - const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV const struct l3mdev_ops *l3mdev_ops; @@ -1924,34 +1944,12 @@ struct net_device { const struct header_ops *header_ops; - unsigned int flags; - unsigned int priv_flags; - - unsigned short gflags; - unsigned short padded; - unsigned char operstate; unsigned char link_mode; unsigned char if_port; unsigned char dma; - /* Note : dev->mtu is often read without holding a lock. - * Writers usually hold RTNL. - * It is recommended to use READ_ONCE() to annotate the reads, - * and to use WRITE_ONCE() to annotate the writes. - */ - unsigned int mtu; - unsigned int min_mtu; - unsigned int max_mtu; - unsigned short type; - unsigned short hard_header_len; - unsigned char min_header_len; - unsigned char name_assign_type; - - unsigned short needed_headroom; - unsigned short needed_tailroom; - /* Interface address info. */ unsigned char perm_addr[MAX_ADDR_LEN]; unsigned char addr_assign_type; @@ -1962,7 +1960,10 @@ struct net_device { unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; + unsigned short padded; + spinlock_t addr_list_lock; + int irq; struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; -- cgit v1.2.3 From ef6af7bdb9e6c14eae8dc5fe852aefe1e089c85c Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 28 Jan 2021 12:41:06 +0100 Subject: net/af_iucv: count packets in the xmit path The TX code keeps track of all skbs that are in-flight but haven't actually been sent out yet. For native IUCV sockets that's not a huge deal, but with TRANS_HIPER sockets it would be much better if we didn't need to maintain a list of skb clones. Note that we actually only care about the _count_ of skbs in this stage of the TX pipeline. So as prep work for removing the skb tracking on TRANS_HIPER sockets, keep track of the skb count in a separate variable and pair any list {enqueue, unlink} with a count {increment, decrement}. Then replace all occurences where we currently look at the skb list's fill level. Signed-off-by: Julian Wiedmann Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/net/iucv/af_iucv.h | 1 + net/iucv/af_iucv.c | 30 ++++++++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 9259ce2b22f3..0816bcd44dd1 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -128,6 +128,7 @@ struct iucv_sock { u8 flags; u16 msglimit; u16 msglimit_peer; + atomic_t skbs_in_xmit; atomic_t msg_sent; atomic_t msg_recv; atomic_t pendings; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 8683b6939f45..e487f472027a 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -182,7 +182,7 @@ static inline int iucv_below_msglim(struct sock *sk) if (sk->sk_state != IUCV_CONNECTED) return 1; if (iucv->transport == AF_IUCV_TRANS_IUCV) - return (skb_queue_len(&iucv->send_skb_q) < iucv->path->msglim); + return (atomic_read(&iucv->skbs_in_xmit) < iucv->path->msglim); else return ((atomic_read(&iucv->msg_sent) < iucv->msglimit_peer) && (atomic_read(&iucv->pendings) <= 0)); @@ -269,8 +269,10 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, } skb_queue_tail(&iucv->send_skb_q, nskb); + atomic_inc(&iucv->skbs_in_xmit); err = dev_queue_xmit(skb); if (net_xmit_eval(err)) { + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(nskb, &iucv->send_skb_q); kfree_skb(nskb); } else { @@ -424,7 +426,7 @@ static void iucv_sock_close(struct sock *sk) sk->sk_state = IUCV_CLOSING; sk->sk_state_change(sk); - if (!err && !skb_queue_empty(&iucv->send_skb_q)) { + if (!err && atomic_read(&iucv->skbs_in_xmit) > 0) { if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) timeo = sk->sk_lingertime; else @@ -491,6 +493,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, atomic_set(&iucv->pendings, 0); iucv->flags = 0; iucv->msglimit = 0; + atomic_set(&iucv->skbs_in_xmit, 0); atomic_set(&iucv->msg_sent, 0); atomic_set(&iucv->msg_recv, 0); iucv->path = NULL; @@ -1055,6 +1058,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, } } else { /* Classic VM IUCV transport */ skb_queue_tail(&iucv->send_skb_q, skb); + atomic_inc(&iucv->skbs_in_xmit); if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) && skb->len <= 7) { @@ -1063,6 +1067,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* on success: there is no message_complete callback */ /* for an IPRMDATA msg; remove skb from send queue */ if (err == 0) { + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); kfree_skb(skb); } @@ -1071,6 +1076,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* IUCV_IPRMDATA path flag is set... sever path */ if (err == 0x15) { pr_iucv->path_sever(iucv->path, NULL); + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); err = -EPIPE; goto fail; @@ -1109,6 +1115,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, } else { err = -EPIPE; } + + atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); goto fail; } @@ -1748,10 +1756,14 @@ static void iucv_callback_txdone(struct iucv_path *path, { struct sock *sk = path->private; struct sk_buff *this = NULL; - struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q; + struct sk_buff_head *list; struct sk_buff *list_skb; + struct iucv_sock *iucv; unsigned long flags; + iucv = iucv_sk(sk); + list = &iucv->send_skb_q; + bh_lock_sock(sk); spin_lock_irqsave(&list->lock, flags); @@ -1761,8 +1773,11 @@ static void iucv_callback_txdone(struct iucv_path *path, break; } } - if (this) + if (this) { + atomic_dec(&iucv->skbs_in_xmit); __skb_unlink(this, list); + } + spin_unlock_irqrestore(&list->lock, flags); if (this) { @@ -1772,7 +1787,7 @@ static void iucv_callback_txdone(struct iucv_path *path, } if (sk->sk_state == IUCV_CLOSING) { - if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) { + if (atomic_read(&iucv->skbs_in_xmit) == 0) { sk->sk_state = IUCV_CLOSED; sk->sk_state_change(sk); } @@ -2150,6 +2165,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, if (skb_shinfo(list_skb) == skb_shinfo(skb)) { switch (n) { case TX_NOTIFY_OK: + atomic_dec(&iucv->skbs_in_xmit); __skb_unlink(list_skb, list); kfree_skb(list_skb); iucv_sock_wake_msglim(sk); @@ -2158,6 +2174,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, atomic_inc(&iucv->pendings); break; case TX_NOTIFY_DELAYED_OK: + atomic_dec(&iucv->skbs_in_xmit); __skb_unlink(list_skb, list); atomic_dec(&iucv->pendings); if (atomic_read(&iucv->pendings) <= 0) @@ -2169,6 +2186,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, case TX_NOTIFY_TPQFULL: /* not yet used */ case TX_NOTIFY_GENERALERROR: case TX_NOTIFY_DELAYED_GENERALERROR: + atomic_dec(&iucv->skbs_in_xmit); __skb_unlink(list_skb, list); kfree_skb(list_skb); if (sk->sk_state == IUCV_CONNECTED) { @@ -2183,7 +2201,7 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, spin_unlock_irqrestore(&list->lock, flags); if (sk->sk_state == IUCV_CLOSING) { - if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) { + if (atomic_read(&iucv->skbs_in_xmit) == 0) { sk->sk_state = IUCV_CLOSED; sk->sk_state_change(sk); } -- cgit v1.2.3 From 80bc97aa0aaab974bbbfb99a78d7515414004616 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 28 Jan 2021 12:41:07 +0100 Subject: net/af_iucv: don't track individual TX skbs for TRANS_HIPER sockets Stop maintaining the skb_send_q list for TRANS_HIPER sockets. Not only is it extra overhead, but keeping around a list of skb clones means that we later also have to match the ->sk_txnotify() calls against these clones and free them accordingly. The current matching logic (comparing the skbs' shinfo location) is frustratingly fragile, and breaks if the skb's head is mangled in any sort of way while passing from dev_queue_xmit() to the device's HW queue. Also adjust the interface for ->sk_txnotify(), to make clear that we don't actually care about any skb internals. Signed-off-by: Julian Wiedmann Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_core_main.c | 6 ++- include/net/iucv/af_iucv.h | 2 +- net/iucv/af_iucv.c | 80 ++++++++++----------------------------- 3 files changed, 26 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index ea2e139cd592..89b223885b0c 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1409,10 +1409,12 @@ static void qeth_notify_skbs(struct qeth_qdio_out_q *q, struct sk_buff *skb; skb_queue_walk(&buf->skb_list, skb) { + struct sock *sk = skb->sk; + QETH_CARD_TEXT_(q->card, 5, "skbn%d", notification); QETH_CARD_TEXT_(q->card, 5, "%lx", (long) skb); - if (skb->sk && skb->sk->sk_family == PF_IUCV) - iucv_sk(skb->sk)->sk_txnotify(skb, notification); + if (sk && sk->sk_family == PF_IUCV) + iucv_sk(sk)->sk_txnotify(sk, notification); } } diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 0816bcd44dd1..ff06246dbbb9 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -133,7 +133,7 @@ struct iucv_sock { atomic_t msg_recv; atomic_t pendings; int transport; - void (*sk_txnotify)(struct sk_buff *skb, + void (*sk_txnotify)(struct sock *sk, enum iucv_tx_notify n); }; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index e487f472027a..0e0656db4ae7 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -89,7 +89,7 @@ static struct sock *iucv_accept_dequeue(struct sock *parent, static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); +static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify); /* Call Back functions */ static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); @@ -211,7 +211,6 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, { struct iucv_sock *iucv = iucv_sk(sock); struct af_iucv_trans_hdr *phs_hdr; - struct sk_buff *nskb; int err, confirm_recv = 0; phs_hdr = skb_push(skb, sizeof(*phs_hdr)); @@ -261,20 +260,10 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, } skb->protocol = cpu_to_be16(ETH_P_AF_IUCV); - __skb_header_release(skb); - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) { - err = -ENOMEM; - goto err_free; - } - - skb_queue_tail(&iucv->send_skb_q, nskb); atomic_inc(&iucv->skbs_in_xmit); err = dev_queue_xmit(skb); if (net_xmit_eval(err)) { atomic_dec(&iucv->skbs_in_xmit); - skb_unlink(nskb, &iucv->send_skb_q); - kfree_skb(nskb); } else { atomic_sub(confirm_recv, &iucv->msg_recv); WARN_ON(atomic_read(&iucv->msg_recv) < 0); @@ -2146,59 +2135,33 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, * afiucv_hs_callback_txnotify() - handle send notifcations from HiperSockets * transport **/ -static void afiucv_hs_callback_txnotify(struct sk_buff *skb, - enum iucv_tx_notify n) +static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify n) { - struct iucv_sock *iucv = iucv_sk(skb->sk); - struct sock *sk = skb->sk; - struct sk_buff_head *list; - struct sk_buff *list_skb; - struct sk_buff *nskb; - unsigned long flags; + struct iucv_sock *iucv = iucv_sk(sk); if (sock_flag(sk, SOCK_ZAPPED)) return; - list = &iucv->send_skb_q; - spin_lock_irqsave(&list->lock, flags); - skb_queue_walk_safe(list, list_skb, nskb) { - if (skb_shinfo(list_skb) == skb_shinfo(skb)) { - switch (n) { - case TX_NOTIFY_OK: - atomic_dec(&iucv->skbs_in_xmit); - __skb_unlink(list_skb, list); - kfree_skb(list_skb); - iucv_sock_wake_msglim(sk); - break; - case TX_NOTIFY_PENDING: - atomic_inc(&iucv->pendings); - break; - case TX_NOTIFY_DELAYED_OK: - atomic_dec(&iucv->skbs_in_xmit); - __skb_unlink(list_skb, list); - atomic_dec(&iucv->pendings); - if (atomic_read(&iucv->pendings) <= 0) - iucv_sock_wake_msglim(sk); - kfree_skb(list_skb); - break; - case TX_NOTIFY_UNREACHABLE: - case TX_NOTIFY_DELAYED_UNREACHABLE: - case TX_NOTIFY_TPQFULL: /* not yet used */ - case TX_NOTIFY_GENERALERROR: - case TX_NOTIFY_DELAYED_GENERALERROR: - atomic_dec(&iucv->skbs_in_xmit); - __skb_unlink(list_skb, list); - kfree_skb(list_skb); - if (sk->sk_state == IUCV_CONNECTED) { - sk->sk_state = IUCV_DISCONN; - sk->sk_state_change(sk); - } - break; - } - break; + switch (n) { + case TX_NOTIFY_OK: + atomic_dec(&iucv->skbs_in_xmit); + iucv_sock_wake_msglim(sk); + break; + case TX_NOTIFY_PENDING: + atomic_inc(&iucv->pendings); + break; + case TX_NOTIFY_DELAYED_OK: + atomic_dec(&iucv->skbs_in_xmit); + if (atomic_dec_return(&iucv->pendings) <= 0) + iucv_sock_wake_msglim(sk); + break; + default: + atomic_dec(&iucv->skbs_in_xmit); + if (sk->sk_state == IUCV_CONNECTED) { + sk->sk_state = IUCV_DISCONN; + sk->sk_state_change(sk); } } - spin_unlock_irqrestore(&list->lock, flags); if (sk->sk_state == IUCV_CLOSING) { if (atomic_read(&iucv->skbs_in_xmit) == 0) { @@ -2206,7 +2169,6 @@ static void afiucv_hs_callback_txnotify(struct sk_buff *skb, sk->sk_state_change(sk); } } - } /* -- cgit v1.2.3 From b9bae61be46645aa3b8b5d79d5cdfabe55ae1507 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 28 Jan 2021 13:49:15 +0100 Subject: nexthop: Introduce to struct nh_grp_entry a per-type union The values that a next-hop group needs to keep track of depend on the group type. Introduce a union to separate fields specific to the mpath groups from fields specific to other group types. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/nexthop.h | 7 ++++++- net/ipv4/nexthop.c | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 226930d66b63..d0e245b0635d 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -66,7 +66,12 @@ struct nh_info { struct nh_grp_entry { struct nexthop *nh; u8 weight; - atomic_t upper_bound; + + union { + struct { + atomic_t upper_bound; + } mpath; + }; struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 43bb5f451343..7a30df5aea75 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -689,7 +689,7 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->upper_bound)) + if (hash > atomic_read(&nhge->mpath.upper_bound)) continue; nhi = rcu_dereference(nhge->nh->nh_info); @@ -924,7 +924,7 @@ static void nh_group_rebalance(struct nh_group *nhg) w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->upper_bound, upper_bound); + atomic_set(&nhge->mpath.upper_bound, upper_bound); } } -- cgit v1.2.3 From 09ad6becf5355fe0645f500f518fbbd531715722 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 28 Jan 2021 13:49:17 +0100 Subject: nexthop: Use enum to encode notification type Currently there are only two types of in-kernel nexthop notification. The two are distinguished by the 'is_grp' boolean field in 'struct nh_notifier_info'. As more notification types are introduced for more next-hop group types, a boolean is not an easily extensible interface. Instead, convert it to an enum. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 54 ++++++++++++++++------ drivers/net/netdevsim/fib.c | 23 +++++---- include/net/nexthop.h | 7 ++- net/ipv4/nexthop.c | 14 +++--- 4 files changed, 69 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 41424ee909a0..0ac7014703aa 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4309,11 +4309,18 @@ static int mlxsw_sp_nexthop_obj_validate(struct mlxsw_sp *mlxsw_sp, if (event != NEXTHOP_EVENT_REPLACE) return 0; - if (!info->is_grp) + switch (info->type) { + case NH_NOTIFIER_INFO_TYPE_SINGLE: return mlxsw_sp_nexthop_obj_single_validate(mlxsw_sp, info->nh, info->extack); - return mlxsw_sp_nexthop_obj_group_validate(mlxsw_sp, info->nh_grp, - info->extack); + case NH_NOTIFIER_INFO_TYPE_GRP: + return mlxsw_sp_nexthop_obj_group_validate(mlxsw_sp, + info->nh_grp, + info->extack); + default: + NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type"); + return -EOPNOTSUPP; + } } static bool mlxsw_sp_nexthop_obj_is_gateway(struct mlxsw_sp *mlxsw_sp, @@ -4321,13 +4328,17 @@ static bool mlxsw_sp_nexthop_obj_is_gateway(struct mlxsw_sp *mlxsw_sp, { const struct net_device *dev; - if (info->is_grp) + switch (info->type) { + case NH_NOTIFIER_INFO_TYPE_SINGLE: + dev = info->nh->dev; + return info->nh->gw_family || info->nh->is_reject || + mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL); + case NH_NOTIFIER_INFO_TYPE_GRP: /* Already validated earlier. */ return true; - - dev = info->nh->dev; - return info->nh->gw_family || info->nh->is_reject || - mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL); + default: + return false; + } } static void mlxsw_sp_nexthop_obj_blackhole_init(struct mlxsw_sp *mlxsw_sp, @@ -4410,11 +4421,22 @@ mlxsw_sp_nexthop_obj_group_info_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp, struct nh_notifier_info *info) { - unsigned int nhs = info->is_grp ? info->nh_grp->num_nh : 1; struct mlxsw_sp_nexthop_group_info *nhgi; struct mlxsw_sp_nexthop *nh; + unsigned int nhs; int err, i; + switch (info->type) { + case NH_NOTIFIER_INFO_TYPE_SINGLE: + nhs = 1; + break; + case NH_NOTIFIER_INFO_TYPE_GRP: + nhs = info->nh_grp->num_nh; + break; + default: + return -EINVAL; + } + nhgi = kzalloc(struct_size(nhgi, nexthops, nhs), GFP_KERNEL); if (!nhgi) return -ENOMEM; @@ -4427,12 +4449,18 @@ mlxsw_sp_nexthop_obj_group_info_init(struct mlxsw_sp *mlxsw_sp, int weight; nh = &nhgi->nexthops[i]; - if (info->is_grp) { - nh_obj = &info->nh_grp->nh_entries[i].nh; - weight = info->nh_grp->nh_entries[i].weight; - } else { + switch (info->type) { + case NH_NOTIFIER_INFO_TYPE_SINGLE: nh_obj = info->nh; weight = 1; + break; + case NH_NOTIFIER_INFO_TYPE_GRP: + nh_obj = &info->nh_grp->nh_entries[i].nh; + weight = info->nh_grp->nh_entries[i].weight; + break; + default: + err = -EINVAL; + goto err_nexthop_obj_init; } err = mlxsw_sp_nexthop_obj_init(mlxsw_sp, nh_grp, nh, nh_obj, weight); diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 45d8a7790bd5..f140bbca98c5 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -860,7 +860,7 @@ static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data, nexthop = kzalloc(sizeof(*nexthop), GFP_KERNEL); if (!nexthop) - return NULL; + return ERR_PTR(-ENOMEM); nexthop->id = info->id; @@ -868,15 +868,20 @@ static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data, * occupy. */ - if (!info->is_grp) { + switch (info->type) { + case NH_NOTIFIER_INFO_TYPE_SINGLE: occ = 1; - goto out; + break; + case NH_NOTIFIER_INFO_TYPE_GRP: + for (i = 0; i < info->nh_grp->num_nh; i++) + occ += info->nh_grp->nh_entries[i].weight; + break; + default: + NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type"); + kfree(nexthop); + return ERR_PTR(-EOPNOTSUPP); } - for (i = 0; i < info->nh_grp->num_nh; i++) - occ += info->nh_grp->nh_entries[i].weight; - -out: nexthop->occ = occ; return nexthop; } @@ -972,8 +977,8 @@ static int nsim_nexthop_insert(struct nsim_fib_data *data, int err; nexthop = nsim_nexthop_create(data, info); - if (!nexthop) - return -ENOMEM; + if (IS_ERR(nexthop)) + return PTR_ERR(nexthop); nexthop_old = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d0e245b0635d..7bc057aee40b 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -114,6 +114,11 @@ enum nexthop_event_type { NEXTHOP_EVENT_REPLACE, }; +enum nh_notifier_info_type { + NH_NOTIFIER_INFO_TYPE_SINGLE, + NH_NOTIFIER_INFO_TYPE_GRP, +}; + struct nh_notifier_single_info { struct net_device *dev; u8 gw_family; @@ -142,7 +147,7 @@ struct nh_notifier_info { struct net *net; struct netlink_ext_ack *extack; u32 id; - bool is_grp; + enum nh_notifier_info_type type; union { struct nh_notifier_single_info *nh; struct nh_notifier_grp_info *nh_grp; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index c09b8231f56a..12f812b9538d 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -71,6 +71,7 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { + info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; @@ -92,6 +93,7 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info, u16 num_nh = nhg->num_nh; int i; + info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) @@ -121,17 +123,17 @@ static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; - info->is_grp = nh->is_group; - if (info->is_grp) + if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } -static void nh_notifier_info_fini(struct nh_notifier_info *info) +static void nh_notifier_info_fini(struct nh_notifier_info *info, + const struct nexthop *nh) { - if (info->is_grp) + if (nh->is_group) nh_notifier_grp_info_fini(info); else nh_notifier_single_info_fini(info); @@ -161,7 +163,7 @@ static int call_nexthop_notifiers(struct net *net, err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); - nh_notifier_info_fini(&info); + nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } @@ -182,7 +184,7 @@ static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, return err; err = nb->notifier_call(nb, event_type, &info); - nh_notifier_info_fini(&info); + nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } -- cgit v1.2.3 From 1d3f9bb1be8556bce237934ae82b3cdeda3242fd Mon Sep 17 00:00:00 2001 From: dingsenjie Date: Wed, 27 Jan 2021 10:28:01 +0800 Subject: linux/qed: fix spelling typo in qed_chain.h allocted -> allocated Signed-off-by: dingsenjie Link: https://lore.kernel.org/r/20210127022801.8028-1-dingsenjie@163.com Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_chain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 4d58dc8943f0..e339b48de32d 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -470,7 +470,7 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain) /** * @brief qed_chain_reset - Resets the chain to its start state * - * @param p_chain pointer to a previously allocted chain + * @param p_chain pointer to a previously allocated chain */ static inline void qed_chain_reset(struct qed_chain *p_chain) { -- cgit v1.2.3 From 219991e6be7f4a31d471611e265b72f75b2d0538 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 28 Jan 2021 17:33:12 +0100 Subject: Bluetooth: Add new HCI_QUIRK_NO_SUSPEND_NOTIFIER quirk Some devices, e.g. the RTL8723BS bluetooth part, some USB attached devices, completely drop from the bus on a system-suspend. These devices will have their driver unbound and rebound on resume (when the dropping of the bus gets detected) and will show up as a new HCI after resume. These devices do not benefit from the suspend / resume handling work done by the hci_suspend_notifier. At best this unnecessarily adds some time to the suspend/resume time. But this may also actually cause problems, if the code doing the driver unbinding runs after the pm-notifier then the hci_suspend_notifier code will try to talk to a device which is now in an uninitialized state. This commit adds a new HCI_QUIRK_NO_SUSPEND_NOTIFIER quirk which allows drivers to opt-out of the hci_suspend_notifier when they know beforehand that their device will be fully re-initialized / reprobed on resume. Signed-off-by: Hans de Goede Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ net/bluetooth/hci_core.c | 18 +++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index c1504aa3d9cf..ba2f439bc04d 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -238,6 +238,14 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, + + /* + * When this quirk is set, then the hci_suspend_notifier is not + * registered. This is intended for devices which drop completely + * from the bus on system-suspend and which will show up as a new + * HCI after resume. + */ + HCI_QUIRK_NO_SUSPEND_NOTIFIER, }; /* HCI device flags */ diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9ac258c4dab7..b0d9c36acc03 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3940,10 +3940,12 @@ int hci_register_dev(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); - hdev->suspend_notifier.notifier_call = hci_suspend_notifier; - error = register_pm_notifier(&hdev->suspend_notifier); - if (error) - goto err_wqueue; + if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + hdev->suspend_notifier.notifier_call = hci_suspend_notifier; + error = register_pm_notifier(&hdev->suspend_notifier); + if (error) + goto err_wqueue; + } queue_work(hdev->req_workqueue, &hdev->power_on); @@ -3978,9 +3980,11 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->power_on); - hci_suspend_clear_tasks(hdev); - unregister_pm_notifier(&hdev->suspend_notifier); - cancel_work_sync(&hdev->suspend_prepare); + if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + hci_suspend_clear_tasks(hdev); + unregister_pm_notifier(&hdev->suspend_notifier); + cancel_work_sync(&hdev->suspend_prepare); + } hci_dev_do_close(hdev); -- cgit v1.2.3 From 8c85d18ce647ac2517a1a1bb01b02648e23700e6 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 27 Jan 2021 16:32:45 +0200 Subject: net/sched: cls_flower: Add match on the ct_state reply flag Add match on the ct_state reply flag. Example: $ tc filter add dev ens1f0_0 ingress prio 1 chain 1 proto ip flower \ ct_state +trk+est+rpl \ action mirred egress redirect dev ens1f0_1 $ tc filter add dev ens1f0_1 ingress prio 1 chain 1 proto ip flower \ ct_state +trk+est-rpl \ action mirred egress redirect dev ens1f0_0 Signed-off-by: Paul Blakey Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_flower.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 709668e264b0..afe6836e44b1 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -592,6 +592,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ + TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ }; enum { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 4a9297a89c77..caf7643e9c83 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -291,9 +291,11 @@ static u16 fl_ct_info_to_flower_map[] = { [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_RELATED, [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED, + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | - TCA_FLOWER_KEY_CT_FLAGS_RELATED, + TCA_FLOWER_KEY_CT_FLAGS_RELATED | + TCA_FLOWER_KEY_CT_FLAGS_REPLY, [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_NEW, }; -- cgit v1.2.3 From 941eff5aea5d4371fb8a496a66e29aa8fc7a0c23 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 27 Jan 2021 16:32:46 +0200 Subject: net: flow_offload: Add original direction flag to ct_metadata Give offloading drivers the direction of the offloaded ct flow, this will be used for matches on direction (ct_state +/-rpl). Signed-off-by: Paul Blakey Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/flow_offload.h | 1 + net/sched/act_ct.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 123b1e9ea304..e6bd8ebf9ac3 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -245,6 +245,7 @@ struct flow_action_entry { unsigned long cookie; u32 mark; u32 labels[4]; + bool orig_dir; } ct_metadata; struct { /* FLOW_ACTION_MPLS_PUSH */ u32 label; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index b3442078aabc..f0a0aa125b00 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -183,6 +183,7 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, IP_CT_ESTABLISHED_REPLY; /* aligns with the CT reference on the SKB nf_ct_set */ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; + entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; act_ct_labels = entry->ct_metadata.labels; ct_labels = nf_ct_labels_find(ct); -- cgit v1.2.3 From efa1a65c7e1946ff174c56d36bf015ff9e11c4a1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Jan 2021 17:18:32 +0800 Subject: ip_gre: add csum offload support for gre header This patch is to add csum offload support for gre header: On the TX path in gre_build_header(), when CHECKSUM_PARTIAL's set for inner proto, it will calculate the csum for outer proto, and inner csum will be offloaded later. Otherwise, CHECKSUM_PARTIAL and csum_start/offset will be set for outer proto, and the outer csum will be offloaded later. On the GSO path in gre_gso_segment(), when CHECKSUM_PARTIAL is not set for inner proto and the hardware supports csum offload, CHECKSUM_PARTIAL and csum_start/offset will be set for outer proto, and outer csum will be offloaded later. Otherwise, it will do csum for outer proto by calling gso_make_checksum(). Note that SCTP has to do the csum by itself for non GSO path in sctp_packet_pack(), as gre_build_header() can't handle the csum with CHECKSUM_PARTIAL set for SCTP CRC csum offload. v1->v2: - remove the SCTP part, as GRE dev doesn't support SCTP CRC CSUM and it will always do checksum for SCTP in sctp_packet_pack() when it's not a GSO packet. Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- include/net/gre.h | 19 +++++++------------ net/ipv4/gre_offload.c | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/gre.h b/include/net/gre.h index b60f212c16c6..4e209708b754 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -106,17 +106,6 @@ static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) return flags; } -static inline __sum16 gre_checksum(struct sk_buff *skb) -{ - __wsum csum; - - if (skb->ip_summed == CHECKSUM_PARTIAL) - csum = lco_csum(skb); - else - csum = skb_checksum(skb, 0, skb->len, 0); - return csum_fold(csum); -} - static inline void gre_build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) @@ -146,7 +135,13 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len, !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = gre_checksum(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + *(__sum16 *)ptr = csum_fold(lco_csum(skb)); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = sizeof(*greh); + } } } } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 10bc49bde9a1..1121a9d5fed9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,10 +15,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool need_csum, offload_csum, gso_partial, need_ipsec; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; - bool need_csum, gso_partial; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; @@ -47,6 +47,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, if (need_csum) features &= ~NETIF_F_SCTP_CRC; + need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && !need_ipsec && + (skb->dev->features & NETIF_F_HW_CSUM)); + /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { @@ -100,7 +105,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } *(pcsum + 1) = 0; - *pcsum = gso_make_checksum(skb, 0); + if (skb->encapsulation || !offload_csum) { + *pcsum = gso_make_checksum(skb, 0); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = sizeof(*greh); + } } while ((skb = skb->next)); out: return segs; -- cgit v1.2.3 From 9c7caf28068421c9e0d1faea437e35e6b8983ac6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 02:59:59 +0200 Subject: net: dsa: tag_8021q: add helpers to deduce whether a VLAN ID is RX or TX VLAN The sja1105 implementation can be blind about this, but the felix driver doesn't do exactly what it's being told, so it needs to know whether it is a TX or an RX VLAN, so it can install the appropriate type of TCAM rule. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 14 ++++++++++++++ net/dsa/tag_8021q.c | 15 +++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 88cd72dfa4e0..b12b05f1c8b4 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -64,6 +64,10 @@ int dsa_8021q_rx_source_port(u16 vid); u16 dsa_8021q_rx_subvlan(u16 vid); +bool vid_is_dsa_8021q_rxvlan(u16 vid); + +bool vid_is_dsa_8021q_txvlan(u16 vid); + bool vid_is_dsa_8021q(u16 vid); #else @@ -123,6 +127,16 @@ u16 dsa_8021q_rx_subvlan(u16 vid) return 0; } +bool vid_is_dsa_8021q_rxvlan(u16 vid) +{ + return false; +} + +bool vid_is_dsa_8021q_txvlan(u16 vid) +{ + return false; +} + bool vid_is_dsa_8021q(u16 vid) { return false; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 8e3e8a5b8559..008c1ec6e20c 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -133,10 +133,21 @@ u16 dsa_8021q_rx_subvlan(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); +bool vid_is_dsa_8021q_rxvlan(u16 vid) +{ + return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_rxvlan); + +bool vid_is_dsa_8021q_txvlan(u16 vid) +{ + return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX; +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_txvlan); + bool vid_is_dsa_8021q(u16 vid) { - return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || - (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX); + return vid_is_dsa_8021q_rxvlan(vid) || vid_is_dsa_8021q_txvlan(vid); } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -- cgit v1.2.3 From 0e9bb4e9d93f2711897b8e2a613899f7b8a15a3b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:00 +0200 Subject: net: mscc: ocelot: export VCAP structures to include/soc/mscc The Felix driver will need to preinstall some VCAP filters for its tag_8021q implementation (outside of the tc-flower offload logic), so these need to be exported to the common includes. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_net.c | 1 + drivers/net/ethernet/mscc/ocelot_vcap.c | 2 + drivers/net/ethernet/mscc/ocelot_vcap.h | 293 +------------------------------- include/soc/mscc/ocelot_vcap.h | 289 +++++++++++++++++++++++++++++++ 4 files changed, 294 insertions(+), 291 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 9553eb3e441c..05142803a463 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -9,6 +9,7 @@ */ #include +#include #include "ocelot.h" #include "ocelot_vcap.h" diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index d8c778ee6f1b..489bf16362a7 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -1150,6 +1150,7 @@ int ocelot_vcap_filter_add(struct ocelot *ocelot, vcap_entry_set(ocelot, index, filter); return 0; } +EXPORT_SYMBOL(ocelot_vcap_filter_add); static void ocelot_vcap_block_remove_filter(struct ocelot *ocelot, struct ocelot_vcap_block *block, @@ -1204,6 +1205,7 @@ int ocelot_vcap_filter_del(struct ocelot *ocelot, return 0; } +EXPORT_SYMBOL(ocelot_vcap_filter_del); int ocelot_vcap_filter_stats_update(struct ocelot *ocelot, struct ocelot_vcap_filter *filter) diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.h b/drivers/net/ethernet/mscc/ocelot_vcap.h index 82fd10581a14..cfc8b976d1de 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.h +++ b/drivers/net/ethernet/mscc/ocelot_vcap.h @@ -7,300 +7,11 @@ #define _MSCC_OCELOT_VCAP_H_ #include "ocelot.h" -#include "ocelot_police.h" -#include -#include +#include +#include #define OCELOT_POLICER_DISCARD 0x17f -struct ocelot_ipv4 { - u8 addr[4]; -}; - -enum ocelot_vcap_bit { - OCELOT_VCAP_BIT_ANY, - OCELOT_VCAP_BIT_0, - OCELOT_VCAP_BIT_1 -}; - -struct ocelot_vcap_u8 { - u8 value[1]; - u8 mask[1]; -}; - -struct ocelot_vcap_u16 { - u8 value[2]; - u8 mask[2]; -}; - -struct ocelot_vcap_u24 { - u8 value[3]; - u8 mask[3]; -}; - -struct ocelot_vcap_u32 { - u8 value[4]; - u8 mask[4]; -}; - -struct ocelot_vcap_u40 { - u8 value[5]; - u8 mask[5]; -}; - -struct ocelot_vcap_u48 { - u8 value[6]; - u8 mask[6]; -}; - -struct ocelot_vcap_u64 { - u8 value[8]; - u8 mask[8]; -}; - -struct ocelot_vcap_u128 { - u8 value[16]; - u8 mask[16]; -}; - -struct ocelot_vcap_vid { - u16 value; - u16 mask; -}; - -struct ocelot_vcap_ipv4 { - struct ocelot_ipv4 value; - struct ocelot_ipv4 mask; -}; - -struct ocelot_vcap_udp_tcp { - u16 value; - u16 mask; -}; - -struct ocelot_vcap_port { - u8 value; - u8 mask; -}; - -enum ocelot_vcap_key_type { - OCELOT_VCAP_KEY_ANY, - OCELOT_VCAP_KEY_ETYPE, - OCELOT_VCAP_KEY_LLC, - OCELOT_VCAP_KEY_SNAP, - OCELOT_VCAP_KEY_ARP, - OCELOT_VCAP_KEY_IPV4, - OCELOT_VCAP_KEY_IPV6 -}; - -struct ocelot_vcap_key_vlan { - struct ocelot_vcap_vid vid; /* VLAN ID (12 bit) */ - struct ocelot_vcap_u8 pcp; /* PCP (3 bit) */ - enum ocelot_vcap_bit dei; /* DEI */ - enum ocelot_vcap_bit tagged; /* Tagged/untagged frame */ -}; - -struct ocelot_vcap_key_etype { - struct ocelot_vcap_u48 dmac; - struct ocelot_vcap_u48 smac; - struct ocelot_vcap_u16 etype; - struct ocelot_vcap_u16 data; /* MAC data */ -}; - -struct ocelot_vcap_key_llc { - struct ocelot_vcap_u48 dmac; - struct ocelot_vcap_u48 smac; - - /* LLC header: DSAP at byte 0, SSAP at byte 1, Control at byte 2 */ - struct ocelot_vcap_u32 llc; -}; - -struct ocelot_vcap_key_snap { - struct ocelot_vcap_u48 dmac; - struct ocelot_vcap_u48 smac; - - /* SNAP header: Organization Code at byte 0, Type at byte 3 */ - struct ocelot_vcap_u40 snap; -}; - -struct ocelot_vcap_key_arp { - struct ocelot_vcap_u48 smac; - enum ocelot_vcap_bit arp; /* Opcode ARP/RARP */ - enum ocelot_vcap_bit req; /* Opcode request/reply */ - enum ocelot_vcap_bit unknown; /* Opcode unknown */ - enum ocelot_vcap_bit smac_match; /* Sender MAC matches SMAC */ - enum ocelot_vcap_bit dmac_match; /* Target MAC matches DMAC */ - - /**< Protocol addr. length 4, hardware length 6 */ - enum ocelot_vcap_bit length; - - enum ocelot_vcap_bit ip; /* Protocol address type IP */ - enum ocelot_vcap_bit ethernet; /* Hardware address type Ethernet */ - struct ocelot_vcap_ipv4 sip; /* Sender IP address */ - struct ocelot_vcap_ipv4 dip; /* Target IP address */ -}; - -struct ocelot_vcap_key_ipv4 { - enum ocelot_vcap_bit ttl; /* TTL zero */ - enum ocelot_vcap_bit fragment; /* Fragment */ - enum ocelot_vcap_bit options; /* Header options */ - struct ocelot_vcap_u8 ds; - struct ocelot_vcap_u8 proto; /* Protocol */ - struct ocelot_vcap_ipv4 sip; /* Source IP address */ - struct ocelot_vcap_ipv4 dip; /* Destination IP address */ - struct ocelot_vcap_u48 data; /* Not UDP/TCP: IP data */ - struct ocelot_vcap_udp_tcp sport; /* UDP/TCP: Source port */ - struct ocelot_vcap_udp_tcp dport; /* UDP/TCP: Destination port */ - enum ocelot_vcap_bit tcp_fin; - enum ocelot_vcap_bit tcp_syn; - enum ocelot_vcap_bit tcp_rst; - enum ocelot_vcap_bit tcp_psh; - enum ocelot_vcap_bit tcp_ack; - enum ocelot_vcap_bit tcp_urg; - enum ocelot_vcap_bit sip_eq_dip; /* SIP equals DIP */ - enum ocelot_vcap_bit sport_eq_dport; /* SPORT equals DPORT */ - enum ocelot_vcap_bit seq_zero; /* TCP sequence number is zero */ -}; - -struct ocelot_vcap_key_ipv6 { - struct ocelot_vcap_u8 proto; /* IPv6 protocol */ - struct ocelot_vcap_u128 sip; /* IPv6 source (byte 0-7 ignored) */ - struct ocelot_vcap_u128 dip; /* IPv6 destination (byte 0-7 ignored) */ - enum ocelot_vcap_bit ttl; /* TTL zero */ - struct ocelot_vcap_u8 ds; - struct ocelot_vcap_u48 data; /* Not UDP/TCP: IP data */ - struct ocelot_vcap_udp_tcp sport; - struct ocelot_vcap_udp_tcp dport; - enum ocelot_vcap_bit tcp_fin; - enum ocelot_vcap_bit tcp_syn; - enum ocelot_vcap_bit tcp_rst; - enum ocelot_vcap_bit tcp_psh; - enum ocelot_vcap_bit tcp_ack; - enum ocelot_vcap_bit tcp_urg; - enum ocelot_vcap_bit sip_eq_dip; /* SIP equals DIP */ - enum ocelot_vcap_bit sport_eq_dport; /* SPORT equals DPORT */ - enum ocelot_vcap_bit seq_zero; /* TCP sequence number is zero */ -}; - -enum ocelot_mask_mode { - OCELOT_MASK_MODE_NONE, - OCELOT_MASK_MODE_PERMIT_DENY, - OCELOT_MASK_MODE_POLICY, - OCELOT_MASK_MODE_REDIRECT, -}; - -enum ocelot_es0_tag { - OCELOT_NO_ES0_TAG, - OCELOT_ES0_TAG, - OCELOT_FORCE_PORT_TAG, - OCELOT_FORCE_UNTAG, -}; - -enum ocelot_tag_tpid_sel { - OCELOT_TAG_TPID_SEL_8021Q, - OCELOT_TAG_TPID_SEL_8021AD, -}; - -struct ocelot_vcap_action { - union { - /* VCAP ES0 */ - struct { - enum ocelot_es0_tag push_outer_tag; - enum ocelot_es0_tag push_inner_tag; - enum ocelot_tag_tpid_sel tag_a_tpid_sel; - int tag_a_vid_sel; - int tag_a_pcp_sel; - u16 vid_a_val; - u8 pcp_a_val; - u8 dei_a_val; - enum ocelot_tag_tpid_sel tag_b_tpid_sel; - int tag_b_vid_sel; - int tag_b_pcp_sel; - u16 vid_b_val; - u8 pcp_b_val; - u8 dei_b_val; - }; - - /* VCAP IS1 */ - struct { - bool vid_replace_ena; - u16 vid; - bool vlan_pop_cnt_ena; - int vlan_pop_cnt; - bool pcp_dei_ena; - u8 pcp; - u8 dei; - bool qos_ena; - u8 qos_val; - u8 pag_override_mask; - u8 pag_val; - }; - - /* VCAP IS2 */ - struct { - bool cpu_copy_ena; - u8 cpu_qu_num; - enum ocelot_mask_mode mask_mode; - unsigned long port_mask; - bool police_ena; - struct ocelot_policer pol; - u32 pol_ix; - }; - }; -}; - -struct ocelot_vcap_stats { - u64 bytes; - u64 pkts; - u64 used; -}; - -enum ocelot_vcap_filter_type { - OCELOT_VCAP_FILTER_DUMMY, - OCELOT_VCAP_FILTER_PAG, - OCELOT_VCAP_FILTER_OFFLOAD, -}; - -struct ocelot_vcap_filter { - struct list_head list; - - enum ocelot_vcap_filter_type type; - int block_id; - int goto_target; - int lookup; - u8 pag; - u16 prio; - u32 id; - - struct ocelot_vcap_action action; - struct ocelot_vcap_stats stats; - /* For VCAP IS1 and IS2 */ - unsigned long ingress_port_mask; - /* For VCAP ES0 */ - struct ocelot_vcap_port ingress_port; - struct ocelot_vcap_port egress_port; - - enum ocelot_vcap_bit dmac_mc; - enum ocelot_vcap_bit dmac_bc; - struct ocelot_vcap_key_vlan vlan; - - enum ocelot_vcap_key_type key_type; - union { - /* OCELOT_VCAP_KEY_ANY: No specific fields */ - struct ocelot_vcap_key_etype etype; - struct ocelot_vcap_key_llc llc; - struct ocelot_vcap_key_snap snap; - struct ocelot_vcap_key_arp arp; - struct ocelot_vcap_key_ipv4 ipv4; - struct ocelot_vcap_key_ipv6 ipv6; - } key; -}; - -int ocelot_vcap_filter_add(struct ocelot *ocelot, - struct ocelot_vcap_filter *rule, - struct netlink_ext_ack *extack); -int ocelot_vcap_filter_del(struct ocelot *ocelot, - struct ocelot_vcap_filter *rule); int ocelot_vcap_filter_stats_update(struct ocelot *ocelot, struct ocelot_vcap_filter *rule); struct ocelot_vcap_filter * diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 96300adf3648..7f1b82fba63c 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -400,4 +400,293 @@ enum vcap_es0_action_field { VCAP_ES0_ACT_HIT_STICKY, }; +struct ocelot_ipv4 { + u8 addr[4]; +}; + +enum ocelot_vcap_bit { + OCELOT_VCAP_BIT_ANY, + OCELOT_VCAP_BIT_0, + OCELOT_VCAP_BIT_1 +}; + +struct ocelot_vcap_u8 { + u8 value[1]; + u8 mask[1]; +}; + +struct ocelot_vcap_u16 { + u8 value[2]; + u8 mask[2]; +}; + +struct ocelot_vcap_u24 { + u8 value[3]; + u8 mask[3]; +}; + +struct ocelot_vcap_u32 { + u8 value[4]; + u8 mask[4]; +}; + +struct ocelot_vcap_u40 { + u8 value[5]; + u8 mask[5]; +}; + +struct ocelot_vcap_u48 { + u8 value[6]; + u8 mask[6]; +}; + +struct ocelot_vcap_u64 { + u8 value[8]; + u8 mask[8]; +}; + +struct ocelot_vcap_u128 { + u8 value[16]; + u8 mask[16]; +}; + +struct ocelot_vcap_vid { + u16 value; + u16 mask; +}; + +struct ocelot_vcap_ipv4 { + struct ocelot_ipv4 value; + struct ocelot_ipv4 mask; +}; + +struct ocelot_vcap_udp_tcp { + u16 value; + u16 mask; +}; + +struct ocelot_vcap_port { + u8 value; + u8 mask; +}; + +enum ocelot_vcap_key_type { + OCELOT_VCAP_KEY_ANY, + OCELOT_VCAP_KEY_ETYPE, + OCELOT_VCAP_KEY_LLC, + OCELOT_VCAP_KEY_SNAP, + OCELOT_VCAP_KEY_ARP, + OCELOT_VCAP_KEY_IPV4, + OCELOT_VCAP_KEY_IPV6 +}; + +struct ocelot_vcap_key_vlan { + struct ocelot_vcap_vid vid; /* VLAN ID (12 bit) */ + struct ocelot_vcap_u8 pcp; /* PCP (3 bit) */ + enum ocelot_vcap_bit dei; /* DEI */ + enum ocelot_vcap_bit tagged; /* Tagged/untagged frame */ +}; + +struct ocelot_vcap_key_etype { + struct ocelot_vcap_u48 dmac; + struct ocelot_vcap_u48 smac; + struct ocelot_vcap_u16 etype; + struct ocelot_vcap_u16 data; /* MAC data */ +}; + +struct ocelot_vcap_key_llc { + struct ocelot_vcap_u48 dmac; + struct ocelot_vcap_u48 smac; + + /* LLC header: DSAP at byte 0, SSAP at byte 1, Control at byte 2 */ + struct ocelot_vcap_u32 llc; +}; + +struct ocelot_vcap_key_snap { + struct ocelot_vcap_u48 dmac; + struct ocelot_vcap_u48 smac; + + /* SNAP header: Organization Code at byte 0, Type at byte 3 */ + struct ocelot_vcap_u40 snap; +}; + +struct ocelot_vcap_key_arp { + struct ocelot_vcap_u48 smac; + enum ocelot_vcap_bit arp; /* Opcode ARP/RARP */ + enum ocelot_vcap_bit req; /* Opcode request/reply */ + enum ocelot_vcap_bit unknown; /* Opcode unknown */ + enum ocelot_vcap_bit smac_match; /* Sender MAC matches SMAC */ + enum ocelot_vcap_bit dmac_match; /* Target MAC matches DMAC */ + + /**< Protocol addr. length 4, hardware length 6 */ + enum ocelot_vcap_bit length; + + enum ocelot_vcap_bit ip; /* Protocol address type IP */ + enum ocelot_vcap_bit ethernet; /* Hardware address type Ethernet */ + struct ocelot_vcap_ipv4 sip; /* Sender IP address */ + struct ocelot_vcap_ipv4 dip; /* Target IP address */ +}; + +struct ocelot_vcap_key_ipv4 { + enum ocelot_vcap_bit ttl; /* TTL zero */ + enum ocelot_vcap_bit fragment; /* Fragment */ + enum ocelot_vcap_bit options; /* Header options */ + struct ocelot_vcap_u8 ds; + struct ocelot_vcap_u8 proto; /* Protocol */ + struct ocelot_vcap_ipv4 sip; /* Source IP address */ + struct ocelot_vcap_ipv4 dip; /* Destination IP address */ + struct ocelot_vcap_u48 data; /* Not UDP/TCP: IP data */ + struct ocelot_vcap_udp_tcp sport; /* UDP/TCP: Source port */ + struct ocelot_vcap_udp_tcp dport; /* UDP/TCP: Destination port */ + enum ocelot_vcap_bit tcp_fin; + enum ocelot_vcap_bit tcp_syn; + enum ocelot_vcap_bit tcp_rst; + enum ocelot_vcap_bit tcp_psh; + enum ocelot_vcap_bit tcp_ack; + enum ocelot_vcap_bit tcp_urg; + enum ocelot_vcap_bit sip_eq_dip; /* SIP equals DIP */ + enum ocelot_vcap_bit sport_eq_dport; /* SPORT equals DPORT */ + enum ocelot_vcap_bit seq_zero; /* TCP sequence number is zero */ +}; + +struct ocelot_vcap_key_ipv6 { + struct ocelot_vcap_u8 proto; /* IPv6 protocol */ + struct ocelot_vcap_u128 sip; /* IPv6 source (byte 0-7 ignored) */ + struct ocelot_vcap_u128 dip; /* IPv6 destination (byte 0-7 ignored) */ + enum ocelot_vcap_bit ttl; /* TTL zero */ + struct ocelot_vcap_u8 ds; + struct ocelot_vcap_u48 data; /* Not UDP/TCP: IP data */ + struct ocelot_vcap_udp_tcp sport; + struct ocelot_vcap_udp_tcp dport; + enum ocelot_vcap_bit tcp_fin; + enum ocelot_vcap_bit tcp_syn; + enum ocelot_vcap_bit tcp_rst; + enum ocelot_vcap_bit tcp_psh; + enum ocelot_vcap_bit tcp_ack; + enum ocelot_vcap_bit tcp_urg; + enum ocelot_vcap_bit sip_eq_dip; /* SIP equals DIP */ + enum ocelot_vcap_bit sport_eq_dport; /* SPORT equals DPORT */ + enum ocelot_vcap_bit seq_zero; /* TCP sequence number is zero */ +}; + +enum ocelot_mask_mode { + OCELOT_MASK_MODE_NONE, + OCELOT_MASK_MODE_PERMIT_DENY, + OCELOT_MASK_MODE_POLICY, + OCELOT_MASK_MODE_REDIRECT, +}; + +enum ocelot_es0_tag { + OCELOT_NO_ES0_TAG, + OCELOT_ES0_TAG, + OCELOT_FORCE_PORT_TAG, + OCELOT_FORCE_UNTAG, +}; + +enum ocelot_tag_tpid_sel { + OCELOT_TAG_TPID_SEL_8021Q, + OCELOT_TAG_TPID_SEL_8021AD, +}; + +struct ocelot_vcap_action { + union { + /* VCAP ES0 */ + struct { + enum ocelot_es0_tag push_outer_tag; + enum ocelot_es0_tag push_inner_tag; + enum ocelot_tag_tpid_sel tag_a_tpid_sel; + int tag_a_vid_sel; + int tag_a_pcp_sel; + u16 vid_a_val; + u8 pcp_a_val; + u8 dei_a_val; + enum ocelot_tag_tpid_sel tag_b_tpid_sel; + int tag_b_vid_sel; + int tag_b_pcp_sel; + u16 vid_b_val; + u8 pcp_b_val; + u8 dei_b_val; + }; + + /* VCAP IS1 */ + struct { + bool vid_replace_ena; + u16 vid; + bool vlan_pop_cnt_ena; + int vlan_pop_cnt; + bool pcp_dei_ena; + u8 pcp; + u8 dei; + bool qos_ena; + u8 qos_val; + u8 pag_override_mask; + u8 pag_val; + }; + + /* VCAP IS2 */ + struct { + bool cpu_copy_ena; + u8 cpu_qu_num; + enum ocelot_mask_mode mask_mode; + unsigned long port_mask; + bool police_ena; + struct ocelot_policer pol; + u32 pol_ix; + }; + }; +}; + +struct ocelot_vcap_stats { + u64 bytes; + u64 pkts; + u64 used; +}; + +enum ocelot_vcap_filter_type { + OCELOT_VCAP_FILTER_DUMMY, + OCELOT_VCAP_FILTER_PAG, + OCELOT_VCAP_FILTER_OFFLOAD, +}; + +struct ocelot_vcap_filter { + struct list_head list; + + enum ocelot_vcap_filter_type type; + int block_id; + int goto_target; + int lookup; + u8 pag; + u16 prio; + u32 id; + + struct ocelot_vcap_action action; + struct ocelot_vcap_stats stats; + /* For VCAP IS1 and IS2 */ + unsigned long ingress_port_mask; + /* For VCAP ES0 */ + struct ocelot_vcap_port ingress_port; + struct ocelot_vcap_port egress_port; + + enum ocelot_vcap_bit dmac_mc; + enum ocelot_vcap_bit dmac_bc; + struct ocelot_vcap_key_vlan vlan; + + enum ocelot_vcap_key_type key_type; + union { + /* OCELOT_VCAP_KEY_ANY: No specific fields */ + struct ocelot_vcap_key_etype etype; + struct ocelot_vcap_key_llc llc; + struct ocelot_vcap_key_snap snap; + struct ocelot_vcap_key_arp arp; + struct ocelot_vcap_key_ipv4 ipv4; + struct ocelot_vcap_key_ipv6 ipv6; + } key; +}; + +int ocelot_vcap_filter_add(struct ocelot *ocelot, + struct ocelot_vcap_filter *rule, + struct netlink_ext_ack *extack); +int ocelot_vcap_filter_del(struct ocelot *ocelot, + struct ocelot_vcap_filter *rule); + #endif /* _OCELOT_VCAP_H_ */ -- cgit v1.2.3 From 50c6cc5b9283efdbf72162dee467bd68c7167807 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:01 +0200 Subject: net: mscc: ocelot: store a namespaced VCAP filter ID We will be adding some private VCAP filters that should not interfere in any way with the filters added using tc-flower. So we need to allocate some IDs which will not be used by tc. Currently ocelot uses an u32 id derived from the flow cookie, which in itself is an unsigned long. This is a problem in itself, since on 64 bit systems, sizeof(unsigned long)=8, so the driver is already truncating these. Create a struct ocelot_vcap_id which contains the full unsigned long cookie from tc, as well as a boolean that is supposed to namespace the filters added by tc with the ones that aren't. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_flower.c | 7 ++++--- drivers/net/ethernet/mscc/ocelot_vcap.c | 16 ++++++++++++---- drivers/net/ethernet/mscc/ocelot_vcap.h | 3 ++- include/soc/mscc/ocelot_vcap.h | 7 ++++++- 4 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 729495a1a77e..c3ac026f6aea 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -622,7 +622,8 @@ static int ocelot_flower_parse(struct ocelot *ocelot, int port, bool ingress, int ret; filter->prio = f->common.prio; - filter->id = f->cookie; + filter->id.cookie = f->cookie; + filter->id.tc_offload = true; ret = ocelot_flower_parse_action(ocelot, port, ingress, f, filter); if (ret) @@ -717,7 +718,7 @@ int ocelot_cls_flower_destroy(struct ocelot *ocelot, int port, block = &ocelot->block[block_id]; - filter = ocelot_vcap_block_find_filter_by_id(block, f->cookie); + filter = ocelot_vcap_block_find_filter_by_id(block, f->cookie, true); if (!filter) return 0; @@ -741,7 +742,7 @@ int ocelot_cls_flower_stats(struct ocelot *ocelot, int port, block = &ocelot->block[block_id]; - filter = ocelot_vcap_block_find_filter_by_id(block, f->cookie); + filter = ocelot_vcap_block_find_filter_by_id(block, f->cookie, true); if (!filter || filter->type == OCELOT_VCAP_FILTER_DUMMY) return 0; diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index 489bf16362a7..b82fd4103a68 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -959,6 +959,12 @@ static void ocelot_vcap_filter_add_to_block(struct ocelot *ocelot, list_add(&filter->list, pos->prev); } +static bool ocelot_vcap_filter_equal(const struct ocelot_vcap_filter *a, + const struct ocelot_vcap_filter *b) +{ + return !memcmp(&a->id, &b->id, sizeof(struct ocelot_vcap_id)); +} + static int ocelot_vcap_block_get_filter_index(struct ocelot_vcap_block *block, struct ocelot_vcap_filter *filter) { @@ -966,7 +972,7 @@ static int ocelot_vcap_block_get_filter_index(struct ocelot_vcap_block *block, int index = 0; list_for_each_entry(tmp, &block->rules, list) { - if (filter->id == tmp->id) + if (ocelot_vcap_filter_equal(filter, tmp)) return index; index++; } @@ -991,12 +997,14 @@ ocelot_vcap_block_find_filter_by_index(struct ocelot_vcap_block *block, } struct ocelot_vcap_filter * -ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id) +ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int cookie, + bool tc_offload) { struct ocelot_vcap_filter *filter; list_for_each_entry(filter, &block->rules, list) - if (filter->id == id) + if (filter->id.tc_offload == tc_offload && + filter->id.cookie == cookie) return filter; return NULL; @@ -1161,7 +1169,7 @@ static void ocelot_vcap_block_remove_filter(struct ocelot *ocelot, list_for_each_safe(pos, q, &block->rules) { tmp = list_entry(pos, struct ocelot_vcap_filter, list); - if (tmp->id == filter->id) { + if (ocelot_vcap_filter_equal(filter, tmp)) { if (tmp->block_id == VCAP_IS2 && tmp->action.police_ena) ocelot_vcap_policer_del(ocelot, block, diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.h b/drivers/net/ethernet/mscc/ocelot_vcap.h index cfc8b976d1de..3b0c7916056e 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.h +++ b/drivers/net/ethernet/mscc/ocelot_vcap.h @@ -15,7 +15,8 @@ int ocelot_vcap_filter_stats_update(struct ocelot *ocelot, struct ocelot_vcap_filter *rule); struct ocelot_vcap_filter * -ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id); +ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id, + bool tc_offload); void ocelot_detect_vcap_constants(struct ocelot *ocelot); int ocelot_vcap_init(struct ocelot *ocelot); diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 7f1b82fba63c..76e01c927e17 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -648,6 +648,11 @@ enum ocelot_vcap_filter_type { OCELOT_VCAP_FILTER_OFFLOAD, }; +struct ocelot_vcap_id { + unsigned long cookie; + bool tc_offload; +}; + struct ocelot_vcap_filter { struct list_head list; @@ -657,7 +662,7 @@ struct ocelot_vcap_filter { int lookup; u8 pag; u16 prio; - u32 id; + struct ocelot_vcap_id id; struct ocelot_vcap_action action; struct ocelot_vcap_stats stats; -- cgit v1.2.3 From cacea62fcdda5656cb5b8104e73a00e043b61730 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:03 +0200 Subject: net: mscc: ocelot: don't use NPI tag prefix for the CPU port module Context: Ocelot switches put the injection/extraction frame header in front of the Ethernet header. When used in NPI mode, a DSA master would see junk instead of the destination MAC address, and it would most likely drop the packets. So the Ocelot frame header can have an optional prefix, which is just "ff:ff:ff:ff:ff:fe > ff:ff:ff:ff:ff:ff" padding put before the actual tag (still before the real Ethernet header) such that the DSA master thinks it's looking at a broadcast frame with a strange EtherType. Unfortunately, a lesson learned in commit 69df578c5f4b ("net: mscc: ocelot: eliminate confusion between CPU and NPI port") seems to have been forgotten in the meanwhile. The CPU port module and the NPI port have independent settings for the length of the tag prefix. However, the driver is using the same variable to program both of them. There is no reason really to use any tag prefix with the CPU port module, since that is not connected to any Ethernet port. So this patch makes the inj_prefix and xtr_prefix variables apply only to the NPI port (which the switchdev ocelot_vsc7514 driver does not use). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 8 ++++---- drivers/net/ethernet/mscc/ocelot.c | 12 ++++++------ drivers/net/ethernet/mscc/ocelot_vsc7514.c | 2 -- include/soc/mscc/ocelot.h | 4 ++-- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 767cbdccdb3e..054e57dd4383 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -425,8 +425,8 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) ocelot->num_mact_rows = felix->info->num_mact_rows; ocelot->vcap = felix->info->vcap; ocelot->ops = felix->info->ops; - ocelot->inj_prefix = OCELOT_TAG_PREFIX_SHORT; - ocelot->xtr_prefix = OCELOT_TAG_PREFIX_SHORT; + ocelot->npi_inj_prefix = OCELOT_TAG_PREFIX_SHORT; + ocelot->npi_xtr_prefix = OCELOT_TAG_PREFIX_SHORT; ocelot->devlink = felix->ds->devlink; port_phy_modes = kcalloc(num_phys_ports, sizeof(phy_interface_t), @@ -541,9 +541,9 @@ static void felix_npi_port_init(struct ocelot *ocelot, int port) /* NPI port Injection/Extraction configuration */ ocelot_fields_write(ocelot, port, SYS_PORT_MODE_INCL_XTR_HDR, - ocelot->xtr_prefix); + ocelot->npi_xtr_prefix); ocelot_fields_write(ocelot, port, SYS_PORT_MODE_INCL_INJ_HDR, - ocelot->inj_prefix); + ocelot->npi_inj_prefix); /* Disable transmission of pause frames */ ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 0); diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 7352f58f9bc2..714165c2f85a 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1359,9 +1359,9 @@ void ocelot_port_set_maxlen(struct ocelot *ocelot, int port, size_t sdu) if (port == ocelot->npi) { maxlen += OCELOT_TAG_LEN; - if (ocelot->inj_prefix == OCELOT_TAG_PREFIX_SHORT) + if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_SHORT) maxlen += OCELOT_SHORT_PREFIX_LEN; - else if (ocelot->inj_prefix == OCELOT_TAG_PREFIX_LONG) + else if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_LONG) maxlen += OCELOT_LONG_PREFIX_LEN; } @@ -1391,9 +1391,9 @@ int ocelot_get_max_mtu(struct ocelot *ocelot, int port) if (port == ocelot->npi) { max_mtu -= OCELOT_TAG_LEN; - if (ocelot->inj_prefix == OCELOT_TAG_PREFIX_SHORT) + if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_SHORT) max_mtu -= OCELOT_SHORT_PREFIX_LEN; - else if (ocelot->inj_prefix == OCELOT_TAG_PREFIX_LONG) + else if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_LONG) max_mtu -= OCELOT_LONG_PREFIX_LEN; } @@ -1478,9 +1478,9 @@ static void ocelot_cpu_port_init(struct ocelot *ocelot) ocelot_fields_write(ocelot, cpu, QSYS_SWITCH_PORT_MODE_PORT_ENA, 1); /* CPU port Injection/Extraction configuration */ ocelot_fields_write(ocelot, cpu, SYS_PORT_MODE_INCL_XTR_HDR, - ocelot->xtr_prefix); + OCELOT_TAG_PREFIX_NONE); ocelot_fields_write(ocelot, cpu, SYS_PORT_MODE_INCL_INJ_HDR, - ocelot->inj_prefix); + OCELOT_TAG_PREFIX_NONE); /* Configure the CPU port to be VLAN aware */ ocelot_write_gix(ocelot, ANA_PORT_VLAN_CFG_VLAN_VID(0) | diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 30a38df08a21..407244fe5b17 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -1347,8 +1347,6 @@ static int mscc_ocelot_probe(struct platform_device *pdev) ocelot->num_flooding_pgids = 1; ocelot->vcap = vsc7514_vcap_props; - ocelot->inj_prefix = OCELOT_TAG_PREFIX_NONE; - ocelot->xtr_prefix = OCELOT_TAG_PREFIX_NONE; ocelot->npi = -1; err = ocelot_init(ocelot); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index cdc33fa05660..93c22627dedd 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -651,8 +651,8 @@ struct ocelot { int npi; - enum ocelot_tag_prefix inj_prefix; - enum ocelot_tag_prefix xtr_prefix; + enum ocelot_tag_prefix npi_inj_prefix; + enum ocelot_tag_prefix npi_xtr_prefix; u32 *lags; -- cgit v1.2.3 From 357f203bb3b529fa7471494c7ad6a7a54d070353 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:05 +0200 Subject: net: dsa: keep a copy of the tagging protocol in the DSA switch tree Cascading DSA switches can be done multiple ways. There is the brute force approach / tag stacking, where one upstream switch, located between leaf switches and the host Ethernet controller, will just happily transport the DSA header of those leaf switches as payload. For this kind of setups, DSA works without any special kind of treatment compared to a single switch - they just aren't aware of each other. Then there's the approach where the upstream switch understands the tags it transports from its leaves below, as it doesn't push a tag of its own, but it routes based on the source port & switch id information present in that tag (as opposed to DMAC & VID) and it strips the tag when egressing a front-facing port. Currently only Marvell implements the latter, and Marvell DSA trees contain only Marvell switches. So it is safe to say that DSA trees already have a single tag protocol shared by all switches, and in fact this is what makes the switches able to understand each other. This fact is also implied by the fact that currently, the tagging protocol is reported as part of a sysfs installed on the DSA master and not per port, so it must be the same for all the ports connected to that DSA master regardless of the switch that they belong to. It's time to make this official and enforce it (yes, this also means we won't have any "switch understands tag to some extent but is not able to speak it" hardware oddities that we'll support in the future). This is needed due to the imminent introduction of the dsa_switch_ops:: change_tag_protocol driver API. When that is introduced, we'll have to notify switches of the tagging protocol that they're configured to use. Currently the tag_ops structure pointer is held only for CPU ports. But there are switches which don't have CPU ports and nonetheless still need to be configured. These would be Marvell leaf switches whose upstream port is just a DSA link. How do we inform these of their tagging protocol setup/deletion? One answer to the above would be: iterate through the DSA switch tree's ports once, list the CPU ports, get their tag_ops, then iterate again now that we have it, and notify everybody of that tag_ops. But what to do if conflicts appear between one cpu_dp->tag_ops and another? There's no escaping the fact that conflict resolution needs to be done, so we can be upfront about it. Ease our work and just keep the master copy of the tag_ops inside the struct dsa_switch_tree. Reference counting is now moved to be per-tree too, instead of per-CPU port. There are many places in the data path that access master->dsa_ptr->tag_ops and we would introduce unnecessary performance penalty going through yet another indirection, so keep those right where they are. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 7 ++++++- net/dsa/dsa2.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2f5435d3d1db..b8af1d6c879a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -140,6 +140,9 @@ struct dsa_switch_tree { /* Has this tree been applied to the hardware? */ bool setup; + /* Tagging protocol operations */ + const struct dsa_device_ops *tag_ops; + /* * Configuration data for the platform device that owns * this dsa switch tree instance. @@ -225,7 +228,9 @@ struct dsa_port { struct net_device *slave; }; - /* CPU port tagging operations used by master or slave devices */ + /* Copy of the tagging protocol operations, for quicker access + * in the data path. Valid only for the CPU ports. + */ const struct dsa_device_ops *tag_ops; /* Copies for faster access in master receive hot path */ diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 2953d0c1c7bc..63035a898ca8 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -179,6 +179,8 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) static void dsa_tree_free(struct dsa_switch_tree *dst) { + if (dst->tag_ops) + dsa_tag_driver_put(dst->tag_ops); list_del(&dst->list); kfree(dst); } @@ -467,7 +469,6 @@ static void dsa_port_teardown(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: dsa_port_disable(dp); - dsa_tag_driver_put(dp->tag_ops); dsa_port_link_unregister_of(dp); break; case DSA_PORT_TYPE_DSA: @@ -1011,24 +1012,35 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; - const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = dsa_get_tag_protocol(dp, master); - tag_ops = dsa_tag_driver_get(tag_protocol); - if (IS_ERR(tag_ops)) { - if (PTR_ERR(tag_ops) == -ENOPROTOOPT) - return -EPROBE_DEFER; - dev_warn(ds->dev, "No tagger for this switch\n"); - dp->master = NULL; - return PTR_ERR(tag_ops); + if (dst->tag_ops) { + if (dst->tag_ops->proto != tag_protocol) { + dev_err(ds->dev, + "A DSA switch tree can have only one tagging protocol\n"); + return -EINVAL; + } + /* In the case of multiple CPU ports per switch, the tagging + * protocol is still reference-counted only per switch tree, so + * nothing to do here. + */ + } else { + dst->tag_ops = dsa_tag_driver_get(tag_protocol); + if (IS_ERR(dst->tag_ops)) { + if (PTR_ERR(dst->tag_ops) == -ENOPROTOOPT) + return -EPROBE_DEFER; + dev_warn(ds->dev, "No tagger for this switch\n"); + dp->master = NULL; + return PTR_ERR(dst->tag_ops); + } } dp->master = master; dp->type = DSA_PORT_TYPE_CPU; - dp->filter = tag_ops->filter; - dp->rcv = tag_ops->rcv; - dp->tag_ops = tag_ops; + dp->filter = dst->tag_ops->filter; + dp->rcv = dst->tag_ops->rcv; + dp->tag_ops = dst->tag_ops; dp->dst = dst; return 0; -- cgit v1.2.3 From 53da0ebaad102626f56495e0967a614f89a2acc8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:06 +0200 Subject: net: dsa: allow changing the tag protocol via the "tagging" device attribute Currently DSA exposes the following sysfs: $ cat /sys/class/net/eno2/dsa/tagging ocelot which is a read-only device attribute, introduced in the kernel as commit 98cdb4807123 ("net: dsa: Expose tagging protocol to user-space"), and used by libpcap since its commit 993db3800d7d ("Add support for DSA link-layer types"). It would be nice if we could extend this device attribute by making it writable: $ echo ocelot-8021q > /sys/class/net/eno2/dsa/tagging This is useful with DSA switches that can make use of more than one tagging protocol. It may be useful in dsa_loop in the future too, to perform offline testing of various taggers, or for changing between dsa and edsa on Marvell switches, if that is desirable. In terms of implementation, drivers can support this feature by implementing .change_tag_protocol, which should always leave the switch in a consistent state: either with the new protocol if things went well, or with the old one if something failed. Teardown of the old protocol, if necessary, must be handled by the driver. Some things remain as before: - The .get_tag_protocol is currently only called at probe time, to load the initial tagging protocol driver. Nonetheless, new drivers should report the tagging protocol in current use now. - The driver should manage by itself the initial setup of tagging protocol, no later than the .setup() method, as well as destroying resources used by the last tagger in use, no earlier than the .teardown() method. For multi-switch DSA trees, error handling is a bit more complicated, since e.g. the 5th out of 7 switches may fail to change the tag protocol. When that happens, a revert to the original tag protocol is attempted, but that may fail too, leaving the tree in an inconsistent state despite each individual switch implementing .change_tag_protocol transactionally. Since the intersection between drivers that implement .change_tag_protocol and drivers that support D in DSA is currently the empty set, the possibility for this error to happen is ignored for now. Testing: $ insmod mscc_felix.ko [ 79.549784] mscc_felix 0000:00:00.5: Adding to iommu group 14 [ 79.565712] mscc_felix 0000:00:00.5: Failed to register DSA switch: -517 $ insmod tag_ocelot.ko $ rmmod mscc_felix.ko $ insmod mscc_felix.ko [ 97.261724] libphy: VSC9959 internal MDIO bus: probed [ 97.267363] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 0 [ 97.274998] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 1 [ 97.282561] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 2 [ 97.289700] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 3 [ 97.599163] mscc_felix 0000:00:00.5 swp0 (uninitialized): PHY [0000:00:00.3:10] driver [Microsemi GE VSC8514 SyncE] (irq=POLL) [ 97.862034] mscc_felix 0000:00:00.5 swp1 (uninitialized): PHY [0000:00:00.3:11] driver [Microsemi GE VSC8514 SyncE] (irq=POLL) [ 97.950731] mscc_felix 0000:00:00.5 swp0: configuring for inband/qsgmii link mode [ 97.964278] 8021q: adding VLAN 0 to HW filter on device swp0 [ 98.146161] mscc_felix 0000:00:00.5 swp2 (uninitialized): PHY [0000:00:00.3:12] driver [Microsemi GE VSC8514 SyncE] (irq=POLL) [ 98.238649] mscc_felix 0000:00:00.5 swp1: configuring for inband/qsgmii link mode [ 98.251845] 8021q: adding VLAN 0 to HW filter on device swp1 [ 98.433916] mscc_felix 0000:00:00.5 swp3 (uninitialized): PHY [0000:00:00.3:13] driver [Microsemi GE VSC8514 SyncE] (irq=POLL) [ 98.485542] mscc_felix 0000:00:00.5: configuring for fixed/internal link mode [ 98.503584] mscc_felix 0000:00:00.5: Link is Up - 2.5Gbps/Full - flow control rx/tx [ 98.527948] device eno2 entered promiscuous mode [ 98.544755] DSA: tree 0 setup $ ping 10.0.0.1 PING 10.0.0.1 (10.0.0.1): 56 data bytes 64 bytes from 10.0.0.1: seq=0 ttl=64 time=2.337 ms 64 bytes from 10.0.0.1: seq=1 ttl=64 time=0.754 ms ^C - 10.0.0.1 ping statistics - 2 packets transmitted, 2 packets received, 0% packet loss round-trip min/avg/max = 0.754/1.545/2.337 ms $ cat /sys/class/net/eno2/dsa/tagging ocelot $ cat ./test_ocelot_8021q.sh #!/bin/bash ip link set swp0 down ip link set swp1 down ip link set swp2 down ip link set swp3 down ip link set swp5 down ip link set eno2 down echo ocelot-8021q > /sys/class/net/eno2/dsa/tagging ip link set eno2 up ip link set swp0 up ip link set swp1 up ip link set swp2 up ip link set swp3 up ip link set swp5 up $ ./test_ocelot_8021q.sh ./test_ocelot_8021q.sh: line 9: echo: write error: Protocol not available $ rmmod tag_ocelot.ko rmmod: can't unload module 'tag_ocelot': Resource temporarily unavailable $ insmod tag_ocelot_8021q.ko $ ./test_ocelot_8021q.sh $ cat /sys/class/net/eno2/dsa/tagging ocelot-8021q $ rmmod tag_ocelot.ko $ rmmod tag_ocelot_8021q.ko rmmod: can't unload module 'tag_ocelot_8021q': Resource temporarily unavailable $ ping 10.0.0.1 PING 10.0.0.1 (10.0.0.1): 56 data bytes 64 bytes from 10.0.0.1: seq=0 ttl=64 time=0.953 ms 64 bytes from 10.0.0.1: seq=1 ttl=64 time=0.787 ms 64 bytes from 10.0.0.1: seq=2 ttl=64 time=0.771 ms $ rmmod mscc_felix.ko [ 645.544426] mscc_felix 0000:00:00.5: Link is Down [ 645.838608] DSA: tree 0 torn down $ rmmod tag_ocelot_8021q.ko Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- Documentation/ABI/testing/sysfs-class-net-dsa | 11 +++++- include/net/dsa.h | 9 +++++ net/dsa/dsa.c | 26 +++++++++++++ net/dsa/dsa2.c | 55 +++++++++++++++++++++++++-- net/dsa/dsa_priv.h | 15 ++++++++ net/dsa/master.c | 39 ++++++++++++++++++- net/dsa/port.c | 8 ++++ net/dsa/slave.c | 35 +++++++++++------ net/dsa/switch.c | 55 +++++++++++++++++++++++++++ 9 files changed, 235 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-class-net-dsa b/Documentation/ABI/testing/sysfs-class-net-dsa index 985d84c585c6..e2da26b44dd0 100644 --- a/Documentation/ABI/testing/sysfs-class-net-dsa +++ b/Documentation/ABI/testing/sysfs-class-net-dsa @@ -3,5 +3,12 @@ Date: August 2018 KernelVersion: 4.20 Contact: netdev@vger.kernel.org Description: - String indicating the type of tagging protocol used by the - DSA slave network device. + On read, this file returns a string indicating the type of + tagging protocol used by the DSA network devices that are + attached to this master interface. + On write, this file changes the tagging protocol of the + attached DSA switches, if this operation is supported by the + driver. Changing the tagging protocol must be done with the DSA + interfaces and the master interface all administratively down. + See the "name" field of each registered struct dsa_device_ops + for a list of valid values. diff --git a/include/net/dsa.h b/include/net/dsa.h index b8af1d6c879a..7ea3918b2e61 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -485,9 +485,18 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { + /* + * Tagging protocol helpers called for the CPU ports and DSA links. + * @get_tag_protocol retrieves the initial tagging protocol and is + * mandatory. Switches which can operate using multiple tagging + * protocols should implement @change_tag_protocol and report in + * @get_tag_protocol the tagger in current use. + */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); + int (*change_tag_protocol)(struct dsa_switch *ds, int port, + enum dsa_tag_protocol proto); int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index f4ce3c5826a0..84cad1be9ce4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -84,6 +84,32 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) return ops->name; }; +/* Function takes a reference on the module owning the tagger, + * so dsa_tag_driver_put must be called afterwards. + */ +const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf) +{ + const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); + struct dsa_tag_driver *dsa_tag_driver; + + mutex_lock(&dsa_tag_drivers_lock); + list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + const struct dsa_device_ops *tmp = dsa_tag_driver->ops; + + if (!sysfs_streq(buf, tmp->name)) + continue; + + if (!try_module_get(dsa_tag_driver->owner)) + break; + + ops = tmp; + break; + } + mutex_unlock(&dsa_tag_drivers_lock); + + return ops; +} + const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 63035a898ca8..96249c4ad5f2 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -942,6 +942,57 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dst->setup = false; } +/* Since the dsa/tagging sysfs device attribute is per master, the assumption + * is that all DSA switches within a tree share the same tagger, otherwise + * they would have formed disjoint trees (different "dsa,member" values). + */ +int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, + struct net_device *master, + const struct dsa_device_ops *tag_ops, + const struct dsa_device_ops *old_tag_ops) +{ + struct dsa_notifier_tag_proto_info info; + struct dsa_port *dp; + int err = -EBUSY; + + if (!rtnl_trylock()) + return restart_syscall(); + + /* At the moment we don't allow changing the tag protocol under + * traffic. The rtnl_mutex also happens to serialize concurrent + * attempts to change the tagging protocol. If we ever lift the IFF_UP + * restriction, there needs to be another mutex which serializes this. + */ + if (master->flags & IFF_UP) + goto out_unlock; + + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_is_user_port(dp->ds, dp->index)) + continue; + + if (dp->slave->flags & IFF_UP) + goto out_unlock; + } + + info.tag_ops = tag_ops; + err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); + if (err) + goto out_unwind_tagger; + + dst->tag_ops = tag_ops; + + rtnl_unlock(); + + return 0; + +out_unwind_tagger: + info.tag_ops = old_tag_ops; + dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); +out_unlock: + rtnl_unlock(); + return err; +} + static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) { struct dsa_switch_tree *dst = ds->dst; @@ -1038,9 +1089,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) dp->master = master; dp->type = DSA_PORT_TYPE_CPU; - dp->filter = dst->tag_ops->filter; - dp->rcv = dst->tag_ops->rcv; - dp->tag_ops = dst->tag_ops; + dsa_port_set_tag_protocol(dp, dst->tag_ops); dp->dst = dst; return 0; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3cc1e6d76e3a..edca57b558ad 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -28,6 +28,7 @@ enum { DSA_NOTIFIER_VLAN_ADD, DSA_NOTIFIER_VLAN_DEL, DSA_NOTIFIER_MTU, + DSA_NOTIFIER_TAG_PROTO, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -82,6 +83,11 @@ struct dsa_notifier_mtu_info { int mtu; }; +/* DSA_NOTIFIER_TAG_PROTO_* */ +struct dsa_notifier_tag_proto_info { + const struct dsa_device_ops *tag_ops; +}; + struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; @@ -115,6 +121,7 @@ struct dsa_slave_priv { /* dsa.c */ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol); void dsa_tag_driver_put(const struct dsa_device_ops *ops); +const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); @@ -139,6 +146,8 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, } /* port.c */ +void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, + const struct dsa_device_ops *tag_ops); int dsa_port_set_state(struct dsa_port *dp, u8 state); int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); @@ -201,6 +210,8 @@ int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); +void dsa_slave_setup_tagger(struct net_device *slave); +int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { @@ -285,6 +296,10 @@ void dsa_lag_map(struct dsa_switch_tree *dst, struct net_device *lag); void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag); int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); int dsa_broadcast(unsigned long e, void *v); +int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, + struct net_device *master, + const struct dsa_device_ops *tag_ops, + const struct dsa_device_ops *old_tag_ops); extern struct list_head dsa_tree_list; diff --git a/net/dsa/master.c b/net/dsa/master.c index cb3a5cf99b25..052a977914a6 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -280,7 +280,44 @@ static ssize_t tagging_show(struct device *d, struct device_attribute *attr, return sprintf(buf, "%s\n", dsa_tag_protocol_to_str(cpu_dp->tag_ops)); } -static DEVICE_ATTR_RO(tagging); + +static ssize_t tagging_store(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + const struct dsa_device_ops *new_tag_ops, *old_tag_ops; + struct net_device *dev = to_net_dev(d); + struct dsa_port *cpu_dp = dev->dsa_ptr; + int err; + + old_tag_ops = cpu_dp->tag_ops; + new_tag_ops = dsa_find_tagger_by_name(buf); + /* Bad tagger name, or module is not loaded? */ + if (IS_ERR(new_tag_ops)) + return PTR_ERR(new_tag_ops); + + if (new_tag_ops == old_tag_ops) + /* Drop the temporarily held duplicate reference, since + * the DSA switch tree uses this tagger. + */ + goto out; + + err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, dev, new_tag_ops, + old_tag_ops); + if (err) { + /* On failure the old tagger is restored, so we don't need the + * driver for the new one. + */ + dsa_tag_driver_put(new_tag_ops); + return err; + } + + /* On success we no longer need the module for the old tagging protocol + */ +out: + dsa_tag_driver_put(old_tag_ops); + return count; +} +static DEVICE_ATTR_RW(tagging); static struct attribute *dsa_slave_attrs[] = { &dev_attr_tagging.attr, diff --git a/net/dsa/port.c b/net/dsa/port.c index a8886cf40160..5e079a61528e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -526,6 +526,14 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } +void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, + const struct dsa_device_ops *tag_ops) +{ + cpu_dp->filter = tag_ops->filter; + cpu_dp->rcv = tag_ops->rcv; + cpu_dp->tag_ops = tag_ops; +} + static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) { struct device_node *phy_dn; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f2fb433f3828..b0571ab4e5a7 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1430,7 +1430,7 @@ out: dsa_hw_port_list_free(&hw_port_list); } -static int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) +int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1708,6 +1708,27 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +void dsa_slave_setup_tagger(struct net_device *slave) +{ + struct dsa_port *dp = dsa_slave_to_port(slave); + struct dsa_slave_priv *p = netdev_priv(slave); + const struct dsa_port *cpu_dp = dp->cpu_dp; + struct net_device *master = cpu_dp->master; + + if (cpu_dp->tag_ops->tail_tag) + slave->needed_tailroom = cpu_dp->tag_ops->overhead; + else + slave->needed_headroom = cpu_dp->tag_ops->overhead; + /* Try to save one extra realloc later in the TX path (in the master) + * by also inheriting the master's needed headroom and tailroom. + * The 8021q driver also does this. + */ + slave->needed_headroom += master->needed_headroom; + slave->needed_tailroom += master->needed_tailroom; + + p->xmit = cpu_dp->tag_ops->xmit; +} + static struct lock_class_key dsa_slave_netdev_xmit_lock_key; static void dsa_slave_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -1782,16 +1803,6 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->netdev_ops = &dsa_slave_netdev_ops; if (ds->ops->port_max_mtu) slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); - if (cpu_dp->tag_ops->tail_tag) - slave_dev->needed_tailroom = cpu_dp->tag_ops->overhead; - else - slave_dev->needed_headroom = cpu_dp->tag_ops->overhead; - /* Try to save one extra realloc later in the TX path (in the master) - * by also inheriting the master's needed headroom and tailroom. - * The 8021q driver also does this. - */ - slave_dev->needed_headroom += master->needed_headroom; - slave_dev->needed_tailroom += master->needed_tailroom; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, @@ -1814,8 +1825,8 @@ int dsa_slave_create(struct dsa_port *port) p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; + dsa_slave_setup_tagger(slave_dev); rtnl_lock(); ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index cc0b25f3adea..5026e4143663 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -297,6 +297,58 @@ static int dsa_switch_vlan_del(struct dsa_switch *ds, return 0; } +static bool dsa_switch_tag_proto_match(struct dsa_switch *ds, int port, + struct dsa_notifier_tag_proto_info *info) +{ + if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_change_tag_proto(struct dsa_switch *ds, + struct dsa_notifier_tag_proto_info *info) +{ + const struct dsa_device_ops *tag_ops = info->tag_ops; + int port, err; + + if (!ds->ops->change_tag_protocol) + return -EOPNOTSUPP; + + ASSERT_RTNL(); + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_tag_proto_match(ds, port, info)) { + err = ds->ops->change_tag_protocol(ds, port, + tag_ops->proto); + if (err) + return err; + + if (dsa_is_cpu_port(ds, port)) + dsa_port_set_tag_protocol(dsa_to_port(ds, port), + tag_ops); + } + } + + /* Now that changing the tag protocol can no longer fail, let's update + * the remaining bits which are "duplicated for faster access", and the + * bits that depend on the tagger, such as the MTU. + */ + for (port = 0; port < ds->num_ports; port++) { + if (dsa_is_user_port(ds, port)) { + struct net_device *slave; + + slave = dsa_to_port(ds, port)->slave; + dsa_slave_setup_tagger(slave); + + /* rtnl_mutex is held in dsa_tree_change_tag_proto */ + dsa_slave_change_mtu(slave, slave->mtu); + } + } + + return 0; +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -343,6 +395,9 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_MTU: err = dsa_switch_mtu(ds, info); break; + case DSA_NOTIFIER_TAG_PROTO: + err = dsa_switch_change_tag_proto(ds, info); + break; default: err = -EOPNOTSUPP; break; -- cgit v1.2.3 From 7c83a7c539abe9f980996063ac20532a7a7f6eb1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:08 +0200 Subject: net: dsa: add a second tagger for Ocelot switches based on tag_8021q There are use cases for which the existing tagger, based on the NPI (Node Processor Interface) functionality, is insufficient. Namely: - Frames injected through the NPI port bypass the frame analyzer, so no source address learning is performed, no TSN stream classification, etc. - Flow control is not functional over an NPI port (PAUSE frames are encapsulated in the same Extraction Frame Header as all other frames) - There can be at most one NPI port configured for an Ocelot switch. But in NXP LS1028A and T1040 there are two Ethernet CPU ports. The non-NPI port is currently either disabled, or operated as a plain user port (albeit an internally-facing one). Having the ability to configure the two CPU ports symmetrically could pave the way for e.g. creating a LAG between them, to increase bandwidth seamlessly for the system. So there is a desire to have an alternative to the NPI mode. This change keeps the default tagger for the Seville and Felix switches as "ocelot", but it can be changed via the following device attribute: echo ocelot-8021q > /sys/class//dsa/tagging Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + drivers/net/dsa/ocelot/Kconfig | 2 ++ include/net/dsa.h | 2 ++ net/dsa/Kconfig | 21 +++++++++++-- net/dsa/Makefile | 1 + net/dsa/tag_ocelot_8021q.c | 68 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 net/dsa/tag_ocelot_8021q.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 627e90a9e296..d1b0057a9797 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12859,6 +12859,7 @@ F: drivers/net/dsa/ocelot/* F: drivers/net/ethernet/mscc/ F: include/soc/mscc/ocelot* F: net/dsa/tag_ocelot.c +F: net/dsa/tag_ocelot_8021q.c F: tools/testing/selftests/drivers/net/ocelot/* OCXL (Open Coherent Accelerator Processor Interface OpenCAPI) DRIVER diff --git a/drivers/net/dsa/ocelot/Kconfig b/drivers/net/dsa/ocelot/Kconfig index c110e82a7973..932b6b6fe817 100644 --- a/drivers/net/dsa/ocelot/Kconfig +++ b/drivers/net/dsa/ocelot/Kconfig @@ -6,6 +6,7 @@ config NET_DSA_MSCC_FELIX depends on NET_VENDOR_FREESCALE depends on HAS_IOMEM select MSCC_OCELOT_SWITCH_LIB + select NET_DSA_TAG_OCELOT_8021Q select NET_DSA_TAG_OCELOT select FSL_ENETC_MDIO select PCS_LYNX @@ -19,6 +20,7 @@ config NET_DSA_MSCC_SEVILLE depends on NET_VENDOR_MICROSEMI depends on HAS_IOMEM select MSCC_OCELOT_SWITCH_LIB + select NET_DSA_TAG_OCELOT_8021Q select NET_DSA_TAG_OCELOT select PCS_LYNX help diff --git a/include/net/dsa.h b/include/net/dsa.h index 7ea3918b2e61..60acb9fca124 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -47,6 +47,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_RTL4_A_VALUE 17 #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 +#define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -69,6 +70,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, + DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, }; struct packet_type; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 2d226a5c085f..a45572cfb71a 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -105,11 +105,26 @@ config NET_DSA_TAG_RTL4_A the Realtek RTL8366RB. config NET_DSA_TAG_OCELOT - tristate "Tag driver for Ocelot family of switches" + tristate "Tag driver for Ocelot family of switches, using NPI port" select PACKING help - Say Y or M if you want to enable support for tagging frames for the - Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959). + Say Y or M if you want to enable NPI tagging for the Ocelot switches + (VSC7511, VSC7512, VSC7513, VSC7514, VSC9953, VSC9959). In this mode, + the frames over the Ethernet CPU port are prepended with a + hardware-defined injection/extraction frame header. Flow control + (PAUSE frames) over the CPU port is not supported when operating in + this mode. + +config NET_DSA_TAG_OCELOT_8021Q + tristate "Tag driver for Ocelot family of switches, using VLAN" + select NET_DSA_TAG_8021Q + help + Say Y or M if you want to enable support for tagging frames with a + custom VLAN-based header. Frames that require timestamping, such as + PTP, are not delivered over Ethernet but over register-based MMIO. + Flow control over the CPU port is functional in this mode. When using + this mode, less TCAM resources (VCAP IS1, IS2, ES0) are available for + use with tc-flower. config NET_DSA_TAG_QCA tristate "Tag driver for Qualcomm Atheros QCA8K switches" diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 92cea2132241..44bc79952b8b 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o +obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c new file mode 100644 index 000000000000..8991ebf098a3 --- /dev/null +++ b/net/dsa/tag_ocelot_8021q.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2020-2021 NXP Semiconductors + * + * An implementation of the software-defined tag_8021q.c tagger format, which + * also preserves full functionality under a vlan_filtering bridge. It does + * this by using the TCAM engines for: + * - pushing the RX VLAN as a second, outer tag, on egress towards the CPU port + * - redirecting towards the correct front port based on TX VLAN and popping + * that on egress + */ +#include +#include "dsa_priv.h" + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + + return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, + ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); +} + +static struct sk_buff *ocelot_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + int src_port, switch_id, qos_class; + u16 vid, tci; + + skb_push_rcsum(skb, ETH_HLEN); + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + __skb_vlan_pop(skb, &tci); + } + skb_pull_rcsum(skb, ETH_HLEN); + + vid = tci & VLAN_VID_MASK; + src_port = dsa_8021q_rx_source_port(vid); + switch_id = dsa_8021q_rx_switch_id(vid); + qos_class = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + + skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); + if (!skb->dev) + return NULL; + + skb->offload_fwd_mark = 1; + skb->priority = qos_class; + + return skb; +} + +static const struct dsa_device_ops ocelot_8021q_netdev_ops = { + .name = "ocelot-8021q", + .proto = DSA_TAG_PROTO_OCELOT_8021Q, + .xmit = ocelot_xmit, + .rcv = ocelot_rcv, + .overhead = VLAN_HLEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q); + +module_dsa_tag_driver(ocelot_8021q_netdev_ops); -- cgit v1.2.3 From e21268efbe26d9ab3f7468577d691b992d76e06a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 29 Jan 2021 03:00:09 +0200 Subject: net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev egress flower indev ${eth} \ action vlan push id protocol 802.1ad; \ tc filter add dev ingress protocol 802.1Q flower vlan_id action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 337 +++++++++++++++++++++++++++++++- drivers/net/dsa/ocelot/felix.h | 1 + drivers/net/ethernet/mscc/ocelot.c | 61 +++++- drivers/net/ethernet/mscc/ocelot_vcap.c | 1 + drivers/net/ethernet/mscc/ocelot_vcap.h | 3 - include/soc/mscc/ocelot.h | 2 + include/soc/mscc/ocelot_vcap.h | 3 + 7 files changed, 396 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index af3cee8762ce..167463010b55 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,329 @@ #include #include "felix.h" +static int felix_tag_8021q_rxvlan_add(struct felix *felix, int port, u16 vid, + bool pvid, bool untagged) +{ + struct ocelot_vcap_filter *outer_tagging_rule; + struct ocelot *ocelot = &felix->ocelot; + struct dsa_switch *ds = felix->ds; + int key_length, upstream, err; + + /* We don't need to install the rxvlan into the other ports' filtering + * tables, because we're just pushing the rxvlan when sending towards + * the CPU + */ + if (!pvid) + return 0; + + key_length = ocelot->vcap[VCAP_ES0].keys[VCAP_ES0_IGR_PORT].length; + upstream = dsa_upstream_port(ds, port); + + outer_tagging_rule = kzalloc(sizeof(struct ocelot_vcap_filter), + GFP_KERNEL); + if (!outer_tagging_rule) + return -ENOMEM; + + outer_tagging_rule->key_type = OCELOT_VCAP_KEY_ANY; + outer_tagging_rule->prio = 1; + outer_tagging_rule->id.cookie = port; + outer_tagging_rule->id.tc_offload = false; + outer_tagging_rule->block_id = VCAP_ES0; + outer_tagging_rule->type = OCELOT_VCAP_FILTER_OFFLOAD; + outer_tagging_rule->lookup = 0; + outer_tagging_rule->ingress_port.value = port; + outer_tagging_rule->ingress_port.mask = GENMASK(key_length - 1, 0); + outer_tagging_rule->egress_port.value = upstream; + outer_tagging_rule->egress_port.mask = GENMASK(key_length - 1, 0); + outer_tagging_rule->action.push_outer_tag = OCELOT_ES0_TAG; + outer_tagging_rule->action.tag_a_tpid_sel = OCELOT_TAG_TPID_SEL_8021AD; + outer_tagging_rule->action.tag_a_vid_sel = 1; + outer_tagging_rule->action.vid_a_val = vid; + + err = ocelot_vcap_filter_add(ocelot, outer_tagging_rule, NULL); + if (err) + kfree(outer_tagging_rule); + + return err; +} + +static int felix_tag_8021q_txvlan_add(struct felix *felix, int port, u16 vid, + bool pvid, bool untagged) +{ + struct ocelot_vcap_filter *untagging_rule, *redirect_rule; + struct ocelot *ocelot = &felix->ocelot; + struct dsa_switch *ds = felix->ds; + int upstream, err; + + /* tag_8021q.c assumes we are implementing this via port VLAN + * membership, which we aren't. So we don't need to add any VCAP filter + * for the CPU port. + */ + if (ocelot->ports[port]->is_dsa_8021q_cpu) + return 0; + + untagging_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); + if (!untagging_rule) + return -ENOMEM; + + redirect_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); + if (!redirect_rule) { + kfree(untagging_rule); + return -ENOMEM; + } + + upstream = dsa_upstream_port(ds, port); + + untagging_rule->key_type = OCELOT_VCAP_KEY_ANY; + untagging_rule->ingress_port_mask = BIT(upstream); + untagging_rule->vlan.vid.value = vid; + untagging_rule->vlan.vid.mask = VLAN_VID_MASK; + untagging_rule->prio = 1; + untagging_rule->id.cookie = port; + untagging_rule->id.tc_offload = false; + untagging_rule->block_id = VCAP_IS1; + untagging_rule->type = OCELOT_VCAP_FILTER_OFFLOAD; + untagging_rule->lookup = 0; + untagging_rule->action.vlan_pop_cnt_ena = true; + untagging_rule->action.vlan_pop_cnt = 1; + untagging_rule->action.pag_override_mask = 0xff; + untagging_rule->action.pag_val = port; + + err = ocelot_vcap_filter_add(ocelot, untagging_rule, NULL); + if (err) { + kfree(untagging_rule); + kfree(redirect_rule); + return err; + } + + redirect_rule->key_type = OCELOT_VCAP_KEY_ANY; + redirect_rule->ingress_port_mask = BIT(upstream); + redirect_rule->pag = port; + redirect_rule->prio = 1; + redirect_rule->id.cookie = port; + redirect_rule->id.tc_offload = false; + redirect_rule->block_id = VCAP_IS2; + redirect_rule->type = OCELOT_VCAP_FILTER_OFFLOAD; + redirect_rule->lookup = 0; + redirect_rule->action.mask_mode = OCELOT_MASK_MODE_REDIRECT; + redirect_rule->action.port_mask = BIT(port); + + err = ocelot_vcap_filter_add(ocelot, redirect_rule, NULL); + if (err) { + ocelot_vcap_filter_del(ocelot, untagging_rule); + kfree(redirect_rule); + return err; + } + + return 0; +} + +static int felix_tag_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, + u16 flags) +{ + bool untagged = flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool pvid = flags & BRIDGE_VLAN_INFO_PVID; + struct ocelot *ocelot = ds->priv; + + if (vid_is_dsa_8021q_rxvlan(vid)) + return felix_tag_8021q_rxvlan_add(ocelot_to_felix(ocelot), + port, vid, pvid, untagged); + + if (vid_is_dsa_8021q_txvlan(vid)) + return felix_tag_8021q_txvlan_add(ocelot_to_felix(ocelot), + port, vid, pvid, untagged); + + return 0; +} + +static int felix_tag_8021q_rxvlan_del(struct felix *felix, int port, u16 vid) +{ + struct ocelot_vcap_filter *outer_tagging_rule; + struct ocelot_vcap_block *block_vcap_es0; + struct ocelot *ocelot = &felix->ocelot; + + block_vcap_es0 = &ocelot->block[VCAP_ES0]; + + outer_tagging_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_es0, + port, false); + /* In rxvlan_add, we had the "if (!pvid) return 0" logic to avoid + * installing outer tagging ES0 rules where they weren't needed. + * But in rxvlan_del, the API doesn't give us the "flags" anymore, + * so that forces us to be slightly sloppy here, and just assume that + * if we didn't find an outer_tagging_rule it means that there was + * none in the first place, i.e. rxvlan_del is called on a non-pvid + * port. This is most probably true though. + */ + if (!outer_tagging_rule) + return 0; + + return ocelot_vcap_filter_del(ocelot, outer_tagging_rule); +} + +static int felix_tag_8021q_txvlan_del(struct felix *felix, int port, u16 vid) +{ + struct ocelot_vcap_filter *untagging_rule, *redirect_rule; + struct ocelot_vcap_block *block_vcap_is1; + struct ocelot_vcap_block *block_vcap_is2; + struct ocelot *ocelot = &felix->ocelot; + int err; + + if (ocelot->ports[port]->is_dsa_8021q_cpu) + return 0; + + block_vcap_is1 = &ocelot->block[VCAP_IS1]; + block_vcap_is2 = &ocelot->block[VCAP_IS2]; + + untagging_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_is1, + port, false); + if (!untagging_rule) + return 0; + + err = ocelot_vcap_filter_del(ocelot, untagging_rule); + if (err) + return err; + + redirect_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, + port, false); + if (!redirect_rule) + return 0; + + return ocelot_vcap_filter_del(ocelot, redirect_rule); +} + +static int felix_tag_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid) +{ + struct ocelot *ocelot = ds->priv; + + if (vid_is_dsa_8021q_rxvlan(vid)) + return felix_tag_8021q_rxvlan_del(ocelot_to_felix(ocelot), + port, vid); + + if (vid_is_dsa_8021q_txvlan(vid)) + return felix_tag_8021q_txvlan_del(ocelot_to_felix(ocelot), + port, vid); + + return 0; +} + +static const struct dsa_8021q_ops felix_tag_8021q_ops = { + .vlan_add = felix_tag_8021q_vlan_add, + .vlan_del = felix_tag_8021q_vlan_del, +}; + +/* Alternatively to using the NPI functionality, that same hardware MAC + * connected internally to the enetc or fman DSA master can be configured to + * use the software-defined tag_8021q frame format. As far as the hardware is + * concerned, it thinks it is a "dumb switch" - the queues of the CPU port + * module are now disconnected from it, but can still be accessed through + * register-based MMIO. + */ +static void felix_8021q_cpu_port_init(struct ocelot *ocelot, int port) +{ + ocelot->ports[port]->is_dsa_8021q_cpu = true; + ocelot->npi = -1; + + /* Overwrite PGID_CPU with the non-tagging port */ + ocelot_write_rix(ocelot, BIT(port), ANA_PGID_PGID, PGID_CPU); + + ocelot_apply_bridge_fwd_mask(ocelot); +} + +static void felix_8021q_cpu_port_deinit(struct ocelot *ocelot, int port) +{ + ocelot->ports[port]->is_dsa_8021q_cpu = false; + + /* Restore PGID_CPU */ + ocelot_write_rix(ocelot, BIT(ocelot->num_phys_ports), ANA_PGID_PGID, + PGID_CPU); + + ocelot_apply_bridge_fwd_mask(ocelot); +} + +static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu) +{ + struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + unsigned long cpu_flood; + int port, err; + + felix_8021q_cpu_port_init(ocelot, cpu); + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_is_unused_port(ds, port)) + continue; + + /* This overwrites ocelot_init(): + * Do not forward BPDU frames to the CPU port module, + * for 2 reasons: + * - When these packets are injected from the tag_8021q + * CPU port, we want them to go out, not loop back + * into the system. + * - STP traffic ingressing on a user port should go to + * the tag_8021q CPU port, not to the hardware CPU + * port module. + */ + ocelot_write_gix(ocelot, + ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_REDIR_ENA(0), + ANA_PORT_CPU_FWD_BPDU_CFG, port); + } + + /* In tag_8021q mode, the CPU port module is unused. So we + * want to disable flooding of any kind to the CPU port module, + * since packets going there will end in a black hole. + */ + cpu_flood = ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)); + ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_UC); + ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_MC); + + felix->dsa_8021q_ctx = kzalloc(sizeof(*felix->dsa_8021q_ctx), + GFP_KERNEL); + if (!felix->dsa_8021q_ctx) + return -ENOMEM; + + felix->dsa_8021q_ctx->ops = &felix_tag_8021q_ops; + felix->dsa_8021q_ctx->proto = htons(ETH_P_8021AD); + felix->dsa_8021q_ctx->ds = ds; + + err = dsa_8021q_setup(felix->dsa_8021q_ctx, true); + if (err) + goto out_free_dsa_8021_ctx; + + return 0; + +out_free_dsa_8021_ctx: + kfree(felix->dsa_8021q_ctx); + return err; +} + +static void felix_teardown_tag_8021q(struct dsa_switch *ds, int cpu) +{ + struct ocelot *ocelot = ds->priv; + struct felix *felix = ocelot_to_felix(ocelot); + int err, port; + + err = dsa_8021q_setup(felix->dsa_8021q_ctx, false); + if (err) + dev_err(ds->dev, "dsa_8021q_setup returned %d", err); + + kfree(felix->dsa_8021q_ctx); + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_is_unused_port(ds, port)) + continue; + + /* Restore the logic from ocelot_init: + * do not forward BPDU frames to the front ports. + */ + ocelot_write_gix(ocelot, + ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_REDIR_ENA(0xffff), + ANA_PORT_CPU_FWD_BPDU_CFG, + port); + } + + felix_8021q_cpu_port_deinit(ocelot, cpu); +} + /* The CPU port module is connected to the Node Processor Interface (NPI). This * is the mode through which frames can be injected from and extracted to an * external CPU, over Ethernet. In NXP SoCs, the "external CPU" is the ARM CPU @@ -107,6 +431,9 @@ static int felix_set_tag_protocol(struct dsa_switch *ds, int cpu, case DSA_TAG_PROTO_OCELOT: err = felix_setup_tag_npi(ds, cpu); break; + case DSA_TAG_PROTO_OCELOT_8021Q: + err = felix_setup_tag_8021q(ds, cpu); + break; default: err = -EPROTONOSUPPORT; } @@ -121,11 +448,18 @@ static void felix_del_tag_protocol(struct dsa_switch *ds, int cpu, case DSA_TAG_PROTO_OCELOT: felix_teardown_tag_npi(ds, cpu); break; + case DSA_TAG_PROTO_OCELOT_8021Q: + felix_teardown_tag_8021q(ds, cpu); + break; default: break; } } +/* This always leaves the switch in a consistent state, because although the + * tag_8021q setup can fail, the NPI setup can't. So either the change is made, + * or the restoration is guaranteed to work. + */ static int felix_change_tag_protocol(struct dsa_switch *ds, int cpu, enum dsa_tag_protocol proto) { @@ -134,7 +468,8 @@ static int felix_change_tag_protocol(struct dsa_switch *ds, int cpu, enum dsa_tag_protocol old_proto = felix->tag_proto; int err; - if (proto != DSA_TAG_PROTO_OCELOT) + if (proto != DSA_TAG_PROTO_OCELOT && + proto != DSA_TAG_PROTO_OCELOT_8021Q) return -EPROTONOSUPPORT; felix_del_tag_protocol(ds, cpu, old_proto); diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 264b3bbdc4d1..9d4459f2fffb 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -48,6 +48,7 @@ struct felix { struct lynx_pcs **pcs; resource_size_t switch_base; resource_size_t imdio_base; + struct dsa_8021q_context *dsa_8021q_ctx; enum dsa_tag_protocol tag_proto; }; diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 714165c2f85a..5f21799ad85b 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -889,18 +889,60 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, } EXPORT_SYMBOL(ocelot_get_ts_info); -static void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) +static u32 ocelot_get_dsa_8021q_cpu_mask(struct ocelot *ocelot) { + u32 mask = 0; int port; + for (port = 0; port < ocelot->num_phys_ports; port++) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + if (!ocelot_port) + continue; + + if (ocelot_port->is_dsa_8021q_cpu) + mask |= BIT(port); + } + + return mask; +} + +void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) +{ + unsigned long cpu_fwd_mask; + int port; + + /* If a DSA tag_8021q CPU exists, it needs to be included in the + * regular forwarding path of the front ports regardless of whether + * those are bridged or standalone. + * If DSA tag_8021q is not used, this returns 0, which is fine because + * the hardware-based CPU port module can be a destination for packets + * even if it isn't part of PGID_SRC. + */ + cpu_fwd_mask = ocelot_get_dsa_8021q_cpu_mask(ocelot); + /* Apply FWD mask. The loop is needed to add/remove the current port as * a source for the other ports. */ for (port = 0; port < ocelot->num_phys_ports; port++) { - if (ocelot->bridge_fwd_mask & BIT(port)) { - unsigned long mask = ocelot->bridge_fwd_mask & ~BIT(port); + struct ocelot_port *ocelot_port = ocelot->ports[port]; + unsigned long mask; + + if (!ocelot_port) { + /* Unused ports can't send anywhere */ + mask = 0; + } else if (ocelot_port->is_dsa_8021q_cpu) { + /* The DSA tag_8021q CPU ports need to be able to + * forward packets to all other ports except for + * themselves + */ + mask = GENMASK(ocelot->num_phys_ports - 1, 0); + mask &= ~cpu_fwd_mask; + } else if (ocelot->bridge_fwd_mask & BIT(port)) { int lag; + mask = ocelot->bridge_fwd_mask & ~BIT(port); + for (lag = 0; lag < ocelot->num_phys_ports; lag++) { unsigned long bond_mask = ocelot->lags[lag]; @@ -912,15 +954,18 @@ static void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) break; } } - - ocelot_write_rix(ocelot, mask, - ANA_PGID_PGID, PGID_SRC + port); } else { - ocelot_write_rix(ocelot, 0, - ANA_PGID_PGID, PGID_SRC + port); + /* Standalone ports forward only to DSA tag_8021q CPU + * ports (if those exist), or to the hardware CPU port + * module otherwise. + */ + mask = cpu_fwd_mask; } + + ocelot_write_rix(ocelot, mask, ANA_PGID_PGID, PGID_SRC + port); } } +EXPORT_SYMBOL(ocelot_apply_bridge_fwd_mask); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state) { diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index b82fd4103a68..37a232911395 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -1009,6 +1009,7 @@ ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int cookie, return NULL; } +EXPORT_SYMBOL(ocelot_vcap_block_find_filter_by_id); /* If @on=false, then SNAP, ARP, IP and OAM frames will not match on keys based * on destination and source MAC addresses, but only on higher-level protocol diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.h b/drivers/net/ethernet/mscc/ocelot_vcap.h index 3b0c7916056e..523611ccc48f 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.h +++ b/drivers/net/ethernet/mscc/ocelot_vcap.h @@ -14,9 +14,6 @@ int ocelot_vcap_filter_stats_update(struct ocelot *ocelot, struct ocelot_vcap_filter *rule); -struct ocelot_vcap_filter * -ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id, - bool tc_offload); void ocelot_detect_vcap_constants(struct ocelot *ocelot); int ocelot_vcap_init(struct ocelot *ocelot); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 93c22627dedd..6a61c499a30d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -610,6 +610,7 @@ struct ocelot_port { phy_interface_t phy_mode; u8 *xmit_template; + bool is_dsa_8021q_cpu; }; struct ocelot { @@ -760,6 +761,7 @@ void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev); int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state); +void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot); int ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge); int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 76e01c927e17..25fd525aaf92 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -693,5 +693,8 @@ int ocelot_vcap_filter_add(struct ocelot *ocelot, struct netlink_ext_ack *extack); int ocelot_vcap_filter_del(struct ocelot *ocelot, struct ocelot_vcap_filter *rule); +struct ocelot_vcap_filter * +ocelot_vcap_block_find_filter_by_id(struct ocelot_vcap_block *block, int id, + bool tc_offload); #endif /* _OCELOT_VCAP_H_ */ -- cgit v1.2.3 From 14e8e0f6008865d823a8184a276702a6c3cbef3d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 29 Jan 2021 13:54:38 -0500 Subject: tcp: shrink inet_connection_sock icsk_mtup enabled and probe_size This commit shrinks inet_connection_sock by 4 bytes, by shrinking icsk_mtup.enabled from 32 bits to 1 bit, and shrinking icsk_mtup.probe_size from s32 to an unsuigned 31 bit field. This is to save space to compensate for the recent introduction of a new u32 in inet_connection_sock, icsk_probes_tstamp, in the recent bug fix commit 9d9b1ee0b2d1 ("tcp: fix TCP_USER_TIMEOUT with zero window"). This should not change functionality, since icsk_mtup.enabled is only ever set to 0 or 1, and icsk_mtup.probe_size can only be either 0 or a positive MTU value returned by tcp_mss_to_mtu() Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20210129185438.1813237-1-ncardwell.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c11f80f328f1..10a625760de9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -120,14 +120,14 @@ struct inet_connection_sock { __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; struct { - int enabled; - /* Range of MTUs to search */ int search_high; int search_low; /* Information on the current probe. */ - int probe_size; + u32 probe_size:31, + /* Is the MTUP feature enabled for this connection? */ + enabled:1; u32 probe_timestamp; } icsk_mtup; -- cgit v1.2.3 From 8b1c324c9faeb3e159256df0b84f9251a7941a33 Mon Sep 17 00:00:00 2001 From: Yu Liu Date: Fri, 29 Jan 2021 13:53:48 -0800 Subject: Bluetooth: Skip eSCO 2M params when not supported If a peer device doesn't support eSCO 2M we should skip the params that use it when setting up sync connection since they will always fail. Signed-off-by: Yu Liu Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_conn.c | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index fd1d10fe2f11..ebdd4afe30d2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1365,6 +1365,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_le_capable(dev) ((dev)->features[0][4] & LMP_LE) #define lmp_sniffsubr_capable(dev) ((dev)->features[0][5] & LMP_SNIFF_SUBR) #define lmp_pause_enc_capable(dev) ((dev)->features[0][5] & LMP_PAUSE_ENC) +#define lmp_esco_2m_capable(dev) ((dev)->features[0][5] & LMP_EDR_ESCO_2M) #define lmp_ext_inq_capable(dev) ((dev)->features[0][6] & LMP_EXT_INQ) #define lmp_le_br_capable(dev) (!!((dev)->features[0][6] & LMP_SIMUL_LE_BR)) #define lmp_ssp_capable(dev) ((dev)->features[0][6] & LMP_SIMPLE_PAIR) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 23c0d77ea737..6ffa89e3ba0a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -293,6 +293,20 @@ static void hci_add_sco(struct hci_conn *conn, __u16 handle) hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); } +static bool find_next_esco_param(struct hci_conn *conn, + const struct sco_param *esco_param, int size) +{ + for (; conn->attempt <= size; conn->attempt++) { + if (lmp_esco_2m_capable(conn->link) || + (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3)) + break; + BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported", + conn, conn->attempt); + } + + return conn->attempt <= size; +} + bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; @@ -314,13 +328,15 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: - if (conn->attempt > ARRAY_SIZE(esco_param_msbc)) + if (!find_next_esco_param(conn, esco_param_msbc, + ARRAY_SIZE(esco_param_msbc))) return false; param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (lmp_esco_capable(conn->link)) { - if (conn->attempt > ARRAY_SIZE(esco_param_cvsd)) + if (!find_next_esco_param(conn, esco_param_cvsd, + ARRAY_SIZE(esco_param_cvsd))) return false; param = &esco_param_cvsd[conn->attempt - 1]; } else { -- cgit v1.2.3 From 40c575d1ec71f7a61c73ba1603a69650c130559c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 1 Feb 2021 19:20:50 +0100 Subject: cfg80211: fix netdev registration deadlock If register_netdevice() fails after having called cfg80211's netdev notifier (cfg80211_netdev_notifier_call) it will call the notifier again with UNREGISTER. This would then lock the wiphy mutex because we're marked as registered, which causes a deadlock. Fix this by separately keeping track of whether or not we're in the middle of registering to also skip the notifier call on this unregister. Reported-by: syzbot+2ae0ca9d7737ad1a62b7@syzkaller.appspotmail.com Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Link: https://lore.kernel.org/r/20210201192048.ed8bad436737.I7cae042c44b15f80919a285799a15df467e9d42d@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- net/wireless/core.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4741d71ead21..4cdd75449d73 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5299,6 +5299,8 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * @wiphy: pointer to hardware description * @iftype: interface type * @registered: is this wdev already registered with cfg80211 + * @registering: indicates we're doing registration under wiphy lock + * for the notifier * @list: (private) Used to collect the interfaces * @netdev: (private) Used to reference back to the netdev, may be %NULL * @identifier: (private) Identifier used in nl80211 to identify this @@ -5382,7 +5384,7 @@ struct wireless_dev { struct mutex mtx; - bool use_4addr, is_running, registered; + bool use_4addr, is_running, registered, registering; u8 address[ETH_ALEN] __aligned(sizeof(u16)); diff --git a/net/wireless/core.c b/net/wireless/core.c index 18f9a5c214b5..a2785379df6e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1346,6 +1346,7 @@ int cfg80211_register_netdevice(struct net_device *dev) /* we'll take care of this */ wdev->registered = true; + wdev->registering = true; ret = register_netdevice(dev); if (ret) goto out; @@ -1361,6 +1362,7 @@ int cfg80211_register_netdevice(struct net_device *dev) cfg80211_register_wdev(rdev, wdev); ret = 0; out: + wdev->registering = false; if (ret) wdev->registered = false; return ret; @@ -1403,7 +1405,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * It is possible to get NETDEV_UNREGISTER multiple times, * so check wdev->registered. */ - if (wdev->registered) { + if (wdev->registered && !wdev->registering) { wiphy_lock(&rdev->wiphy); _cfg80211_unregister_wdev(wdev, false); wiphy_unlock(&rdev->wiphy); -- cgit v1.2.3 From 680aea08e78c003292415518ad270bc20f9c80b1 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 1 Feb 2021 21:47:52 +0200 Subject: net: ipv4: Emit notification when fib hardware flags are changed After installing a route to the kernel, user space receives an acknowledgment, which means the route was installed in the kernel, but not necessarily in hardware. The asynchronous nature of route installation in hardware can lead to a routing daemon advertising a route before it was actually installed in hardware. This can result in packet loss or mis-routed packets until the route is installed in hardware. It is also possible for a route already installed in hardware to change its action and therefore its flags. For example, a host route that is trapping packets can be "promoted" to perform decapsulation following the installation of an IPinIP/VXLAN tunnel. Emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags are changed. The aim is to provide an indication to user-space (e.g., routing daemons) about the state of the route in hardware. Introduce a sysctl that controls this behavior. Keep the default value at 0 (i.e., do not emit notifications) for several reasons: - Multiple RTM_NEWROUTE notification per-route might confuse existing routing daemons. - Convergence reasons in routing daemons. - The extra notifications will negatively impact the insertion rate. - Not all users are interested in these notifications. Signed-off-by: Amit Cohen Acked-by: Roopa Prabhu Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 20 ++++++++++++++++++++ include/net/netns/ipv4.h | 2 ++ net/ipv4/af_inet.c | 2 ++ net/ipv4/fib_trie.c | 27 +++++++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 5 files changed, 60 insertions(+) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 2685ec161060..b0e800e68366 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -178,6 +178,26 @@ min_adv_mss - INTEGER The advertised MSS depends on the first hop route MTU, but will never be lower than this setting. +fib_notify_on_flag_change - INTEGER + Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/ + RTM_F_TRAP flags are changed. + + After installing a route to the kernel, user space receives an + acknowledgment, which means the route was installed in the kernel, + but not necessarily in hardware. + It is also possible for a route already installed in hardware to change + its action and therefore its flags. For example, a host route that is + trapping packets can be "promoted" to perform decapsulation following + the installation of an IPinIP/VXLAN tunnel. + The notifications will indicate to user-space the state of the route. + + Default: 0 (Do not emit notifications.) + + Possible values: + + - 0 - Do not emit notifications. + - 1 - Emit notifications. + IP Fragmentation: ipfrag_high_thresh - LONG INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8e4fcac4df72..70a2a085dd1a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -188,6 +188,8 @@ struct netns_ipv4 { int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; + int sysctl_fib_notify_on_flag_change; + #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; #endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b94fa8eb831b..ab42f6404fc6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1871,6 +1871,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; + net->ipv4.sysctl_fib_notify_on_flag_change = 0; + return 0; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 28117c05dc35..60559b708158 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1038,6 +1038,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { struct fib_alias *fa_match; + struct sk_buff *skb; + int err; rcu_read_lock(); @@ -1045,9 +1047,34 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) if (!fa_match) goto out; + if (fa_match->offload == fri->offload && fa_match->trap == fri->trap) + goto out; + fa_match->offload = fri->offload; fa_match->trap = fri->trap; + if (!net->ipv4.sysctl_fib_notify_on_flag_change) + goto out; + + skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); + goto out; + +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3e5f4f2e705e..e5798b3b59d2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1354,6 +1354,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, + { + .procname = "fib_notify_on_flag_change", + .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- cgit v1.2.3 From fbaca8f895a6855b0b442227bc21da2d4cf77a01 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 1 Feb 2021 21:47:53 +0200 Subject: net: Pass 'net' struct as first argument to fib6_info_hw_flags_set() The next patch will emit notification when hardware flags are changed, in case that fib_notify_on_flag_change sysctl is set to 1. To know sysctl values, net struct is needed. This change is consistent with the IPv4 version, which gets 'net' struct as its first argument. Currently, the only callers of this function are mlxsw and netdevsim. Patch the callers to pass net. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 7 ++++--- drivers/net/netdevsim/fib.c | 14 ++++++++------ include/net/ip6_fib.h | 5 +++-- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 0ac7014703aa..5516e141f5bf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5004,8 +5004,8 @@ mlxsw_sp_fib6_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp, fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) - fib6_info_hw_flags_set(mlxsw_sp_rt6->rt, should_offload, - !should_offload); + fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), mlxsw_sp_rt6->rt, + should_offload, !should_offload); } static void @@ -5018,7 +5018,8 @@ mlxsw_sp_fib6_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp, fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, common); list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) - fib6_info_hw_flags_set(mlxsw_sp_rt6->rt, false, false); + fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), mlxsw_sp_rt6->rt, + false, false); } static void diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index d557533d43dd..cb68f0cc6740 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -585,13 +585,15 @@ err_fib6_rt_nh_del: return err; } -static void nsim_fib6_rt_hw_flags_set(const struct nsim_fib6_rt *fib6_rt, +static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, + const struct nsim_fib6_rt *fib6_rt, bool trap) { + struct net *net = devlink_net(data->devlink); struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) - fib6_info_hw_flags_set(fib6_rt_nh->rt, false, trap); + fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap); } static int nsim_fib6_rt_add(struct nsim_fib_data *data, @@ -607,7 +609,7 @@ static int nsim_fib6_rt_add(struct nsim_fib_data *data, goto err_fib_dismiss; msleep(1); - nsim_fib6_rt_hw_flags_set(fib6_rt, true); + nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); return 0; @@ -641,9 +643,9 @@ static int nsim_fib6_rt_replace(struct nsim_fib_data *data, return err; msleep(1); - nsim_fib6_rt_hw_flags_set(fib6_rt, true); + nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); - nsim_fib6_rt_hw_flags_set(fib6_rt_old, false); + nsim_fib6_rt_hw_flags_set(data, fib6_rt_old, false); nsim_fib6_rt_destroy(fib6_rt_old); return 0; @@ -954,7 +956,7 @@ static void nsim_fib6_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib6_rt *fib6_rt; fib6_rt = container_of(fib_rt, struct nsim_fib6_rt, common); - nsim_fib6_rt_hw_flags_set(fib6_rt, false); + nsim_fib6_rt_hw_flags_set(data, fib6_rt, false); nsim_fib_account(&data->ipv6.fib, false); nsim_fib6_rt_destroy(fib6_rt); } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ac5ff3c3afb1..cc189e668adf 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -336,8 +336,9 @@ static inline void fib6_info_release(struct fib6_info *f6i) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } -static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload, - bool trap) +static inline void +fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, + bool trap) { f6i->offload = offload; f6i->trap = trap; -- cgit v1.2.3 From 907eea486888cfe118c19759bbc4b8fca2e004df Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 1 Feb 2021 21:47:55 +0200 Subject: net: ipv6: Emit notification when fib hardware flags are changed After installing a route to the kernel, user space receives an acknowledgment, which means the route was installed in the kernel, but not necessarily in hardware. The asynchronous nature of route installation in hardware can lead to a routing daemon advertising a route before it was actually installed in hardware. This can result in packet loss or mis-routed packets until the route is installed in hardware. It is also possible for a route already installed in hardware to change its action and therefore its flags. For example, a host route that is trapping packets can be "promoted" to perform decapsulation following the installation of an IPinIP/VXLAN tunnel. Emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags are changed. The aim is to provide an indication to user-space (e.g., routing daemons) about the state of the route in hardware. Introduce a sysctl that controls this behavior. Keep the default value at 0 (i.e., do not emit notifications) for several reasons: - Multiple RTM_NEWROUTE notification per-route might confuse existing routing daemons. - Convergence reasons in routing daemons. - The extra notifications will negatively impact the insertion rate. - Not all users are interested in these notifications. Move fib6_info_hw_flags_set() to C file because it is no longer a short function. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 20 ++++++++++++++++ include/net/ip6_fib.h | 10 ++------ include/net/netns/ipv6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/route.c | 44 ++++++++++++++++++++++++++++++++++ net/ipv6/sysctl_net_ipv6.c | 9 +++++++ 6 files changed, 77 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b0e800e68366..61a358301f12 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1795,6 +1795,26 @@ nexthop_compat_mode - BOOLEAN and extraneous notifications. Default: true (backward compat mode) +fib_notify_on_flag_change - INTEGER + Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/ + RTM_F_TRAP flags are changed. + + After installing a route to the kernel, user space receives an + acknowledgment, which means the route was installed in the kernel, + but not necessarily in hardware. + It is also possible for a route already installed in hardware to change + its action and therefore its flags. For example, a host route that is + trapping packets can be "promoted" to perform decapsulation following + the installation of an IPinIP/VXLAN tunnel. + The notifications will indicate to user-space the state of the route. + + Default: 0 (Do not emit notifications.) + + Possible values: + + - 0 - Do not emit notifications. + - 1 - Emit notifications. + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cc189e668adf..1e262b23c68b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -336,14 +336,6 @@ static inline void fib6_info_release(struct fib6_info *f6i) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } -static inline void -fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, - bool trap) -{ - f6i->offload = offload; - f6i->trap = trap; -} - enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, @@ -546,6 +538,8 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } +void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, + bool offload, bool trap); #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5ec054473d81..21c0debbd39e 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -51,6 +51,7 @@ struct netns_sysctl_ipv6 { int max_hbh_opts_len; int seg6_flowlabel; bool skip_notify_on_dev_down; + int fib_notify_on_flag_change; }; struct netns_ipv6 { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8e9c3e9ea36e..0e9994e0ecd7 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -954,6 +954,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; + net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 41d8f801b75f..92b400eb8476 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6064,6 +6064,50 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, + bool offload, bool trap) +{ + struct sk_buff *skb; + int err; + + if (f6i->offload == offload && f6i->trap == trap) + return; + + f6i->offload = offload; + f6i->trap = trap; + + if (!rcu_access_pointer(f6i->fib6_node)) + /* The route was removed from the tree, do not send + * notfication. + */ + return; + + if (!net->ipv6.sysctl.fib_notify_on_flag_change) + return; + + skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, + 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} +EXPORT_SYMBOL(fib6_info_hw_flags_set); + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 5b60a4bdd36a..392ef01e3366 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -160,6 +160,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_notify_on_flag_change", + .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; -- cgit v1.2.3 From 4f4e54366eae20d5867864001db57c5d90693d8c Mon Sep 17 00:00:00 2001 From: Emil Renner Berthing Date: Sun, 31 Jan 2021 00:46:37 +0100 Subject: net: usb: cdc_ncm: use new API for bh tasklet This converts the driver to use the new tasklet API introduced in commit 12cc923f1ccc ("tasklet: Introduce new initialization API") It is unfortunate that we need to add a pointer to the driver context to get back to the usbnet device, but the space will be reclaimed once there are no more users of the old API left and we can remove the data value and flag from the tasklet struct. Signed-off-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20210130234637.26505-1-kernel@esmil.dk Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 12 +++++++----- include/linux/usb/cdc_ncm.h | 2 ++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 291e76d32abe..4087c9e33781 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -61,7 +61,7 @@ static bool prefer_mbim; module_param(prefer_mbim, bool, 0644); MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions"); -static void cdc_ncm_txpath_bh(unsigned long param); +static void cdc_ncm_txpath_bh(struct tasklet_struct *t); static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx); static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer); static struct usb_driver cdc_ncm_driver; @@ -813,9 +813,11 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ if (!ctx) return -ENOMEM; + ctx->dev = dev; + hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ctx->tx_timer.function = &cdc_ncm_tx_timer_cb; - tasklet_init(&ctx->bh, cdc_ncm_txpath_bh, (unsigned long)dev); + tasklet_setup(&ctx->bh, cdc_ncm_txpath_bh); atomic_set(&ctx->stop, 0); spin_lock_init(&ctx->mtx); @@ -1472,10 +1474,10 @@ static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void cdc_ncm_txpath_bh(unsigned long param) +static void cdc_ncm_txpath_bh(struct tasklet_struct *t) { - struct usbnet *dev = (struct usbnet *)param; - struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; + struct cdc_ncm_ctx *ctx = from_tasklet(ctx, t, bh); + struct usbnet *dev = ctx->dev; spin_lock_bh(&ctx->mtx); if (ctx->tx_timer_pending != 0) { diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 0ce4377545f8..f7cb3ddce7fb 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -98,6 +98,8 @@ struct cdc_ncm_ctx { struct hrtimer tx_timer; struct tasklet_struct bh; + struct usbnet *dev; + const struct usb_cdc_ncm_desc *func_desc; const struct usb_cdc_mbim_desc *mbim_desc; const struct usb_cdc_mbim_extended_desc *mbim_extended_desc; -- cgit v1.2.3 From e43b21906439ed14dda84f9784d38c03d0464607 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Mon, 1 Feb 2021 17:41:29 +0000 Subject: net: use indirect call helpers for dst_input This patch avoids the indirect call for the common case: ip_local_deliver and ip6_input Signed-off-by: Brian Vazquez Signed-off-by: Jakub Kicinski --- include/net/dst.h | 6 +++++- net/ipv4/ip_input.c | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 10f0a8399867..98cf6e8c06c4 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -18,6 +18,7 @@ #include #include #include +#include struct sk_buff; @@ -441,10 +442,13 @@ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *s return skb_dst(skb)->output(net, sk, skb); } +INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { - return skb_dst(skb)->input(skb); + return INDIRECT_CALL_INET(skb_dst(skb)->input, + ip6_input, ip_local_deliver, skb); } static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b0c244af1e4d..3a025c011971 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -253,6 +253,7 @@ int ip_local_deliver(struct sk_buff *skb) net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } +EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { -- cgit v1.2.3 From 6585d7dc491d9d5e323ed52ee32ad071e04c9dfa Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Mon, 1 Feb 2021 17:41:30 +0000 Subject: net: use indirect call helpers for dst_output This patch avoids the indirect call for the common case: ip6_output and ip_output Signed-off-by: Brian Vazquez Signed-off-by: Jakub Kicinski --- include/net/dst.h | 8 +++++++- net/ipv4/ip_output.c | 1 + net/ipv6/ip6_output.c | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 98cf6e8c06c4..3932e9931f08 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -436,10 +436,16 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) dst->expires = expires; } +INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, + struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return skb_dst(skb)->output(net, sk, skb); + return INDIRECT_CALL_INET(skb_dst(skb)->output, + ip6_output, ip_output, + net, sk, skb); } INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 959b94e32f2b..3aab53beb4ea 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -434,6 +434,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } +EXPORT_SYMBOL(ip_output); /* * copy saddr and daddr, possibly using 64bit load/stores diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 117cd95df213..ff4f9ebcf7f6 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -217,6 +217,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +EXPORT_SYMBOL(ip6_output); bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { -- cgit v1.2.3 From f67fbeaebdc0356e0cbc94f4b099f45ebe174b02 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Mon, 1 Feb 2021 17:41:31 +0000 Subject: net: use indirect call helpers for dst_mtu This patch avoids the indirect call for the common case: ip6_mtu and ipv4_mtu Signed-off-by: Brian Vazquez Signed-off-by: Jakub Kicinski --- include/net/dst.h | 4 +++- net/ipv4/route.c | 6 ++++-- net/ipv6/route.c | 6 ++++-- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 3932e9931f08..9f474a79ed7d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -194,9 +194,11 @@ dst_feature(const struct dst_entry *dst, u32 feature) return dst_metric(dst, RTAX_FEATURES) & feature; } +INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); +INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { - return dst->ops->mtu(dst); + return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e26652ff7059..4fac91f8bd6c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -135,7 +135,8 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); -static unsigned int ipv4_mtu(const struct dst_entry *dst); +INDIRECT_CALLABLE_SCOPE +unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -1311,7 +1312,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) return min(advmss, IPV4_MAX_PMTU - header_size); } -static unsigned int ipv4_mtu(const struct dst_entry *dst) +INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; @@ -1333,6 +1334,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +EXPORT_SYMBOL(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 92b400eb8476..886b2095097e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -83,7 +83,8 @@ enum rt6_nud_state { static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); -static unsigned int ip6_mtu(const struct dst_entry *dst); +INDIRECT_CALLABLE_SCOPE +unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, @@ -3089,7 +3090,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) return mtu; } -static unsigned int ip6_mtu(const struct dst_entry *dst) +INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { struct inet6_dev *idev; unsigned int mtu; @@ -3111,6 +3112,7 @@ out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +EXPORT_SYMBOL(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it -- cgit v1.2.3 From bbd807dfbf20506f5548b0297c430a09326e7c4b Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Mon, 1 Feb 2021 17:41:32 +0000 Subject: net: indirect call helpers for ipv4/ipv6 dst_check functions This patch avoids the indirect call for the common case: ip6_dst_check and ipv4_dst_check Signed-off-by: Brian Vazquez Signed-off-by: Jakub Kicinski --- include/net/dst.h | 7 ++++++- net/core/sock.c | 12 ++++++++++-- net/ipv4/route.c | 7 +++++-- net/ipv4/tcp_ipv4.c | 5 ++++- net/ipv6/route.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 5 ++++- 6 files changed, 34 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 9f474a79ed7d..26f134ad3a25 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -459,10 +459,15 @@ static inline int dst_input(struct sk_buff *skb) ip6_input, ip_local_deliver, skb); } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (dst->obsolete) - dst = dst->ops->check(dst, cookie); + dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, + ipv4_dst_check, dst, cookie); return dst; } diff --git a/net/core/sock.c b/net/core/sock.c index 648a5cb46209..0ed98f20448a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -526,11 +526,17 @@ discard_and_relse: } EXPORT_SYMBOL(__sk_receive_skb); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); @@ -546,7 +552,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4fac91f8bd6c..9e6537709794 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -133,7 +133,8 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; * Interface to generic destination cache. */ -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +INDIRECT_CALLABLE_SCOPE +struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); @@ -1188,7 +1189,8 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, + u32 cookie) { struct rtable *rt = (struct rtable *) dst; @@ -1204,6 +1206,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) return NULL; return dst; } +EXPORT_SYMBOL(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 777306b5bc22..611039207d30 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1649,6 +1649,8 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, return mss; } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * @@ -1668,7 +1670,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - !dst->ops->check(dst, 0)) { + !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, + dst, 0)) { dst_release(dst); sk->sk_rx_dst = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 886b2095097e..8d9e053dc071 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -81,7 +81,8 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +INDIRECT_CALLABLE_SCOPE +struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); @@ -2612,7 +2613,8 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, return NULL; } -static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, + u32 cookie) { struct dst_entry *dst_ret; struct fib6_info *from; @@ -2642,6 +2644,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) return dst_ret; } +EXPORT_SYMBOL(ip6_dst_check); static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e1509b02cb3..d093ef3ef060 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1420,6 +1420,8 @@ out: return NULL; } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * @@ -1473,7 +1475,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, + dst, np->rx_dst_cookie) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } -- cgit v1.2.3 From 65e6dcf73398ddb64bb782ff2acd918d3a37a53a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 29 Jan 2021 23:04:08 +0100 Subject: net, veth: Alloc skb in bulk for ndo_xdp_xmit Split ndo_xdp_xmit and ndo_start_xmit use cases in veth_xdp_rcv routine in order to alloc skbs in bulk for XDP_PASS verdict. Introduce xdp_alloc_skb_bulk utility routine to alloc skb bulk list. The proposed approach has been tested in the following scenario: eth (ixgbe) --> XDP_REDIRECT --> veth0 --> (remote-ns) veth1 --> XDP_PASS XDP_REDIRECT: xdp_redirect_map bpf sample XDP_PASS: xdp_rxq_info bpf sample traffic generator: pkt_gen sending udp traffic on a remote device bpf-next master: ~3.64Mpps bpf-next + skb bulking allocation: ~3.79Mpps Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Reviewed-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/a14a30d3c06fff24e13f836c733d80efc0bd6eb5.1611957532.git.lorenzo@kernel.org --- drivers/net/veth.c | 78 ++++++++++++++++++++++++++++++++++++++++-------------- include/net/xdp.h | 1 + net/core/xdp.c | 11 ++++++++ 3 files changed, 70 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 6e03b619c93c..aa1a66ad2ce5 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -35,6 +35,7 @@ #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) #define VETH_XDP_TX_BULK_SIZE 16 +#define VETH_XDP_BATCH 16 struct veth_stats { u64 rx_drops; @@ -562,14 +563,13 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, return 0; } -static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, - struct xdp_frame *frame, - struct veth_xdp_tx_bq *bq, - struct veth_stats *stats) +static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, + struct xdp_frame *frame, + struct veth_xdp_tx_bq *bq, + struct veth_stats *stats) { struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; - struct sk_buff *skb; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); @@ -623,13 +623,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, } rcu_read_unlock(); - skb = xdp_build_skb_from_frame(frame, rq->dev); - if (!skb) { - xdp_return_frame(frame); - stats->rx_drops++; - } - - return skb; + return frame; err_xdp: rcu_read_unlock(); xdp_return_frame(frame); @@ -637,6 +631,37 @@ xdp_xmit: return NULL; } +/* frames array contains VETH_XDP_BATCH at most */ +static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, + int n_xdpf, struct veth_xdp_tx_bq *bq, + struct veth_stats *stats) +{ + void *skbs[VETH_XDP_BATCH]; + int i; + + if (xdp_alloc_skb_bulk(skbs, n_xdpf, + GFP_ATOMIC | __GFP_ZERO) < 0) { + for (i = 0; i < n_xdpf; i++) + xdp_return_frame(frames[i]); + stats->rx_drops += n_xdpf; + + return; + } + + for (i = 0; i < n_xdpf; i++) { + struct sk_buff *skb = skbs[i]; + + skb = __xdp_build_skb_from_frame(frames[i], skb, + rq->dev); + if (!skb) { + xdp_return_frame(frames[i]); + stats->rx_drops++; + continue; + } + napi_gro_receive(&rq->xdp_napi, skb); + } +} + static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, struct veth_xdp_tx_bq *bq, @@ -784,32 +809,45 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - int i, done = 0; + int i, done = 0, n_xdpf = 0; + void *xdpf[VETH_XDP_BATCH]; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); - struct sk_buff *skb; if (!ptr) break; if (veth_is_xdp_frame(ptr)) { + /* ndo_xdp_xmit */ struct xdp_frame *frame = veth_ptr_to_xdp(ptr); stats->xdp_bytes += frame->len; - skb = veth_xdp_rcv_one(rq, frame, bq, stats); + frame = veth_xdp_rcv_one(rq, frame, bq, stats); + if (frame) { + /* XDP_PASS */ + xdpf[n_xdpf++] = frame; + if (n_xdpf == VETH_XDP_BATCH) { + veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, + bq, stats); + n_xdpf = 0; + } + } } else { - skb = ptr; + /* ndo_start_xmit */ + struct sk_buff *skb = ptr; + stats->xdp_bytes += skb->len; skb = veth_xdp_rcv_skb(rq, skb, bq, stats); + if (skb) + napi_gro_receive(&rq->xdp_napi, skb); } - - if (skb) - napi_gro_receive(&rq->xdp_napi, skb); - done++; } + if (n_xdpf) + veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); + u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; diff --git a/include/net/xdp.h b/include/net/xdp.h index c4bfdc9a8b79..a5bc214a49d9 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -169,6 +169,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/net/core/xdp.c b/net/core/xdp.c index 0d2630a35c3e..05354976c1fc 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -514,6 +514,17 @@ void xdp_warn(const char *msg, const char *func, const int line) }; EXPORT_SYMBOL_GPL(xdp_warn); +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +{ + n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, + n_skb, skbs); + if (unlikely(!n_skb)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From 012ce4dd3102a0f4d80167de343e9d44b257c1b8 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Tue, 2 Feb 2021 20:06:06 +0200 Subject: ethtool: Extend link modes settings uAPI with lanes Currently, when auto negotiation is on, the user can advertise all the linkmodes which correspond to a specific speed, but does not have a similar selector for the number of lanes. This is significant when a specific speed can be achieved using different number of lanes. For example, 2x50 or 4x25. Add 'ETHTOOL_A_LINKMODES_LANES' attribute and expand 'struct ethtool_link_settings' with lanes field in order to implement a new lanes-selector that will enable the user to advertise a specific number of lanes as well. When auto negotiation is off, lanes parameter can be forced only if the driver supports it. Add a capability bit in 'struct ethtool_ops' that allows ethtool know if the driver can handle the lanes parameter when auto negotiation is off, so if it does not, an error message will be returned when trying to set lanes. Example: $ ethtool -s swp1 lanes 4 $ ethtool swp1 Settings for swp1: Supported ports: [ FIBRE ] Supported link modes: 1000baseKX/Full 10000baseKR/Full 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 25000baseCR/Full 25000baseSR/Full 50000baseCR2/Full 100000baseSR4/Full 100000baseCR4/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: Yes Supported FEC modes: Not reported Advertised link modes: 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 100000baseSR4/Full 100000baseCR4/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Advertised FEC modes: Not reported Speed: Unknown! Duplex: Unknown! (255) Auto-negotiation: on Port: Direct Attach Copper PHYAD: 0 Transceiver: internal Link detected: no Signed-off-by: Danielle Ratson Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 11 ++-- include/linux/ethtool.h | 4 ++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/linkmodes.c | 96 +++++++++++++++++++++++----- net/ethtool/netlink.h | 2 +- 5 files changed, 93 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 30b98245979f..05073482db05 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -431,16 +431,17 @@ Request contents: ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ``ETHTOOL_A_LINKMODES_LANES`` u32 lanes ========================================== ====== ========================== ``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If autonegotiation is on (either set now or kept from before), advertised modes are not changed (no ``ETHTOOL_A_LINKMODES_OURS`` attribute) and at least one -of speed and duplex is specified, kernel adjusts advertised modes to all -supported modes matching speed, duplex or both (whatever is specified). This -autoselection is done on ethtool side with ioctl interface, netlink interface -is supposed to allow requesting changes without knowing what exactly kernel -supports. +of speed, duplex and lanes is specified, kernel adjusts advertised modes to all +supported modes matching speed, duplex, lanes or all (whatever is specified). +This autoselection is done on ethtool side with ioctl interface, netlink +interface is supposed to allow requesting changes without knowing what exactly +kernel supports. LINKSTATE_GET diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e3da25b51ae4..1ab13c5dfb2f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -128,6 +128,7 @@ struct ethtool_link_ksettings { __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); } link_modes; + u32 lanes; }; /** @@ -265,6 +266,8 @@ struct ethtool_pause_stats { /** * struct ethtool_ops - optional netdev operations + * @cap_link_lanes_supported: indicates if the driver supports lanes + * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. * @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. If not @@ -420,6 +423,7 @@ struct ethtool_pause_stats { * of the generic netdev features interface. */ struct ethtool_ops { + u32 cap_link_lanes_supported:1; u32 supported_coalesce_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index e2bf36e6964b..a286635ac9b8 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -227,6 +227,7 @@ enum { ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ + ETHTOOL_A_LINKMODES_LANES, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index bb8a3351fb72..db3e31fc6709 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -152,12 +152,47 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = { struct link_mode_info { int speed; + u8 lanes; u8 duplex; }; -#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ - [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ - .speed = SPEED_ ## _speed, \ +#define __LINK_MODE_LANES_CR 1 +#define __LINK_MODE_LANES_CR2 2 +#define __LINK_MODE_LANES_CR4 4 +#define __LINK_MODE_LANES_CR8 8 +#define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR8 8 +#define __LINK_MODE_LANES_KR 1 +#define __LINK_MODE_LANES_KR2 2 +#define __LINK_MODE_LANES_KR4 4 +#define __LINK_MODE_LANES_KR8 8 +#define __LINK_MODE_LANES_SR 1 +#define __LINK_MODE_LANES_SR2 2 +#define __LINK_MODE_LANES_SR4 4 +#define __LINK_MODE_LANES_SR8 8 +#define __LINK_MODE_LANES_ER 1 +#define __LINK_MODE_LANES_KX 1 +#define __LINK_MODE_LANES_KX4 4 +#define __LINK_MODE_LANES_LR 1 +#define __LINK_MODE_LANES_LR4 4 +#define __LINK_MODE_LANES_LR4_ER4 4 +#define __LINK_MODE_LANES_LR_ER_FR 1 +#define __LINK_MODE_LANES_LR2_ER2_FR2 2 +#define __LINK_MODE_LANES_LR4_ER4_FR4 4 +#define __LINK_MODE_LANES_LR8_ER8_FR8 8 +#define __LINK_MODE_LANES_LRM 1 +#define __LINK_MODE_LANES_MLD2 2 +#define __LINK_MODE_LANES_T 1 +#define __LINK_MODE_LANES_T1 1 +#define __LINK_MODE_LANES_X 1 +#define __LINK_MODE_LANES_FX 1 + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ .duplex = __DUPLEX_ ## _duplex \ } #define __DUPLEX_Half DUPLEX_HALF @@ -165,6 +200,7 @@ struct link_mode_info { #define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ .speed = SPEED_UNKNOWN, \ + .lanes = 0, \ .duplex = DUPLEX_UNKNOWN, \ } @@ -274,16 +310,17 @@ const struct nla_policy ethnl_linkmodes_set_policy[] = { [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_LANES] = NLA_POLICY_RANGE(NLA_U32, 1, 8), }; -/* Set advertised link modes to all supported modes matching requested speed - * and duplex values. Called when autonegotiation is on, speed or duplex is - * requested but no link mode change. This is done in userspace with ioctl() - * interface, move it into kernel for netlink. +/* Set advertised link modes to all supported modes matching requested speed, + * lanes and duplex values. Called when autonegotiation is on, speed, lanes or + * duplex is requested but no link mode change. This is done in userspace with + * ioctl() interface, move it into kernel for netlink. * Returns true if advertised modes bitmap was modified. */ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, - bool req_speed, bool req_duplex) + bool req_speed, bool req_lanes, bool req_duplex) { unsigned long *advertising = ksettings->link_modes.advertising; unsigned long *supported = ksettings->link_modes.supported; @@ -302,6 +339,7 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, continue; if (test_bit(i, supported) && (!req_speed || info->speed == ksettings->base.speed) && + (!req_lanes || info->lanes == ksettings->lanes) && (!req_duplex || info->duplex == ksettings->base.duplex)) set_bit(i, advertising); else @@ -327,7 +365,7 @@ static bool ethnl_validate_master_slave_cfg(u8 cfg) static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb) { - const struct nlattr *master_slave_cfg; + const struct nlattr *master_slave_cfg, *lanes_cfg; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; if (master_slave_cfg && @@ -337,16 +375,23 @@ static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb) return -EOPNOTSUPP; } + lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; + if (lanes_cfg && !is_power_of_2(nla_get_u32(lanes_cfg))) { + NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, + "lanes value is invalid"); + return -EINVAL; + } + return 0; } static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, - bool *mod) + bool *mod, const struct net_device *dev) { struct ethtool_link_settings *lsettings = &ksettings->base; - bool req_speed, req_duplex; - const struct nlattr *master_slave_cfg; + bool req_speed, req_lanes, req_duplex; + const struct nlattr *master_slave_cfg, *lanes_cfg; int ret; master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; @@ -360,10 +405,30 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; + req_lanes = tb[ETHTOOL_A_LINKMODES_LANES]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], mod); + + lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES]; + if (lanes_cfg) { + /* If autoneg is off and lanes parameter is not supported by the + * driver, return an error. + */ + if (!lsettings->autoneg && + !dev->ethtool_ops->cap_link_lanes_supported) { + NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg, + "lanes configuration not supported by device"); + return -EOPNOTSUPP; + } + } else if (!lsettings->autoneg) { + /* If autoneg is off and lanes parameter is not passed from user, + * set the lanes parameter to 0. + */ + ksettings->lanes = 0; + } + ret = ethnl_update_bitset(ksettings->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS, tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, @@ -372,13 +437,14 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, return ret; ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], mod); + ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && - (req_speed || req_duplex) && - ethnl_auto_linkmodes(ksettings, req_speed, req_duplex)) + (req_speed || req_lanes || req_duplex) && + ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex)) *mod = true; return 0; @@ -420,7 +486,7 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) goto out_ops; } - ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod); + ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev); if (ret < 0) goto out_ops; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index d8efec516d86..6eabd58d81bf 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -351,7 +351,7 @@ extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_O extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; -extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1]; +extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; -- cgit v1.2.3 From c8907043c6ac9ed58e6c1a76f2824be714b42228 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Tue, 2 Feb 2021 20:06:07 +0200 Subject: ethtool: Get link mode in use instead of speed and duplex parameters Currently, when user space queries the link's parameters, as speed and duplex, each parameter is passed from the driver to ethtool. Instead, get the link mode bit in use, and derive each of the parameters from it in ethtool. Signed-off-by: Danielle Ratson Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 1 + net/ethtool/common.c | 147 +++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/common.h | 7 +++ net/ethtool/ioctl.c | 18 +++++- net/ethtool/linkmodes.c | 157 +----------------------------------------------- 5 files changed, 174 insertions(+), 156 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1ab13c5dfb2f..ec4cd3921c67 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -129,6 +129,7 @@ struct ethtool_link_ksettings { __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); } link_modes; u32 lanes; + enum ethtool_link_mode_bit_indices link_mode; }; /** diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 181220101a6e..835b9bba3e7e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -198,6 +198,153 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); +#define __LINK_MODE_LANES_CR 1 +#define __LINK_MODE_LANES_CR2 2 +#define __LINK_MODE_LANES_CR4 4 +#define __LINK_MODE_LANES_CR8 8 +#define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR8 8 +#define __LINK_MODE_LANES_KR 1 +#define __LINK_MODE_LANES_KR2 2 +#define __LINK_MODE_LANES_KR4 4 +#define __LINK_MODE_LANES_KR8 8 +#define __LINK_MODE_LANES_SR 1 +#define __LINK_MODE_LANES_SR2 2 +#define __LINK_MODE_LANES_SR4 4 +#define __LINK_MODE_LANES_SR8 8 +#define __LINK_MODE_LANES_ER 1 +#define __LINK_MODE_LANES_KX 1 +#define __LINK_MODE_LANES_KX4 4 +#define __LINK_MODE_LANES_LR 1 +#define __LINK_MODE_LANES_LR4 4 +#define __LINK_MODE_LANES_LR4_ER4 4 +#define __LINK_MODE_LANES_LR_ER_FR 1 +#define __LINK_MODE_LANES_LR2_ER2_FR2 2 +#define __LINK_MODE_LANES_LR4_ER4_FR4 4 +#define __LINK_MODE_LANES_LR8_ER8_FR8 8 +#define __LINK_MODE_LANES_LRM 1 +#define __LINK_MODE_LANES_MLD2 2 +#define __LINK_MODE_LANES_T 1 +#define __LINK_MODE_LANES_T1 1 +#define __LINK_MODE_LANES_X 1 +#define __LINK_MODE_LANES_FX 1 + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ + .duplex = __DUPLEX_ ## _duplex \ + } +#define __DUPLEX_Half DUPLEX_HALF +#define __DUPLEX_Full DUPLEX_FULL +#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ + .speed = SPEED_UNKNOWN, \ + .lanes = 0, \ + .duplex = DUPLEX_UNKNOWN, \ + } + +const struct link_mode_info link_mode_params[] = { + __DEFINE_LINK_MODE_PARAMS(10, T, Half), + __DEFINE_LINK_MODE_PARAMS(10, T, Full), + __DEFINE_LINK_MODE_PARAMS(100, T, Half), + __DEFINE_LINK_MODE_PARAMS(100, T, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T, Half), + __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), + __DEFINE_SPECIAL_MODE_PARAMS(TP), + __DEFINE_SPECIAL_MODE_PARAMS(AUI), + __DEFINE_SPECIAL_MODE_PARAMS(MII), + __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), + __DEFINE_SPECIAL_MODE_PARAMS(BNC), + __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Pause), + __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), + __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Backplane), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + }, + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(1000, X, Full), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), + __DEFINE_LINK_MODE_PARAMS(2500, T, Full), + __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, T1, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full), +}; +static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); + const char netif_msg_class_names[][ETH_GSTRING_LEN] = { [NETIF_MSG_DRV_BIT] = "drv", [NETIF_MSG_PROBE_BIT] = "probe", diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 3d9251c95a8b..a9d071248698 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -14,6 +14,12 @@ #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) +struct link_mode_info { + int speed; + u8 lanes; + u8 duplex; +}; + extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; extern const char @@ -23,6 +29,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; +extern const struct link_mode_info link_mode_params[]; extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 771688e1b0da..24783b71c584 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -426,13 +426,29 @@ struct ethtool_link_usettings { int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { + const struct link_mode_info *link_info; + int err; + ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; memset(link_ksettings, 0, sizeof(*link_ksettings)); - return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); + + link_ksettings->link_mode = -1; + err = dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); + if (err) + return err; + + if (link_ksettings->link_mode != -1) { + link_info = &link_mode_params[link_ksettings->link_mode]; + link_ksettings->base.speed = link_info->speed; + link_ksettings->lanes = link_info->lanes; + link_ksettings->base.duplex = link_info->duplex; + } + + return 0; } EXPORT_SYMBOL(__ethtool_get_link_ksettings); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index db3e31fc6709..fc986d035b01 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -4,6 +4,8 @@ #include "common.h" #include "bitset.h" +/* LINKMODES_GET */ + struct linkmodes_req_info { struct ethnl_req_info base; }; @@ -150,158 +152,6 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = { /* LINKMODES_SET */ -struct link_mode_info { - int speed; - u8 lanes; - u8 duplex; -}; - -#define __LINK_MODE_LANES_CR 1 -#define __LINK_MODE_LANES_CR2 2 -#define __LINK_MODE_LANES_CR4 4 -#define __LINK_MODE_LANES_CR8 8 -#define __LINK_MODE_LANES_DR 1 -#define __LINK_MODE_LANES_DR2 2 -#define __LINK_MODE_LANES_DR4 4 -#define __LINK_MODE_LANES_DR8 8 -#define __LINK_MODE_LANES_KR 1 -#define __LINK_MODE_LANES_KR2 2 -#define __LINK_MODE_LANES_KR4 4 -#define __LINK_MODE_LANES_KR8 8 -#define __LINK_MODE_LANES_SR 1 -#define __LINK_MODE_LANES_SR2 2 -#define __LINK_MODE_LANES_SR4 4 -#define __LINK_MODE_LANES_SR8 8 -#define __LINK_MODE_LANES_ER 1 -#define __LINK_MODE_LANES_KX 1 -#define __LINK_MODE_LANES_KX4 4 -#define __LINK_MODE_LANES_LR 1 -#define __LINK_MODE_LANES_LR4 4 -#define __LINK_MODE_LANES_LR4_ER4 4 -#define __LINK_MODE_LANES_LR_ER_FR 1 -#define __LINK_MODE_LANES_LR2_ER2_FR2 2 -#define __LINK_MODE_LANES_LR4_ER4_FR4 4 -#define __LINK_MODE_LANES_LR8_ER8_FR8 8 -#define __LINK_MODE_LANES_LRM 1 -#define __LINK_MODE_LANES_MLD2 2 -#define __LINK_MODE_LANES_T 1 -#define __LINK_MODE_LANES_T1 1 -#define __LINK_MODE_LANES_X 1 -#define __LINK_MODE_LANES_FX 1 - -#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ - [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ - .speed = SPEED_ ## _speed, \ - .lanes = __LINK_MODE_LANES_ ## _type, \ - .duplex = __DUPLEX_ ## _duplex \ - } -#define __DUPLEX_Half DUPLEX_HALF -#define __DUPLEX_Full DUPLEX_FULL -#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ - [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ - .speed = SPEED_UNKNOWN, \ - .lanes = 0, \ - .duplex = DUPLEX_UNKNOWN, \ - } - -static const struct link_mode_info link_mode_params[] = { - __DEFINE_LINK_MODE_PARAMS(10, T, Half), - __DEFINE_LINK_MODE_PARAMS(10, T, Full), - __DEFINE_LINK_MODE_PARAMS(100, T, Half), - __DEFINE_LINK_MODE_PARAMS(100, T, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T, Half), - __DEFINE_LINK_MODE_PARAMS(1000, T, Full), - __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), - __DEFINE_SPECIAL_MODE_PARAMS(TP), - __DEFINE_SPECIAL_MODE_PARAMS(AUI), - __DEFINE_SPECIAL_MODE_PARAMS(MII), - __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), - __DEFINE_SPECIAL_MODE_PARAMS(BNC), - __DEFINE_LINK_MODE_PARAMS(10000, T, Full), - __DEFINE_SPECIAL_MODE_PARAMS(Pause), - __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), - __DEFINE_LINK_MODE_PARAMS(2500, X, Full), - __DEFINE_SPECIAL_MODE_PARAMS(Backplane), - __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), - [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { - .speed = SPEED_10000, - .duplex = DUPLEX_FULL, - }, - __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), - __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(1000, X, Full), - __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), - __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), - __DEFINE_LINK_MODE_PARAMS(2500, T, Full), - __DEFINE_LINK_MODE_PARAMS(5000, T, Full), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), - __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, T1, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), - __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), - __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, FX, Half), - __DEFINE_LINK_MODE_PARAMS(100, FX, Full), -}; - const struct nla_policy ethnl_linkmodes_set_policy[] = { [ETHTOOL_A_LINKMODES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -327,9 +177,6 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int i; - BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) != - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { -- cgit v1.2.3 From f5a5589c72509abaeb705123b64e7f5a078becf0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 2 Feb 2021 11:34:08 -0800 Subject: tcp: use a smaller percpu_counter batch size for sk_alloc Currently, a percpu_counter with the default batch size (2*nr_cpus) is used to record the total # of active sockets per protocol. This means sk_sockets_allocated_read_positive() could be off by +/-2*(nr_cpus^2). This under/over-estimation could lead to wrong memory suppression conditions in __sk_raise_mem_allocated(). Fix this by using a more reasonable fixed batch size of 16. See related commit cf86a086a180 ("net/dst: use a smaller percpu_counter batch for dst entries accounting") that addresses a similar issue. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Reviewed-by: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20210202193408.1171634-1-weiwan@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 129d200bccb4..690e496a0e79 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1350,14 +1350,18 @@ sk_memory_allocated_sub(struct sock *sk, int amt) atomic_long_sub(amt, sk->sk_prot->memory_allocated); } +#define SK_ALLOC_PERCPU_COUNTER_BATCH 16 + static inline void sk_sockets_allocated_dec(struct sock *sk) { - percpu_counter_dec(sk->sk_prot->sockets_allocated); + percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, + SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { - percpu_counter_inc(sk->sk_prot->sockets_allocated); + percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, + SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 -- cgit v1.2.3 From 3dd344ea84e122f791ab55498aab985535f32cba Mon Sep 17 00:00:00 2001 From: Hariharan Ananthakrishnan Date: Fri, 29 Jan 2021 00:12:10 +0000 Subject: net: tracepoint: exposing sk_family in all tcp:tracepoints Similar to sock:inet_sock_set_state tracepoint, expose sk_family to distinguish AF_INET and AF_INET6 families. The following tcp tracepoints are updated: tcp:tcp_destroy_sock tcp:tcp_rcv_space_adjust tcp:tcp_retransmit_skb tcp:tcp_send_reset tcp:tcp_receive_reset tcp:tcp_retransmit_synack tcp:tcp_probe Signed-off-by: Hariharan Ananthakrishnan Signed-off-by: Brendan Gregg Link: https://lore.kernel.org/r/20210129001210.344438-1-hari@netflix.com Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index cf97f6339acb..ba94857eea11 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -59,6 +59,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, __field(int, state) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) @@ -75,6 +76,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; @@ -86,7 +88,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", + TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", + show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) @@ -125,6 +128,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk, __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) @@ -140,6 +144,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk, __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; @@ -153,7 +158,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk, __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", + TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", + show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, @@ -192,6 +198,7 @@ TRACE_EVENT(tcp_retransmit_synack, __field(const void *, req) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) @@ -207,6 +214,7 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); + __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; @@ -218,7 +226,8 @@ TRACE_EVENT(tcp_retransmit_synack, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) @@ -238,6 +247,7 @@ TRACE_EVENT(tcp_probe, __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) @@ -264,6 +274,7 @@ TRACE_EVENT(tcp_probe, __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; + __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; @@ -276,7 +287,8 @@ TRACE_EVENT(tcp_probe, __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", + TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", + show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, -- cgit v1.2.3 From 49ecc587dca2754571791bebd36e9e36e2a0d973 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Wed, 3 Feb 2021 08:07:59 +0100 Subject: Revert "GTP: add support for flow based tunneling API" This reverts commit 9ab7e76aefc97a9aa664accb59d6e8dc5e52514a. This patch was committed without maintainer approval and despite a number of unaddressed concerns from review. There are several issues that impede the acceptance of this patch and that make a reversion of this particular instance of these changes the best way forward: i) the patch contains several logically separate changes that would be better served as smaller patches (for review purposes) ii) functionality like the handling of end markers has been introduced without further explanation iii) symmetry between the handling of GTPv0 and GTPv1 has been unnecessarily broken iv) the patchset produces 'broken' packets when extension headers are included v) there are no available userspace tools to allow for testing this functionality vi) there is an unaddressed Coverity report against the patch concering memory leakage vii) most importantly, the patch contains a large amount of superfluous churn that impedes other ongoing work with this driver This patch will be reworked into a series that aligns with other ongoing work and facilitates review. Signed-off-by: Jonas Bonn Acked-by: Harald Welte Acked-by: Pravin B Shelar Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 527 ++++++++++--------------------------- include/uapi/linux/gtp.h | 12 - include/uapi/linux/if_link.h | 1 - include/uapi/linux/if_tunnel.h | 1 - tools/include/uapi/linux/if_link.h | 1 - 5 files changed, 144 insertions(+), 398 deletions(-) (limited to 'include') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 851364314ecc..4c04e271f184 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -74,9 +73,6 @@ struct gtp_dev { unsigned int hash_size; struct hlist_head *tid_hash; struct hlist_head *addr_hash; - /* Used by LWT tunnel. */ - bool collect_md; - struct socket *collect_md_sock; }; static unsigned int gtp_net_id __read_mostly; @@ -183,121 +179,33 @@ static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx, return false; } -static int gtp_set_tun_dst(struct gtp_dev *gtp, struct sk_buff *skb, - unsigned int hdrlen, u8 gtp_version, - __be64 tid, u8 flags) +static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, + unsigned int hdrlen, unsigned int role) { - struct metadata_dst *tun_dst; - int opts_len = 0; - - if (unlikely(flags & GTP1_F_MASK)) - opts_len = sizeof(struct gtpu_metadata); - - tun_dst = udp_tun_rx_dst(skb, gtp->sk1u->sk_family, TUNNEL_KEY, tid, opts_len); - if (!tun_dst) { - netdev_dbg(gtp->dev, "Failed to allocate tun_dst"); - goto err; + if (!gtp_check_ms(skb, pctx, hdrlen, role)) { + netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); + return 1; } - netdev_dbg(gtp->dev, "attaching metadata_dst to skb, gtp ver %d hdrlen %d\n", - gtp_version, hdrlen); - if (unlikely(opts_len)) { - struct gtpu_metadata *opts; - struct gtp1_header *gtp1; - - opts = ip_tunnel_info_opts(&tun_dst->u.tun_info); - gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); - opts->ver = GTP_METADATA_V1; - opts->flags = gtp1->flags; - opts->type = gtp1->type; - netdev_dbg(gtp->dev, "recved control pkt: flag %x type: %d\n", - opts->flags, opts->type); - tun_dst->u.tun_info.key.tun_flags |= TUNNEL_GTPU_OPT; - tun_dst->u.tun_info.options_len = opts_len; - skb->protocol = htons(0xffff); /* Unknown */ - } /* Get rid of the GTP + UDP headers. */ if (iptunnel_pull_header(skb, hdrlen, skb->protocol, - !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)))) { - gtp->dev->stats.rx_length_errors++; - goto err; - } - - skb_dst_set(skb, &tun_dst->dst); - return 0; -err: - return -1; -} - -static int gtp_rx(struct gtp_dev *gtp, struct sk_buff *skb, - unsigned int hdrlen, u8 gtp_version, unsigned int role, - __be64 tid, u8 flags, u8 type) -{ - if (ip_tunnel_collect_metadata() || gtp->collect_md) { - int err; - - err = gtp_set_tun_dst(gtp, skb, hdrlen, gtp_version, tid, flags); - if (err) - goto err; - } else { - struct pdp_ctx *pctx; - - if (flags & GTP1_F_MASK) - hdrlen += 4; - - if (type != GTP_TPDU) - return 1; + !net_eq(sock_net(pctx->sk), dev_net(pctx->dev)))) + return -1; - if (gtp_version == GTP_V0) - pctx = gtp0_pdp_find(gtp, be64_to_cpu(tid)); - else - pctx = gtp1_pdp_find(gtp, be64_to_cpu(tid)); - if (!pctx) { - netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); - return 1; - } - - if (!gtp_check_ms(skb, pctx, hdrlen, role)) { - netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); - return 1; - } - /* Get rid of the GTP + UDP headers. */ - if (iptunnel_pull_header(skb, hdrlen, skb->protocol, - !net_eq(sock_net(pctx->sk), dev_net(gtp->dev)))) { - gtp->dev->stats.rx_length_errors++; - goto err; - } - } - netdev_dbg(gtp->dev, "forwarding packet from GGSN to uplink\n"); + netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n"); /* Now that the UDP and the GTP header have been removed, set up the * new network header. This is required by the upper layer to * calculate the transport header. */ skb_reset_network_header(skb); - if (pskb_may_pull(skb, sizeof(struct iphdr))) { - struct iphdr *iph; - - iph = ip_hdr(skb); - if (iph->version == 4) { - netdev_dbg(gtp->dev, "inner pkt: ipv4"); - skb->protocol = htons(ETH_P_IP); - } else if (iph->version == 6) { - netdev_dbg(gtp->dev, "inner pkt: ipv6"); - skb->protocol = htons(ETH_P_IPV6); - } else { - netdev_dbg(gtp->dev, "inner pkt error: Unknown type"); - } - } - skb->dev = gtp->dev; - dev_sw_netstats_rx_add(gtp->dev, skb->len); + skb->dev = pctx->dev; + + dev_sw_netstats_rx_add(pctx->dev, skb->len); + netif_rx(skb); return 0; - -err: - gtp->dev->stats.rx_dropped++; - return -1; } /* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */ @@ -306,6 +214,7 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp0_header); struct gtp0_header *gtp0; + struct pdp_ctx *pctx; if (!pskb_may_pull(skb, hdrlen)) return -1; @@ -315,7 +224,16 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) if ((gtp0->flags >> 5) != GTP_V0) return 1; - return gtp_rx(gtp, skb, hdrlen, GTP_V0, gtp->role, gtp0->tid, gtp0->flags, gtp0->type); + if (gtp0->type != GTP_TPDU) + return 1; + + pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid)); + if (!pctx) { + netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); + return 1; + } + + return gtp_rx(pctx, skb, hdrlen, gtp->role); } static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) @@ -323,30 +241,41 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp1_header); struct gtp1_header *gtp1; + struct pdp_ctx *pctx; if (!pskb_may_pull(skb, hdrlen)) return -1; gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); - netdev_dbg(gtp->dev, "GTPv1 recv: flags %x\n", gtp1->flags); if ((gtp1->flags >> 5) != GTP_V1) return 1; + if (gtp1->type != GTP_TPDU) + return 1; + /* From 29.060: "This field shall be present if and only if any one or * more of the S, PN and E flags are set.". * * If any of the bit is set, then the remaining ones also have to be * set. */ + if (gtp1->flags & GTP1_F_MASK) + hdrlen += 4; + /* Make sure the header is larger enough, including extensions. */ if (!pskb_may_pull(skb, hdrlen)) return -1; gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); - return gtp_rx(gtp, skb, hdrlen, GTP_V1, gtp->role, - key32_to_tunnel_id(gtp1->tid), gtp1->flags, gtp1->type); + pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid)); + if (!pctx) { + netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); + return 1; + } + + return gtp_rx(pctx, skb, hdrlen, gtp->role); } static void __gtp_encap_destroy(struct sock *sk) @@ -386,11 +315,6 @@ static void gtp_encap_disable(struct gtp_dev *gtp) { gtp_encap_disable_sock(gtp->sk0); gtp_encap_disable_sock(gtp->sk1u); - if (gtp->collect_md_sock) { - udp_tunnel_sock_release(gtp->collect_md_sock); - gtp->collect_md_sock = NULL; - netdev_dbg(gtp->dev, "GTP socket released.\n"); - } } /* UDP encapsulation receive handler. See net/ipv4/udp.c. @@ -405,8 +329,7 @@ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!gtp) return 1; - netdev_dbg(gtp->dev, "encap_recv sk=%p type %d\n", - sk, udp_sk(sk)->encap_type); + netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk); switch (udp_sk(sk)->encap_type) { case UDP_ENCAP_GTP0: @@ -460,13 +383,12 @@ static void gtp_dev_uninit(struct net_device *dev) static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, const struct sock *sk, - __be32 daddr, - __be32 saddr) + __be32 daddr) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; - fl4->saddr = saddr; + fl4->saddr = inet_sk(sk)->inet_saddr; fl4->flowi4_tos = RT_CONN_FLAGS(sk); fl4->flowi4_proto = sk->sk_protocol; @@ -490,7 +412,7 @@ static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) gtp0->tid = cpu_to_be64(pctx->u.v0.tid); } -static inline void gtp1_push_header(struct sk_buff *skb, __be32 tid) +static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) { int payload_len = skb->len; struct gtp1_header *gtp1; @@ -506,63 +428,46 @@ static inline void gtp1_push_header(struct sk_buff *skb, __be32 tid) gtp1->flags = 0x30; /* v1, GTP-non-prime. */ gtp1->type = GTP_TPDU; gtp1->length = htons(payload_len); - gtp1->tid = tid; + gtp1->tid = htonl(pctx->u.v1.o_tei); /* TODO: Suppport for extension header, sequence number and N-PDU. * Update the length field if any of them is available. */ } -static inline int gtp1_push_control_header(struct sk_buff *skb, - __be32 tid, - struct gtpu_metadata *opts, - struct net_device *dev) -{ - struct gtp1_header *gtp1c; - int payload_len; - - if (opts->ver != GTP_METADATA_V1) - return -ENOENT; +struct gtp_pktinfo { + struct sock *sk; + struct iphdr *iph; + struct flowi4 fl4; + struct rtable *rt; + struct pdp_ctx *pctx; + struct net_device *dev; + __be16 gtph_port; +}; - if (opts->type == 0xFE) { - /* for end marker ignore skb data. */ - netdev_dbg(dev, "xmit pkt with null data"); - pskb_trim(skb, 0); +static void gtp_push_header(struct sk_buff *skb, struct gtp_pktinfo *pktinfo) +{ + switch (pktinfo->pctx->gtp_version) { + case GTP_V0: + pktinfo->gtph_port = htons(GTP0_PORT); + gtp0_push_header(skb, pktinfo->pctx); + break; + case GTP_V1: + pktinfo->gtph_port = htons(GTP1U_PORT); + gtp1_push_header(skb, pktinfo->pctx); + break; } - if (skb_cow_head(skb, sizeof(*gtp1c)) < 0) - return -ENOMEM; - - payload_len = skb->len; - - gtp1c = skb_push(skb, sizeof(*gtp1c)); - - gtp1c->flags = opts->flags; - gtp1c->type = opts->type; - gtp1c->length = htons(payload_len); - gtp1c->tid = tid; - netdev_dbg(dev, "xmit control pkt: ver %d flags %x type %x pkt len %d tid %x", - opts->ver, opts->flags, opts->type, skb->len, tid); - return 0; } -struct gtp_pktinfo { - struct sock *sk; - __u8 tos; - struct flowi4 fl4; - struct rtable *rt; - struct net_device *dev; - __be16 gtph_port; -}; - static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo, - struct sock *sk, - __u8 tos, - struct rtable *rt, + struct sock *sk, struct iphdr *iph, + struct pdp_ctx *pctx, struct rtable *rt, struct flowi4 *fl4, struct net_device *dev) { pktinfo->sk = sk; - pktinfo->tos = tos; + pktinfo->iph = iph; + pktinfo->pctx = pctx; pktinfo->rt = rt; pktinfo->fl4 = *fl4; pktinfo->dev = dev; @@ -572,99 +477,40 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo) { struct gtp_dev *gtp = netdev_priv(dev); - struct gtpu_metadata *opts = NULL; - struct sock *sk = NULL; struct pdp_ctx *pctx; struct rtable *rt; struct flowi4 fl4; - u8 gtp_version; - __be16 df = 0; - __be32 tun_id; - __be32 daddr; - __be32 saddr; - __u8 tos; + struct iphdr *iph; + __be16 df; int mtu; - if (gtp->collect_md) { - /* LWT GTP1U encap */ - struct ip_tunnel_info *info = NULL; - - info = skb_tunnel_info(skb); - if (!info) { - netdev_dbg(dev, "missing tunnel info"); - return -ENOENT; - } - if (info->key.tp_dst && ntohs(info->key.tp_dst) != GTP1U_PORT) { - netdev_dbg(dev, "unexpected GTP dst port: %d", ntohs(info->key.tp_dst)); - return -EOPNOTSUPP; - } - pctx = NULL; - gtp_version = GTP_V1; - tun_id = tunnel_id_to_key32(info->key.tun_id); - daddr = info->key.u.ipv4.dst; - saddr = info->key.u.ipv4.src; - sk = gtp->sk1u; - if (!sk) { - netdev_dbg(dev, "missing tunnel sock"); - return -EOPNOTSUPP; - } - tos = info->key.tos; - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) - df = htons(IP_DF); - - if (info->options_len != 0) { - if (info->key.tun_flags & TUNNEL_GTPU_OPT) { - opts = ip_tunnel_info_opts(info); - } else { - netdev_dbg(dev, "missing tunnel metadata for control pkt"); - return -EOPNOTSUPP; - } - } - netdev_dbg(dev, "flow-based GTP1U encap: tunnel id %d\n", - be32_to_cpu(tun_id)); - } else { - struct iphdr *iph; - - if (ntohs(skb->protocol) != ETH_P_IP) - return -EOPNOTSUPP; - - iph = ip_hdr(skb); - - /* Read the IP destination address and resolve the PDP context. - * Prepend PDP header with TEI/TID from PDP ctx. - */ - if (gtp->role == GTP_ROLE_SGSN) - pctx = ipv4_pdp_find(gtp, iph->saddr); - else - pctx = ipv4_pdp_find(gtp, iph->daddr); + /* Read the IP destination address and resolve the PDP context. + * Prepend PDP header with TEI/TID from PDP ctx. + */ + iph = ip_hdr(skb); + if (gtp->role == GTP_ROLE_SGSN) + pctx = ipv4_pdp_find(gtp, iph->saddr); + else + pctx = ipv4_pdp_find(gtp, iph->daddr); - if (!pctx) { - netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", - &iph->daddr); - return -ENOENT; - } - sk = pctx->sk; - netdev_dbg(dev, "found PDP context %p\n", pctx); - - gtp_version = pctx->gtp_version; - tun_id = htonl(pctx->u.v1.o_tei); - daddr = pctx->peer_addr_ip4.s_addr; - saddr = inet_sk(sk)->inet_saddr; - tos = iph->tos; - df = iph->frag_off; - netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n", - &iph->saddr, &iph->daddr); + if (!pctx) { + netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", + &iph->daddr); + return -ENOENT; } + netdev_dbg(dev, "found PDP context %p\n", pctx); - rt = ip4_route_output_gtp(&fl4, sk, daddr, saddr); + rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer_addr_ip4.s_addr); if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to SSGN %pI4\n", &daddr); + netdev_dbg(dev, "no route to SSGN %pI4\n", + &pctx->peer_addr_ip4.s_addr); dev->stats.tx_carrier_errors++; goto err; } if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to SSGN %pI4\n", &daddr); + netdev_dbg(dev, "circular route to SSGN %pI4\n", + &pctx->peer_addr_ip4.s_addr); dev->stats.collisions++; goto err_rt; } @@ -672,10 +518,11 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* This is similar to tnl_update_pmtu(). */ + df = iph->frag_off; if (df) { mtu = dst_mtu(&rt->dst) - dev->hard_header_len - sizeof(struct iphdr) - sizeof(struct udphdr); - switch (gtp_version) { + switch (pctx->gtp_version) { case GTP_V0: mtu -= sizeof(struct gtp0_header); break; @@ -689,38 +536,17 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, false); - if (!skb_is_gso(skb) && (df & htons(IP_DF)) && mtu < skb->len) { - netdev_dbg(dev, "packet too big, fragmentation needed"); + if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && + mtu < ntohs(iph->tot_len)) { + netdev_dbg(dev, "packet too big, fragmentation needed\n"); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto err_rt; } - gtp_set_pktinfo_ipv4(pktinfo, sk, tos, rt, &fl4, dev); - - if (unlikely(opts)) { - int err; - - pktinfo->gtph_port = htons(GTP1U_PORT); - err = gtp1_push_control_header(skb, tun_id, opts, dev); - if (err) { - netdev_info(dev, "cntr pkt error %d", err); - goto err_rt; - } - return 0; - } - - switch (gtp_version) { - case GTP_V0: - pktinfo->gtph_port = htons(GTP0_PORT); - gtp0_push_header(skb, pctx); - break; - case GTP_V1: - pktinfo->gtph_port = htons(GTP1U_PORT); - gtp1_push_header(skb, tun_id); - break; - } + gtp_set_pktinfo_ipv4(pktinfo, pctx->sk, iph, pctx, rt, &fl4, dev); + gtp_push_header(skb, pktinfo); return 0; err_rt: @@ -731,6 +557,7 @@ err: static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + unsigned int proto = ntohs(skb->protocol); struct gtp_pktinfo pktinfo; int err; @@ -742,22 +569,32 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) /* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */ rcu_read_lock(); - err = gtp_build_skb_ip4(skb, dev, &pktinfo); + switch (proto) { + case ETH_P_IP: + err = gtp_build_skb_ip4(skb, dev, &pktinfo); + break; + default: + err = -EOPNOTSUPP; + break; + } rcu_read_unlock(); if (err < 0) goto tx_err; - udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb, - pktinfo.fl4.saddr, - pktinfo.fl4.daddr, - pktinfo.tos, - ip4_dst_hoplimit(&pktinfo.rt->dst), - 0, - pktinfo.gtph_port, - pktinfo.gtph_port, - true, - false); + switch (proto) { + case ETH_P_IP: + netdev_dbg(pktinfo.dev, "gtp -> IP src: %pI4 dst: %pI4\n", + &pktinfo.iph->saddr, &pktinfo.iph->daddr); + udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb, + pktinfo.fl4.saddr, pktinfo.fl4.daddr, + pktinfo.iph->tos, + ip4_dst_hoplimit(&pktinfo.rt->dst), + 0, + pktinfo.gtph_port, pktinfo.gtph_port, + true, false); + break; + } return NETDEV_TX_OK; tx_err: @@ -773,19 +610,6 @@ static const struct net_device_ops gtp_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, }; -static struct gtp_dev *gtp_find_flow_based_dev(struct net *net) -{ - struct gtp_net *gn = net_generic(net, gtp_net_id); - struct gtp_dev *gtp; - - list_for_each_entry(gtp, &gn->gtp_dev_list, list) { - if (gtp->collect_md) - return gtp; - } - - return NULL; -} - static void gtp_link_setup(struct net_device *dev) { dev->netdev_ops = >p_netdev_ops; @@ -810,7 +634,7 @@ static void gtp_link_setup(struct net_device *dev) } static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize); -static int gtp_encap_enable(struct gtp_dev *gtp, struct net_device *dev, struct nlattr *data[]); +static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]); static void gtp_destructor(struct net_device *dev) { @@ -828,24 +652,11 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, struct gtp_net *gn; int hashsize, err; - if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1] && - !data[IFLA_GTP_COLLECT_METADATA]) + if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1]) return -EINVAL; gtp = netdev_priv(dev); - if (data[IFLA_GTP_COLLECT_METADATA]) { - if (data[IFLA_GTP_FD0]) { - netdev_dbg(dev, "LWT device does not support setting v0 socket"); - return -EINVAL; - } - if (gtp_find_flow_based_dev(src_net)) { - netdev_dbg(dev, "LWT device already exist"); - return -EBUSY; - } - gtp->collect_md = true; - } - if (!data[IFLA_GTP_PDP_HASHSIZE]) { hashsize = 1024; } else { @@ -858,7 +669,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - err = gtp_encap_enable(gtp, dev, data); + err = gtp_encap_enable(gtp, data); if (err < 0) goto out_hashtable; @@ -872,7 +683,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, list_add_rcu(>p->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; - netdev_dbg(dev, "registered new GTP interface %s\n", dev->name); + netdev_dbg(dev, "registered new GTP interface\n"); return 0; @@ -903,7 +714,6 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_FD1] = { .type = NLA_U32 }, [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, [IFLA_GTP_ROLE] = { .type = NLA_U32 }, - [IFLA_GTP_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], @@ -927,9 +737,6 @@ static int gtp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_GTP_PDP_HASHSIZE, gtp->hash_size)) goto nla_put_failure; - if (gtp->collect_md && nla_put_flag(skb, IFLA_GTP_COLLECT_METADATA)) - goto nla_put_failure; - return 0; nla_put_failure: @@ -975,24 +782,35 @@ err1: return -ENOMEM; } -static int __gtp_encap_enable_socket(struct socket *sock, int type, - struct gtp_dev *gtp) +static struct sock *gtp_encap_enable_socket(int fd, int type, + struct gtp_dev *gtp) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; + struct socket *sock; struct sock *sk; + int err; + + pr_debug("enable gtp on %d, %d\n", fd, type); + + sock = sockfd_lookup(fd, &err); + if (!sock) { + pr_debug("gtp socket fd=%d not found\n", fd); + return NULL; + } sk = sock->sk; if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { - pr_debug("socket not UDP\n"); - return -EINVAL; + pr_debug("socket fd=%d not UDP\n", fd); + sk = ERR_PTR(-EINVAL); + goto out_sock; } lock_sock(sk); if (sk->sk_user_data) { - release_sock(sock->sk); - return -EBUSY; + sk = ERR_PTR(-EBUSY); + goto out_rel_sock; } sock_hold(sk); @@ -1003,58 +821,15 @@ static int __gtp_encap_enable_socket(struct socket *sock, int type, tuncfg.encap_destroy = gtp_encap_destroy; setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg); - release_sock(sock->sk); - return 0; -} -static struct sock *gtp_encap_enable_socket(int fd, int type, - struct gtp_dev *gtp) -{ - struct socket *sock; - int err; - - pr_debug("enable gtp on %d, %d\n", fd, type); - - sock = sockfd_lookup(fd, &err); - if (!sock) { - pr_debug("gtp socket fd=%d not found\n", fd); - return NULL; - } - err = __gtp_encap_enable_socket(sock, type, gtp); +out_rel_sock: + release_sock(sock->sk); +out_sock: sockfd_put(sock); - if (err) - return ERR_PTR(err); - - return sock->sk; + return sk; } -static struct socket *gtp_create_gtp_socket(struct gtp_dev *gtp, struct net_device *dev) -{ - struct udp_port_cfg udp_conf; - struct socket *sock; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - udp_conf.local_udp_port = htons(GTP1U_PORT); - - err = udp_sock_create(dev_net(dev), &udp_conf, &sock); - if (err < 0) { - pr_debug("create gtp sock failed: %d\n", err); - return ERR_PTR(err); - } - err = __gtp_encap_enable_socket(sock, UDP_ENCAP_GTP1U, gtp); - if (err) { - pr_debug("enable gtp sock encap failed: %d\n", err); - udp_tunnel_sock_release(sock); - return ERR_PTR(err); - } - pr_debug("create gtp sock done\n"); - return sock; -} - -static int gtp_encap_enable(struct gtp_dev *gtp, struct net_device *dev, struct nlattr *data[]) +static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) { struct sock *sk1u = NULL; struct sock *sk0 = NULL; @@ -1078,25 +853,11 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct net_device *dev, struct } } - if (data[IFLA_GTP_COLLECT_METADATA]) { - struct socket *sock; - - if (!sk1u) { - sock = gtp_create_gtp_socket(gtp, dev); - if (IS_ERR(sock)) - return PTR_ERR(sock); - - gtp->collect_md_sock = sock; - sk1u = sock->sk; - } else { - gtp->collect_md_sock = NULL; - } - } - if (data[IFLA_GTP_ROLE]) { role = nla_get_u32(data[IFLA_GTP_ROLE]); if (role > GTP_ROLE_SGSN) { - gtp_encap_disable(gtp); + gtp_encap_disable_sock(sk0); + gtp_encap_disable_sock(sk1u); return -EINVAL; } } @@ -1655,7 +1416,7 @@ static int __init gtp_init(void) if (err < 0) goto unreg_genl_family; - pr_info("GTP module loaded (pdp ctx size %zd bytes) with tnl-md support\n", + pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", sizeof(struct pdp_ctx)); return 0; diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 62aff78b7c56..79f9191bbb24 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,8 +2,6 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ -#include - #define GTP_GENL_MCGRP_NAME "gtp" enum gtp_genl_cmds { @@ -36,14 +34,4 @@ enum gtp_attrs { }; #define GTPA_MAX (__GTPA_MAX + 1) -enum { - GTP_METADATA_V1 -}; - -struct gtpu_metadata { - __u8 ver; - __u8 flags; - __u8 type; -}; - #endif /* _UAPI_LINUX_GTP_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index eb8018c3a737..91c8dda6d95d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -811,7 +811,6 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, - IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 802da679fab1..7d9105533c7b 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,7 +176,6 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) -#define TUNNEL_GTPU_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 28d649bda686..d208b2af697f 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -617,7 +617,6 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, - IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.2.3 From 7e3ce05e7f650371061d0b9eec1e1cf74ed6fca0 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 3 Feb 2021 22:48:16 -0300 Subject: netlink: add tracepoint at NL_SET_ERR_MSG Often userspace won't request the extack information, or they don't log it because of log level or so, and even when they do, sometimes it's not enough to know exactly what caused the error. Netlink extack is the standard way of reporting erros with descriptive error messages. With a trace point on it, we then can know exactly where the error happened, regardless of userspace app. Also, we can even see if the err msg was overwritten. The wrapper do_trace_netlink_extack() is because trace points shouldn't be called from .h files, as trace points are not that small, and the function call to do_trace_netlink_extack() on the macros is not protected by tracepoint_enabled() because the macros are called from modules, and this would require exporting some trace structs. As this is error path, it's better to export just the wrapper instead. v2: removed leftover tracepoint declaration Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: David Ahern Link: https://lore.kernel.org/r/4546b63e67b2989789d146498b13cc09e1fdc543.1612403190.git.marcelo.leitner@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 6 ++++++ include/trace/events/netlink.h | 29 +++++++++++++++++++++++++++++ net/netlink/af_netlink.c | 8 ++++++++ 3 files changed, 43 insertions(+) create mode 100644 include/trace/events/netlink.h (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 9f118771e248..0bcf98098c5a 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -11,6 +11,8 @@ struct net; +void do_trace_netlink_extack(const char *msg); + static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; @@ -90,6 +92,8 @@ struct netlink_ext_ack { static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ + do_trace_netlink_extack(__msg); \ + \ if (__extack) \ __extack->_msg = __msg; \ } while (0) @@ -110,6 +114,8 @@ struct netlink_ext_ack { static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ + do_trace_netlink_extack(__msg); \ + \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ diff --git a/include/trace/events/netlink.h b/include/trace/events/netlink.h new file mode 100644 index 000000000000..3b7be3b386a4 --- /dev/null +++ b/include/trace/events/netlink.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM netlink + +#if !defined(_TRACE_NETLINK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NETLINK_H + +#include + +TRACE_EVENT(netlink_extack, + + TP_PROTO(const char *msg), + + TP_ARGS(msg), + + TP_STRUCT__entry( + __string( msg, msg ) + ), + + TP_fast_assign( + __assign_str(msg, msg); + ), + + TP_printk("msg=%s", __get_str(msg)) +); + +#endif /* _TRACE_NETLINK_H */ + +/* This part must be outside protection */ +#include diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index daca50d6bb12..dd488938447f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -67,6 +67,8 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include #include "af_netlink.h" @@ -147,6 +149,12 @@ static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; +void do_trace_netlink_extack(const char *msg) +{ + trace_netlink_extack(msg); +} +EXPORT_SYMBOL(do_trace_netlink_extack); + static inline u32 netlink_group_mask(u32 group) { return group ? 1 << (group - 1) : 0; -- cgit v1.2.3 From 0053859496baa17c7675526936677c9213bf5a0d Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Thu, 4 Feb 2021 18:18:38 +0000 Subject: net: add EXPORT_INDIRECT_CALLABLE wrapper When a static function is annotated with INDIRECT_CALLABLE_SCOPE and CONFIG_RETPOLINE is set, the static keyword is removed. Sometimes the function needs to be exported but EXPORT_SYMBOL can't be used because if CONFIG_RETPOLINE is not set, we will attempt to export a static symbol. This patch introduces a new indirect call wrapper: EXPORT_INDIRECT_CALLABLE. This basically does EXPORT_SYMBOL when CONFIG_RETPOLINE is set, but does nothing when it's not. Reported-by: Stephen Rothwell Signed-off-by: Brian Vazquez Link: https://lore.kernel.org/r/20210204181839.558951-1-brianvv@google.com Signed-off-by: Jakub Kicinski --- include/linux/indirect_call_wrapper.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 54c02c84906a..a8345c8a613d 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -36,6 +36,7 @@ #define INDIRECT_CALLABLE_DECLARE(f) f #define INDIRECT_CALLABLE_SCOPE +#define EXPORT_INDIRECT_CALLABLE(f) EXPORT_SYMBOL(f) #else #define INDIRECT_CALL_1(f, f1, ...) f(__VA_ARGS__) @@ -44,6 +45,7 @@ #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) f(__VA_ARGS__) #define INDIRECT_CALLABLE_DECLARE(f) #define INDIRECT_CALLABLE_SCOPE static +#define EXPORT_INDIRECT_CALLABLE(f) #endif /* -- cgit v1.2.3 From 1d7bab6a94458e959f3f55788fd50ddc7d97403b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 2 Feb 2021 13:30:54 +0000 Subject: mm: constify page_is_pfmemalloc() argument The function only tests for page->index, so its argument should be const. Signed-off-by: Alexander Lobakin Reviewed-by: Jesse Brandeburg Acked-by: David Rientjes Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecdf8a8cd6ae..078633d43af9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1584,7 +1584,7 @@ struct address_space *page_mapping_file(struct page *page); * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ -static inline bool page_is_pfmemalloc(struct page *page) +static inline bool page_is_pfmemalloc(const struct page *page) { /* * Page index cannot be this large so this must be -- cgit v1.2.3 From 48f971c9c80a728646fc03367a28df747f20d0f4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 2 Feb 2021 13:31:05 +0000 Subject: skbuff: constify skb_propagate_pfmemalloc() "page" argument The function doesn't write anything to the page struct itself, so this argument can be const. Misc: align second argument to the brace while at it. Signed-off-by: Alexander Lobakin Reviewed-by: Jesse Brandeburg Acked-by: David Rientjes Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9313b5aaf45b..b027526da4f9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2943,8 +2943,8 @@ static inline struct page *dev_alloc_page(void) * @page: The page that was allocated from skb_alloc_page * @skb: The skb that may need pfmemalloc set */ -static inline void skb_propagate_pfmemalloc(struct page *page, - struct sk_buff *skb) +static inline void skb_propagate_pfmemalloc(const struct page *page, + struct sk_buff *skb) { if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; -- cgit v1.2.3 From bc38f30f8dbce0afb8af05d917bee084b1329418 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 2 Feb 2021 13:31:21 +0000 Subject: net: introduce common dev_page_is_reusable() A bunch of drivers test the page before reusing/recycling for two common conditions: - if a page was allocated under memory pressure (pfmemalloc page); - if a page was allocated at a distant memory node (to exclude slowdowns). Introduce a new common inline for doing this, with likely() already folded inside to make driver code a bit simpler. Suggested-by: David Rientjes Suggested-by: Jakub Kicinski Cc: John Hubbard Signed-off-by: Alexander Lobakin Reviewed-by: Jesse Brandeburg Acked-by: David Rientjes Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b027526da4f9..0e42c53b8ca9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2938,6 +2938,22 @@ static inline struct page *dev_alloc_page(void) return dev_alloc_pages(0); } +/** + * dev_page_is_reusable - check whether a page can be reused for network Rx + * @page: the page to test + * + * A page shouldn't be considered for reusing/recycling if it was allocated + * under memory pressure or at a distant memory node. + * + * Returns false if this page should be returned to page allocator, true + * otherwise. + */ +static inline bool dev_page_is_reusable(const struct page *page) +{ + return likely(page_to_nid(page) == numa_mem_id() && + !page_is_pfmemalloc(page)); +} + /** * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page * @page: The page that was allocated from skb_alloc_page -- cgit v1.2.3 From a4a600dd301ccde6ea239804ec1f19364a39d643 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 3 Feb 2021 16:54:22 +0800 Subject: udp: call udp_encap_enable for v6 sockets when enabling encap When enabling encap for a ipv6 socket without udp_encap_needed_key increased, UDP GRO won't work for v4 mapped v6 address packets as sk will be NULL in udp4_gro_receive(). This patch is to enable it by increasing udp_encap_needed_key for v6 sockets in udp_tunnel_encap_enable(), and correspondingly decrease udp_encap_needed_key in udpv6_destroy_sock(). v1->v2: - add udp_encap_disable() and export it. v2->v3: - add the change for rxrpc and bareudp into one patch, as Alex suggested. v3->v4: - move rxrpc part to another patch. Acked-by: Willem de Bruijn Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 6 ------ include/net/udp.h | 1 + include/net/udp_tunnel.h | 3 +-- net/ipv4/udp.c | 6 ++++++ net/ipv6/udp.c | 4 +++- 5 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 1b8f59713fc7..7511bca9c15e 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -240,12 +240,6 @@ static int bareudp_socket_create(struct bareudp_dev *bareudp, __be16 port) tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg); - /* As the setup_udp_tunnel_sock does not call udp_encap_enable if the - * socket type is v6 an explicit call to udp_encap_enable is needed. - */ - if (sock->sk->sk_family == AF_INET6) - udp_encap_enable(); - rcu_assign_pointer(bareudp->sock, sock); return 0; } diff --git a/include/net/udp.h b/include/net/udp.h index 01351ba25b87..5ddbb42fdb36 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -467,6 +467,7 @@ void udp_init(void); DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); +void udp_encap_disable(void); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 282d10ee60e1..afc7ce713657 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -181,9 +181,8 @@ static inline void udp_tunnel_encap_enable(struct socket *sock) #if IS_ENABLED(CONFIG_IPV6) if (sock->sk->sk_family == PF_INET6) ipv6_stub->udpv6_encap_enable(); - else #endif - udp_encap_enable(); + udp_encap_enable(); } #define UDP_TUNNEL_NIC_MAX_TABLES 4 diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69ea76578abb..48208fb4e895 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -596,6 +596,12 @@ void udp_encap_enable(void) } EXPORT_SYMBOL(udp_encap_enable); +void udp_encap_disable(void) +{ + static_branch_dec(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_disable); + /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b9f3dfdd2383..d75429205084 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1608,8 +1608,10 @@ void udpv6_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (up->encap_enabled) + if (up->encap_enabled) { static_branch_dec(&udpv6_encap_needed_key); + udp_encap_disable(); + } } inet6_destroy_sock(sk); -- cgit v1.2.3 From 1faba27f11c8da244e793546a1b35a9b1da8208e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 3 Feb 2021 15:51:09 +0200 Subject: ipv6: silence compilation warning for non-IPV6 builds The W=1 compilation of allmodconfig generates the following warning: net/ipv6/icmp.c:448:6: warning: no previous prototype for 'icmp6_send' [-Wmissing-prototypes] 448 | void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, | ^~~~~~~~~~ Fix it by providing function declaration for builds with ipv6 as a module. Signed-off-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- include/linux/icmpv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 1b3371ae8193..452d8978ffc7 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -16,9 +16,9 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr); -#if IS_BUILTIN(CONFIG_IPV6) void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr); +#if IS_BUILTIN(CONFIG_IPV6) static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { icmp6_send(skb, type, code, info, NULL); -- cgit v1.2.3 From f9a4719cc16fcf4f8816521e0c903a218fa072b6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 3 Feb 2021 15:51:10 +0200 Subject: ipv6: move udp declarations to net/udp.h Fix the following compilation warning: net/ipv6/udp.c:1031:30: warning: no previous prototype for 'udp_v6_early_demux' [-Wmissing-prototypes] 1031 | INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) | ^~~~~~~~~~~~~~~~~~ net/ipv6/udp.c:1072:29: warning: no previous prototype for 'udpv6_rcv' [-Wmissing-prototypes] 1072 | INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb) | ^~~~~~~~~ Signed-off-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- include/net/udp.h | 3 +++ net/ipv6/ip6_input.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 5ddbb42fdb36..a132a02b2f2c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -173,6 +173,9 @@ INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); + struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index e96304d8a4a7..e9d2a4a409aa 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -44,7 +45,6 @@ #include #include -INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *)); static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -352,7 +352,6 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, ip6_sublist_rcv(&sublist, curr_dev, curr_net); } -INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); /* -- cgit v1.2.3 From 04f00ab2275f60658b5e4996e28f52ab1bc51d75 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 3 Feb 2021 15:51:11 +0200 Subject: net/core: move gro function declarations to separate header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fir the following compilation warnings: 1031 | INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) net/ipv6/ip6_offload.c:182:41: warning: no previous prototype for ‘ipv6_gro_receive’ [-Wmissing-prototypes] 182 | INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, | ^~~~~~~~~~~~~~~~ net/ipv6/ip6_offload.c:320:29: warning: no previous prototype for ‘ipv6_gro_complete’ [-Wmissing-prototypes] 320 | INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) | ^~~~~~~~~~~~~~~~~ net/ipv6/ip6_offload.c:182:41: warning: no previous prototype for ‘ipv6_gro_receive’ [-Wmissing-prototypes] 182 | INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, | ^~~~~~~~~~~~~~~~ net/ipv6/ip6_offload.c:320:29: warning: no previous prototype for ‘ipv6_gro_complete’ [-Wmissing-prototypes] 320 | INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) Signed-off-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- include/net/gro.h | 12 ++++++++++++ net/core/dev.c | 7 +------ net/ipv6/ip6_offload.c | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 include/net/gro.h (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h new file mode 100644 index 000000000000..8a6eb5303cc4 --- /dev/null +++ b/include/net/gro.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_IPV6_GRO_H +#define _NET_IPV6_GRO_H + +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); +#endif /* _NET_IPV6_GRO_H */ diff --git a/net/core/dev.c b/net/core/dev.c index aae116d059da..21d74d30f5d7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -101,6 +101,7 @@ #include #include #include +#include #include #include #include @@ -5751,8 +5752,6 @@ static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) gro_normal_list(napi); } -INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; @@ -5921,10 +5920,6 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) napi_gro_complete(napi, oldest); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, - struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index a80f90bf3ae7..1b9827ff8ccf 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "ip6_offload.h" -- cgit v1.2.3 From edf597da02a01edb26bddf06890fb81eee3d82cf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 3 Feb 2021 15:51:12 +0200 Subject: netfilter: move handlers to net/ip_vs.h Fix the following compilation warnings: net/netfilter/ipvs/ip_vs_proto_tcp.c:147:1: warning: no previous prototype for 'tcp_snat_handler' [-Wmissing-prototypes] 147 | tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, | ^~~~~~~~~~~~~~~~ net/netfilter/ipvs/ip_vs_proto_udp.c:136:1: warning: no previous prototype for 'udp_snat_handler' [-Wmissing-prototypes] 136 | udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, | ^~~~~~~~~~~~~~~~ Signed-off-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 11 +++++++++++ net/netfilter/ipvs/ip_vs_core.c | 12 ------------ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d609e957a3ec..7cb5a1aace40 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1712,4 +1712,15 @@ ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) atomic_read(&dest->inactconns); } +#ifdef CONFIG_IP_VS_PROTO_TCP +INDIRECT_CALLABLE_DECLARE(int + tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP +INDIRECT_CALLABLE_DECLARE(int + udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); +#endif #endif /* _NET_IP_VS_H */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 54e086c65721..0c132ff9b446 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,18 +68,6 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); -#ifdef CONFIG_IP_VS_PROTO_TCP -INDIRECT_CALLABLE_DECLARE(int - tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP -INDIRECT_CALLABLE_DECLARE(int - udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); -#endif - #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) #define SNAT_CALL(f, ...) \ INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) -- cgit v1.2.3 From 10742efc20a429b2040658af685d6bb2aa674a73 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 21 Jan 2021 19:41:52 +0200 Subject: net/mlx5e: VF tunnel TX traffic offloading When tunnel endpoint is on VF, driver still assumes that endpoint is on uplink and incorrectly configures encap rule offload according to that assumption. As a result, traffic is sent directly to the uplink and rules installed on representor of tunnel endpoint VF are ignored. Implement following changes to allow offloading tx traffic with tunnel endpoint on VF: - For tunneling flows perform route lookup on route and out devices pair. If out device is uplink and route device is VF of same physical port, then modify packet reg_c_0 metadata register (source port) with the value of VF vport. Use eswitch vhca_id->vport mapping introduced in one of previous patches in the series to obtain vport from route netdevice. - Recirculate encapsulated packets to VF vport in order to apply any flow rules installed on VF representor that match on encapsulated traffic. Only enable support for this functionality when all following conditions are true: - Hardware advertises capability to preserve reg_c_0 value on packet recirculation. - Vport metadata matching is enabled. - Termination tables are to be used by the flow. Example TC rules for VF tunnel traffic: 1. Rule that redirects packets from UL to VF rep that has the tunnel endpoint IP address: $ tc -s filter show dev enp8s0f0 ingress filter protocol ip pref 4 flower chain 0 filter protocol ip pref 4 flower chain 0 handle 0x1 dst_mac 16:c9:a0:2d:69:2c src_mac 0c:42:a1:58:ab:e4 eth_type ipv4 ip_flags nofrag in_hw in_hw_count 1 action order 1: mirred (Egress Redirect to device enp8s0f0_0) stolen index 3 ref 1 bind 1 installed 377 sec used 0 sec Action statistics: Sent 114096 bytes 952 pkt (dropped 0, overlimits 0 requeues 0) Sent software 0 bytes 0 pkt Sent hardware 114096 bytes 952 pkt backlog 0b 0p requeues 0 cookie 878fa48d8c423fc08c3b6ca599b50a97 no_percpu used_hw_stats delayed 2. Rule that decapsulates the tunneled flow and redirects to destination VF representor: $ tc -s filter show dev vxlan_sys_4789 ingress filter protocol ip pref 4 flower chain 0 filter protocol ip pref 4 flower chain 0 handle 0x1 dst_mac ca:2e:a7:3f:f5:0f src_mac 0a:40:bd:30:89:99 eth_type ipv4 enc_dst_ip 7.7.7.5 enc_src_ip 7.7.7.1 enc_key_id 98 enc_dst_port 4789 enc_tos 0 ip_flags nofrag in_hw in_hw_count 1 action order 1: tunnel_key unset pipe index 2 ref 1 bind 1 installed 434 sec used 434 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 used_hw_stats delayed action order 2: mirred (Egress Redirect to device enp8s0f0_1) stolen index 4 ref 1 bind 1 installed 434 sec used 0 sec Action statistics: Sent 129936 bytes 1082 pkt (dropped 0, overlimits 0 requeues 0) Sent software 0 bytes 0 pkt Sent hardware 129936 bytes 1082 pkt backlog 0b 0p requeues 0 cookie ac17cf398c4c69e4a5b2f7aabd1b88ff no_percpu used_hw_stats delayed Co-developed-by: Dmytro Linkin Signed-off-by: Dmytro Linkin Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 87 +++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 + .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 120 +++++++++++++++++++-- include/linux/mlx5/eswitch.h | 2 + 5 files changed, 201 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 280ea1e1e039..43f1508a05b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -165,6 +165,11 @@ struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = { .moffset = 0, .mlen = 2, }, + [VPORT_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0, + .moffset = 2, + .mlen = 2, + }, [TUNNEL_TO_REG] = { .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1, .moffset = 1, @@ -1315,6 +1320,44 @@ static void remove_unready_flow(struct mlx5e_tc_flow *flow) mutex_unlock(&uplink_priv->unready_flows_lock); } +static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv); + +static bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_dev) +{ + struct mlx5_core_dev *out_mdev, *route_mdev; + struct mlx5e_priv *out_priv, *route_priv; + + out_priv = netdev_priv(out_dev); + out_mdev = out_priv->mdev; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + if (out_mdev->coredev_type != MLX5_COREDEV_PF || + route_mdev->coredev_type != MLX5_COREDEV_VF) + return false; + + return same_hw_devs(out_priv, route_priv); +} + +static int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, + u16 *vport) +{ + struct mlx5e_priv *out_priv, *route_priv; + struct mlx5_core_dev *route_mdev; + struct mlx5_eswitch *esw; + u16 vhca_id; + int err; + + out_priv = netdev_priv(out_dev); + esw = out_priv->mdev->priv.eswitch; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id); + err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport); + return err; +} + static int mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, @@ -3700,6 +3743,45 @@ static bool is_duplicated_encap_entry(struct mlx5e_priv *priv, return false; } +static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + struct net_device *out_dev, + int route_dev_ifindex, + int out_index) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct net_device *route_dev; + u16 vport_num; + int err = 0; + u32 data; + + route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex); + + if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops || + !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) + goto out; + + err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num); + if (err) + goto out; + + attr->dest_chain = 0; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE; + data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch, + vport_num); + err = mlx5e_tc_match_to_reg_set(esw->dev, mod_hdr_acts, + MLX5_FLOW_NAMESPACE_FDB, VPORT_TO_REG, data); + if (err) + goto out; + +out: + if (route_dev) + dev_put(route_dev); + return err; +} + static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct net_device *mirred_dev, @@ -3791,6 +3873,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, e->compl_result = 1; attach_flow: + err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev, + e->route_dev_ifindex, out_index); + if (err) + goto out_err; + flow->encaps[out_index].e = e; list_add(&flow->encaps[out_index].list, &e->flows); flow->encaps[out_index].index = out_index; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 4a2ce241522e..56d809904ea7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -167,6 +167,7 @@ void mlx5e_tc_reoffload_flows_work(struct work_struct *work); enum mlx5e_tc_attr_to_reg { CHAIN_TO_REG, + VPORT_TO_REG, TUNNEL_TO_REG, CTSTATE_TO_REG, ZONE_TO_REG, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 1a045e95bc68..1ab34751329e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -389,12 +389,14 @@ enum mlx5_flow_match_level { enum { MLX5_ESW_DEST_ENCAP = BIT(0), MLX5_ESW_DEST_ENCAP_VALID = BIT(1), + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE = BIT(2), }; enum { MLX5_ESW_ATTR_FLAG_VLAN_HANDLED = BIT(0), MLX5_ESW_ATTR_FLAG_SLOW_PATH = BIT(1), MLX5_ESW_ATTR_FLAG_NO_IN_PORT = BIT(2), + MLX5_ESW_ATTR_FLAG_SRC_REWRITE = BIT(3), }; struct mlx5_esw_flow_attr { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 335dc83d1bb9..1b18f624e04a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -337,6 +337,65 @@ esw_setup_chain_dest(struct mlx5_flow_destination *dest, return 0; } +static void esw_put_dest_tables_loop(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr, + int from, int to) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + int i; + + for (i = from; i < to; i++) + if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + mlx5_chains_put_table(chains, 0, 1, 0); +} + +static bool +esw_is_chain_src_port_rewrite(struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *esw_attr) +{ + int i; + + for (i = esw_attr->split_count; i < esw_attr->out_count; i++) + if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + return true; + return false; +} + +static int +esw_setup_chain_src_port_rewrite(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, + struct mlx5_fs_chains *chains, + struct mlx5_flow_attr *attr, + int *i) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + int j, err; + + if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE)) + return -EOPNOTSUPP; + + for (j = esw_attr->split_count; j < esw_attr->out_count; j++, (*i)++) { + err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, 1, 0, *i); + if (err) + goto err_setup_chain; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_act->pkt_reformat = esw_attr->dests[j].pkt_reformat; + } + return 0; + +err_setup_chain: + esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, j); + return err; +} + +static void esw_cleanup_chain_src_port_rewrite(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + + esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, esw_attr->out_count); +} + static void esw_cleanup_chain_dest(struct mlx5_fs_chains *chains, u32 chain, u32 prio, u32 level) { @@ -381,12 +440,18 @@ esw_setup_dests(struct mlx5_flow_destination *dest, struct mlx5_flow_act *flow_act, struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, int *i) { struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; struct mlx5_fs_chains *chains = esw_chains(esw); int err = 0; + if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) && + MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) && + mlx5_eswitch_vport_match_metadata_enabled(esw)) + attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE; + if (attr->dest_ft) { esw_setup_ft_dest(dest, flow_act, attr, *i); (*i)++; @@ -397,6 +462,8 @@ esw_setup_dests(struct mlx5_flow_destination *dest, err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, 1, 0, *i); (*i)++; + } else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) { + err = esw_setup_chain_src_port_rewrite(dest, flow_act, esw, chains, attr, i); } else { *i = esw_setup_vport_dests(dest, flow_act, esw, esw_attr, *i); } @@ -408,10 +475,15 @@ static void esw_cleanup_dests(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr) { + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; struct mlx5_fs_chains *chains = esw_chains(esw); - if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH) && attr->dest_chain) - esw_cleanup_chain_dest(chains, attr->dest_chain, 1, 0); + if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH)) { + if (attr->dest_chain) + esw_cleanup_chain_dest(chains, attr->dest_chain, 1, 0); + else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) + esw_cleanup_chain_src_port_rewrite(esw, attr); + } } struct mlx5_flow_handle * @@ -448,10 +520,12 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, } } + mlx5_eswitch_set_rule_flow_source(esw, spec, esw_attr); + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { int err; - err = esw_setup_dests(dest, &flow_act, esw, attr, &i); + err = esw_setup_dests(dest, &flow_act, esw, attr, spec, &i); if (err) { rule = ERR_PTR(err); goto err_create_goto_table; @@ -498,8 +572,6 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, goto err_esw_get; } - mlx5_eswitch_set_rule_flow_source(esw, spec, esw_attr); - if (mlx5_eswitch_termtbl_required(esw, attr, &flow_act, spec)) rule = mlx5_eswitch_add_termtbl_rule(esw, fdb, spec, esw_attr, &flow_act, dest, i); @@ -536,7 +608,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, struct mlx5_flow_table *fast_fdb; struct mlx5_flow_table *fwd_fdb; struct mlx5_flow_handle *rule; - int i; + int i, err = 0; fast_fdb = mlx5_chains_get_table(chains, attr->chain, attr->prio, 0); if (IS_ERR(fast_fdb)) { @@ -554,8 +626,18 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, } flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - for (i = 0; i < esw_attr->split_count; i++) - esw_setup_vport_dest(dest, &flow_act, esw, esw_attr, i, i, false); + for (i = 0; i < esw_attr->split_count; i++) { + if (esw_is_chain_src_port_rewrite(esw, esw_attr)) + err = esw_setup_chain_src_port_rewrite(dest, &flow_act, esw, chains, attr, + &i); + else + esw_setup_vport_dest(dest, &flow_act, esw, esw_attr, i, i, false); + + if (err) { + rule = ERR_PTR(err); + goto err_chain_src_rewrite; + } + } dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest[i].ft = fwd_fdb; i++; @@ -570,13 +652,16 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i); - if (IS_ERR(rule)) - goto add_err; + if (IS_ERR(rule)) { + i = esw_attr->split_count; + goto err_chain_src_rewrite; + } atomic64_inc(&esw->offloads.num_flows); return rule; -add_err: +err_chain_src_rewrite: + esw_put_dest_tables_loop(esw, attr, 0, i); esw_vport_tbl_put(esw, &fwd_attr); err_get_fwd: mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); @@ -617,6 +702,7 @@ __mlx5_eswitch_del_rule(struct mlx5_eswitch *esw, if (fwd_rule) { esw_vport_tbl_put(esw, &fwd_attr); mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); + esw_put_dest_tables_loop(esw, attr, 0, esw_attr->split_count); } else { if (split) esw_vport_tbl_put(esw, &fwd_attr); @@ -3020,3 +3106,15 @@ int mlx5_eswitch_vhca_id_to_vport(struct mlx5_eswitch *esw, u16 vhca_id, u16 *vp *vport_num = *res; return 0; } + +u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, + u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (WARN_ON_ONCE(IS_ERR(vport))) + return 0; + + return vport->metadata; +} +EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_set); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 29fd832950e0..67e341274a22 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -96,6 +96,8 @@ static inline u32 mlx5_eswitch_get_vport_metadata_mask(void) u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num); +u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, + u16 vport_num); u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ -- cgit v1.2.3 From 48d216e5596a58e3cfa6d4548343f982c5921b79 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 31 Aug 2020 16:18:19 +0300 Subject: net/mlx5e: Refactor reg_c1 usage Following patch in series uses reg_c1 in eswitch code. To use reg_c1 helpers in both TC and eswitch code, refactor existing helpers according to similar use case of reg_c0 and move the functionality into eswitch.h. Calculate reg mappings length from new defines to ensure that they are always in sync and only need to be changed in single place. Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h | 6 ++---- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 2 +- include/linux/mlx5/eswitch.h | 19 +++++++++++++++++++ 5 files changed, 26 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index 76177f7c5ec2..14bcebd4a0b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -651,7 +651,7 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, tc_skb_ext->chain = chain; - zone_restore_id = reg_c1 & ZONE_RESTORE_MAX; + zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK; uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); uplink_priv = &uplink_rpriv->uplink_priv; @@ -660,7 +660,7 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, return false; } - tunnel_id = reg_c1 >> REG_MAPPING_SHIFT(TUNNEL_TO_REG); + tunnel_id = reg_c1 >> ESW_TUN_OFFSET; return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id); #endif /* CONFIG_NET_TC_SKB_EXT */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h index 6503b614337c..69e618d17071 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h @@ -73,7 +73,7 @@ struct mlx5_ct_attr { #define zone_restore_to_reg_ct {\ .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1,\ .moffset = 0,\ - .mlen = 1,\ + .mlen = (ESW_ZONE_ID_BITS / 8),\ .soffset = MLX5_BYTE_OFF(fte_match_param,\ misc_parameters_2.metadata_reg_c_1) + 3,\ } @@ -81,14 +81,12 @@ struct mlx5_ct_attr { #define nic_zone_restore_to_reg_ct {\ .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_B,\ .moffset = 2,\ - .mlen = 1,\ + .mlen = (ESW_ZONE_ID_BITS / 8),\ } #define REG_MAPPING_MLEN(reg) (mlx5e_tc_attr_to_reg_mappings[reg].mlen) #define REG_MAPPING_MOFFSET(reg) (mlx5e_tc_attr_to_reg_mappings[reg].moffset) #define REG_MAPPING_SHIFT(reg) (REG_MAPPING_MOFFSET(reg) * 8) -#define ZONE_RESTORE_BITS (REG_MAPPING_MLEN(ZONE_RESTORE_TO_REG) * 8) -#define ZONE_RESTORE_MAX GENMASK(ZONE_RESTORE_BITS - 1, 0) #if IS_ENABLED(CONFIG_MLX5_TC_CT) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 098f3efa5d4d..90db5a99879d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -173,7 +173,7 @@ struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = { [TUNNEL_TO_REG] = { .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1, .moffset = 1, - .mlen = 3, + .mlen = ((ESW_TUN_OPTS_BITS + ESW_TUN_ID_BITS) / 8), .soffset = MLX5_BYTE_OFF(fte_match_param, misc_parameters_2.metadata_reg_c_1), }, @@ -5649,7 +5649,7 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, tc_skb_ext->chain = chain; zone_restore_id = (reg_b >> REG_MAPPING_SHIFT(NIC_ZONE_RESTORE_TO_REG)) & - ZONE_RESTORE_MAX; + ESW_ZONE_ID_MASK; if (!mlx5e_tc_ct_restore_flow(tc->ct, skb, zone_restore_id)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index ee0029192504..1e4ee02bfb1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -302,7 +302,7 @@ static inline bool mlx5e_cqe_regb_chain(struct mlx5_cqe64 *cqe) reg_b = be32_to_cpu(cqe->ft_metadata); - if (reg_b >> (MLX5E_TC_TABLE_CHAIN_TAG_BITS + ZONE_RESTORE_BITS)) + if (reg_b >> (MLX5E_TC_TABLE_CHAIN_TAG_BITS + ESW_ZONE_ID_BITS)) return false; chain = reg_b & MLX5E_TC_TABLE_CHAIN_TAG_MASK; diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 67e341274a22..3b20e84049c1 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -98,6 +98,25 @@ u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num); u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, u16 vport_num); + +/* Reg C1 usage: + * Reg C1 = < ESW_TUN_ID(12) | ESW_TUN_OPTS(12) | ESW_ZONE_ID(8) > + * + * Highest 12 bits of reg c1 is the encapsulation tunnel id, next 12 bits is + * encapsulation tunnel options, and the lowest 8 bits are used for zone id. + * + * Zone id is used to restore CT flow when packet misses on chain. + * + * Tunnel id and options are used together to restore the tunnel info metadata + * on miss and to support inner header rewrite by means of implicit chain 0 + * flows. + */ +#define ESW_ZONE_ID_BITS 8 +#define ESW_TUN_OPTS_BITS 12 +#define ESW_TUN_ID_BITS 12 +#define ESW_TUN_OFFSET ESW_ZONE_ID_BITS +#define ESW_ZONE_ID_MASK GENMASK(ESW_ZONE_ID_BITS - 1, 0) + u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ -- cgit v1.2.3 From 8e404fefa58b6138531e3d4b5647ee79f75ae9a8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 31 Aug 2020 16:18:57 +0300 Subject: net/mlx5e: Match recirculated packet miss in slow table using reg_c1 Previous patch in series that implements stack devices RX path implements indirect table rules that match on tunnel VNI. After such rule is created all tunnel traffic is recirculated to root table. However, recirculated packet might not match on any rules installed in the table (for example, when IP traffic follows ARP traffic). In that case packets appear on representor of tunnel endpoint VF instead being redirected to the VF itself. Extend slow table with additional flow group that matches on reg_c0 (source port value set by indirect tables implemented by previous patch in series) and reg_c1 (special 0xFFF mark). When creating offloads fdb tables, install one rule per VF vport to match on recirculated miss packets and redirect them to appropriate VF vport. Modify indirect tables code to also rewrite reg_c1 with special 0xFFF mark. Implementation reuses reg_c1 tunnel id bits. This is safe to do because recirculated packets are always matched before decapsulation. Signed-off-by: Vlad Buslov Signed-off-by: Dmytro Linkin Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 +- .../ethernet/mellanox/mlx5/core/esw/indir_table.c | 17 ++- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 + .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 119 ++++++++++++++++++++- include/linux/mlx5/eswitch.h | 10 +- 5 files changed, 143 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 90db5a99879d..21568d1fc00f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -5515,7 +5515,8 @@ int mlx5e_tc_esw_init(struct rhashtable *tc_ht) } uplink_priv->tunnel_mapping = mapping; - mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK, true); + /* 0xFFF is reserved for stack devices slow path table mark */ + mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK - 1, true); if (IS_ERR(mapping)) { err = PTR_ERR(mapping); goto err_enc_opts_mapping; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c index a0ebf40c9907..b7d00c4c7046 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c @@ -162,7 +162,7 @@ static int mlx5_esw_indir_table_rule_get(struct mlx5_eswitch *esw, (attr->ip_version == 4 ? ETH_P_IP : ETH_P_IPV6)); } else { err = -EOPNOTSUPP; - goto err_mod_hdr; + goto err_ethertype; } if (attr->ip_version == 4) { @@ -198,13 +198,18 @@ static int mlx5_esw_indir_table_rule_get(struct mlx5_eswitch *esw, err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB, VPORT_TO_REG, data); if (err) - goto err_mod_hdr; + goto err_mod_hdr_regc0; + + err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB, + TUNNEL_TO_REG, ESW_TUN_SLOW_TABLE_GOTO_VPORT); + if (err) + goto err_mod_hdr_regc1; flow_act.modify_hdr = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_FDB, mod_acts.num_actions, mod_acts.actions); if (IS_ERR(flow_act.modify_hdr)) { err = PTR_ERR(flow_act.modify_hdr); - goto err_mod_hdr; + goto err_mod_hdr_alloc; } flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; @@ -236,7 +241,11 @@ err_handle: mlx5_chains_put_table(chains, 0, 1, 0); err_table: mlx5_modify_header_dealloc(esw->dev, flow_act.modify_hdr); -err_mod_hdr: +err_mod_hdr_alloc: +err_mod_hdr_regc1: + dealloc_mod_hdr_actions(&mod_acts); +err_mod_hdr_regc0: +err_ethertype: kfree(rule); out: kfree(rule_spec); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index c2361c5b824c..34a21ff95472 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -179,9 +179,11 @@ struct mlx5_eswitch_fdb { struct mlx5_flow_namespace *ns; struct mlx5_flow_table *slow_fdb; struct mlx5_flow_group *send_to_vport_grp; + struct mlx5_flow_group *send_to_vport_meta_grp; struct mlx5_flow_group *peer_miss_grp; struct mlx5_flow_handle **peer_miss_rules; struct mlx5_flow_group *miss_grp; + struct mlx5_flow_handle **send_to_vport_meta_rules; struct mlx5_flow_handle *miss_rule_uni; struct mlx5_flow_handle *miss_rule_multi; int vlan_push_pop_refcount; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index a44728595420..94cb0217b4f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1080,6 +1080,81 @@ void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule) mlx5_del_flow_rules(rule); } +static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_handle **flows = esw->fdb_table.offloads.send_to_vport_meta_rules; + int i = 0, num_vfs = esw->esw_funcs.num_vfs, vport_num; + + if (!num_vfs || !flows) + return; + + mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) + mlx5_del_flow_rules(flows[i++]); + + kvfree(flows); +} + +static int +mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw) +{ + int num_vfs, vport_num, rule_idx = 0, err = 0; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_handle **flows; + struct mlx5_flow_spec *spec; + + num_vfs = esw->esw_funcs.num_vfs; + flows = kvzalloc(num_vfs * sizeof(*flows), GFP_KERNEL); + if (!flows) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto alloc_err; + } + + MLX5_SET(fte_match_param, spec->match_criteria, + misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, spec->match_criteria, + misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_1, + ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) { + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); + dest.vport.num = vport_num; + + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + esw_warn(esw->dev, "FDB: Failed to add send to vport meta rule idx %d, err %ld\n", + rule_idx, PTR_ERR(flow_rule)); + goto rule_err; + } + flows[rule_idx++] = flow_rule; + } + + esw->fdb_table.offloads.send_to_vport_meta_rules = flows; + kvfree(spec); + return 0; + +rule_err: + while (--rule_idx >= 0) + mlx5_del_flow_rules(flows[rule_idx]); + kvfree(spec); +alloc_err: + kvfree(flows); + return err; +} + static bool mlx5_eswitch_reg_c1_loopback_supported(struct mlx5_eswitch *esw) { return MLX5_CAP_ESW_FLOWTABLE(esw->dev, fdb_to_vport_reg_c_id) & @@ -1562,11 +1637,11 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw) { int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct mlx5_flow_table_attr ft_attr = {}; + int num_vfs, table_size, ix, err = 0; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *fdb = NULL; u32 flags = 0, *flow_group_in; - int table_size, ix, err = 0; struct mlx5_flow_group *g; void *match_criteria; u8 *dmac; @@ -1592,7 +1667,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw) } table_size = esw->total_vports * MAX_SQ_NVPORTS + MAX_PF_SQ + - MLX5_ESW_MISS_FLOWS + esw->total_vports; + MLX5_ESW_MISS_FLOWS + esw->total_vports + esw->esw_funcs.num_vfs; /* create the slow path fdb with encap set, so further table instances * can be created at run time while VFs are probed if the FW allows that. @@ -1640,6 +1715,38 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw) } esw->fdb_table.offloads.send_to_vport_grp = g; + /* meta send to vport */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK); + + num_vfs = esw->esw_funcs.num_vfs; + if (num_vfs) { + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ix); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix + num_vfs - 1); + ix += num_vfs; + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create send-to-vport meta flow group err(%d)\n", + err); + goto send_vport_meta_err; + } + esw->fdb_table.offloads.send_to_vport_meta_grp = g; + + err = mlx5_eswitch_add_send_to_vport_meta_rules(esw); + if (err) + goto meta_rule_err; + } + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) { /* create peer esw miss group */ memset(flow_group_in, 0, inlen); @@ -1707,6 +1814,11 @@ miss_err: if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp); peer_miss_err: + mlx5_eswitch_del_send_to_vport_meta_rules(esw); +meta_rule_err: + if (esw->fdb_table.offloads.send_to_vport_meta_grp) + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_meta_grp); +send_vport_meta_err: mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); send_vport_err: esw_chains_destroy(esw, esw_chains(esw)); @@ -1728,7 +1840,10 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw) esw_debug(esw->dev, "Destroy offloads FDB Tables\n"); mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_multi); mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni); + mlx5_eswitch_del_send_to_vport_meta_rules(esw); mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); + if (esw->fdb_table.offloads.send_to_vport_meta_grp) + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_meta_grp); if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp); mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 3b20e84049c1..994c2c8cb4fd 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -114,8 +114,16 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, #define ESW_ZONE_ID_BITS 8 #define ESW_TUN_OPTS_BITS 12 #define ESW_TUN_ID_BITS 12 -#define ESW_TUN_OFFSET ESW_ZONE_ID_BITS +#define ESW_TUN_OPTS_OFFSET ESW_ZONE_ID_BITS +#define ESW_TUN_OFFSET ESW_TUN_OPTS_OFFSET #define ESW_ZONE_ID_MASK GENMASK(ESW_ZONE_ID_BITS - 1, 0) +#define ESW_TUN_OPTS_MASK GENMASK(32 - ESW_TUN_ID_BITS - 1, ESW_TUN_OPTS_OFFSET) +#define ESW_TUN_MASK GENMASK(31, ESW_TUN_OFFSET) +#define ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT 0 /* 0 is not a valid tunnel id */ +#define ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT 0xFFF /* 0xFFF is a reserved mapping */ +#define ESW_TUN_SLOW_TABLE_GOTO_VPORT ((ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT << ESW_TUN_OPTS_BITS) | \ + ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT) +#define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ -- cgit v1.2.3 From cfa55c6d47b1e75ccc4b950616e881f3fd07712e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Jan 2021 00:00:01 +0100 Subject: batman-adv: Drop publication years from copyright info The batman-adv source code was using the year of publication (to net-next) as "last" year for the copyright statement. The whole source code mentioned in the MAINTAINERS "BATMAN ADVANCED" section was handled as a single entity regarding the publishing year. This avoided having outdated (in sense of year information - not copyright holder) publishing information inside several files. But since the simple "update copyright year" commit (without other changes) in the file was not well received in the upstream kernel, the option to not have a copyright year (for initial and last publication) in the files are chosen instead. More detailed information about the years can still be retrieved from the SCM system. Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- include/uapi/linux/batman_adv.h | 2 +- net/batman-adv/Kconfig | 2 +- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_iv_ogm.h | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v.h | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_elp.h | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bat_v_ogm.h | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/log.c | 2 +- net/batman-adv/log.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/netlink.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/tp_meter.h | 2 +- net/batman-adv/trace.c | 2 +- net/batman-adv/trace.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/tvlv.h | 2 +- net/batman-adv/types.h | 2 +- 57 files changed, 57 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 9c8604c5b5f6..ea4692c339ce 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index bdb317faa1dc..35dc016c9bb4 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 993afd5ff7bb..dd853dc22d32 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +# Copyright (C) B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 8010c34b987c..3bd0760c76a2 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +# Copyright (C) B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index c5f404f6892f..4eee53d19eb0 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 43b045ac8ac7..2c486374af58 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Linus LĂŒssing */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 168621c9a081..a5e313cd6f44 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 0c57c1000c64..04b01bd684e8 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index e4455babe4c2..e1ca2b8c3152 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus LĂŒssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 5e0be10bc84e..964431f4dc8d 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Linus LĂŒssing */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 0512ea6cd818..423c2d171703 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus LĂŒssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 4358d436be2a..9e2740195fa2 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus LĂŒssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 798d659855d0..a0a9636d1740 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 0ae2575f70bb..edeffedecade 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 4bc695cda397..649c41f393e1 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 533c6d44cb58..37f7ae413bc6 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index d2de12e527ba..360bdbf44748 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 7dc6d3571925..5c22955bb9d5 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index fd7ba6bbdf85..01e0f84cb1ff 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index e980fb45693a..bed7f3d20844 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index e522f1fcfd9a..a5d9d800082b 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Martin HundebĂžll */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 881ef328b6cd..dbf0871f8703 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Martin HundebĂžll */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index cffe72f4edd7..007f2827935d 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 2fbc500f0ac1..2ae5846ef958 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 16cd9450ceb1..fdde305a198e 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index c3a0c5a7f7e9..87c37f907261 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 0f186ddc15e3..4a6a25d551a8 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index f4b8e9efef19..83d11b46a9d8 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 68638e0450a6..8016e619787f 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 91ae9f32b580..46696759f194 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index b7e9923b11a2..f0e5d1429662 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 979864c0fa6b..6717c965f0fa 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index ed9d87ce3407..e48f7ac8a854 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2486efe4ffa6..8f0102b71656 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 854e5ff28a3f..71d0bc323471 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus LĂŒssing */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index d61593d02072..9fee5da08311 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Linus LĂŒssing */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 97bcf149633d..fdd76eeaa6be 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 7ee48f916997..48102cc7490c 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 0cec108b7a99..4bb76b434d07 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Martin HundebĂžll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 8fb2c01e7837..368cc3130e4c 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Martin HundebĂžll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 77431e59b228..da7249448474 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index e75d4c4d11f5..805be87d55b8 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 49cbca4aa428..40f5cffde6a3 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 2ed49db6eff5..5f387786e9a7 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 87017332b567..157abe92d827 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 0d36e15589f6..2b0daf8b2bc4 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 97118efbe678..6b8181bc3122 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 74716d9ca4f6..38b0ad182584 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index d4e10005df6c..07e6c1d864a4 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 140105215aa2..f0046d366eac 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3444d9e4e90d..ec8b9519076b 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index a87547570b4e..d673ebdd0426 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index cd09916f97fe..f8761281aab0 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 57192c817229..e1285904f885 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 6a23a566cde1..253f5a33a914 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index d509d00c7a23..54f2a35653d0 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 2f96e96a5ca4..2c654ec964b7 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ -- cgit v1.2.3 From b358e2122b9d7aa99f681d4edfafd999845d16ff Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 4 Feb 2021 18:56:35 +0800 Subject: mm: page_frag: Introduce page_frag_alloc_align() In the current implementation of page_frag_alloc(), it doesn't have any align guarantee for the returned buffer address. But for some hardwares they do require the DMA buffer to be aligned correctly, so we would have to use some workarounds like below if the buffers allocated by the page_frag_alloc() are used by these hardwares for DMA. buf = page_frag_alloc(really_needed_size + align); buf = PTR_ALIGN(buf, align); These codes seems ugly and would waste a lot of memories if the buffers are used in a network driver for the TX/RX. So introduce page_frag_alloc_align() to make sure that an aligned buffer address is returned. Signed-off-by: Kevin Hao Acked-by: Vlastimil Babka Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- include/linux/gfp.h | 12 ++++++++++-- mm/page_alloc.c | 8 +++++--- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6e479e9c48ce..80544d5c08e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -583,8 +583,16 @@ extern void free_pages(unsigned long addr, unsigned int order); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); -extern void *page_frag_alloc(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask); +extern void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask); + +static inline void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) +{ + return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); +} + extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 519a60d5b6f7..ef5070fed76b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5137,8 +5137,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); -void *page_frag_alloc(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -5190,11 +5191,12 @@ refill: } nc->pagecnt_bias--; + offset &= align_mask; nc->offset = offset; return nc->va + offset; } -EXPORT_SYMBOL(page_frag_alloc); +EXPORT_SYMBOL(page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. -- cgit v1.2.3 From 3f6e687dff395da43b056c18150a423bc7bf5d14 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 4 Feb 2021 18:56:36 +0800 Subject: net: Introduce {netdev,napi}_alloc_frag_align() In the current implementation of {netdev,napi}_alloc_frag(), it doesn't have any align guarantee for the returned buffer address, But for some hardwares they do require the DMA buffer to be aligned correctly, so we would have to use some workarounds like below if the buffers allocated by the {netdev,napi}_alloc_frag() are used by these hardwares for DMA. buf = napi_alloc_frag(really_needed_size + align); buf = PTR_ALIGN(buf, align); These codes seems ugly and would waste a lot of memories if the buffers are used in a network driver for the TX/RX. We have added the align support for the page_frag functions, so add the corresponding {netdev,napi}_frag functions. Signed-off-by: Kevin Hao Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 36 ++++++++++++++++++++++++++++++++++-- net/core/skbuff.c | 26 ++++++++++---------------- 2 files changed, 44 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0e42c53b8ca9..0a4e91a2f873 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2818,7 +2818,26 @@ void skb_queue_purge(struct sk_buff_head *list); unsigned int skb_rbtree_purge(struct rb_root *root); -void *netdev_alloc_frag(unsigned int fragsz); +void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); + +/** + * netdev_alloc_frag - allocate a page fragment + * @fragsz: fragment size + * + * Allocates a frag from a page for receive buffer. + * Uses GFP_ATOMIC allocations. + */ +static inline void *netdev_alloc_frag(unsigned int fragsz) +{ + return __netdev_alloc_frag_align(fragsz, ~0u); +} + +static inline void *netdev_alloc_frag_align(unsigned int fragsz, + unsigned int align) +{ + WARN_ON_ONCE(!is_power_of_2(align)); + return __netdev_alloc_frag_align(fragsz, -align); +} struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask); @@ -2877,7 +2896,20 @@ static inline void skb_free_frag(void *addr) page_frag_free(addr); } -void *napi_alloc_frag(unsigned int fragsz); +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); + +static inline void *napi_alloc_frag(unsigned int fragsz) +{ + return __napi_alloc_frag_align(fragsz, ~0u); +} + +static inline void *napi_alloc_frag_align(unsigned int fragsz, + unsigned int align) +{ + WARN_ON_ONCE(!is_power_of_2(align)); + return __napi_alloc_frag_align(fragsz, -align); +} + struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int length, gfp_t gfp_mask); static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3787093239f5..d380c7b5a12d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -374,29 +374,23 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return page_frag_alloc(&nc->page, fragsz, gfp_mask); + return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask); } -void *napi_alloc_frag(unsigned int fragsz) +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { fragsz = SKB_DATA_ALIGN(fragsz); - return __napi_alloc_frag(fragsz, GFP_ATOMIC); + return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); } -EXPORT_SYMBOL(napi_alloc_frag); +EXPORT_SYMBOL(__napi_alloc_frag_align); -/** - * netdev_alloc_frag - allocate a page fragment - * @fragsz: fragment size - * - * Allocates a frag from a page for receive buffer. - * Uses GFP_ATOMIC allocations. - */ -void *netdev_alloc_frag(unsigned int fragsz) +void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct page_frag_cache *nc; void *data; @@ -404,15 +398,15 @@ void *netdev_alloc_frag(unsigned int fragsz) fragsz = SKB_DATA_ALIGN(fragsz); if (in_irq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, fragsz, GFP_ATOMIC); + data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); } else { local_bh_disable(); - data = __napi_alloc_frag(fragsz, GFP_ATOMIC); + data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); local_bh_enable(); } return data; } -EXPORT_SYMBOL(netdev_alloc_frag); +EXPORT_SYMBOL(__netdev_alloc_frag_align); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device -- cgit v1.2.3 From b80af659699d212cf8cec6593f6551905c4ae86f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 6 Feb 2021 00:02:14 +0200 Subject: net: mscc: ocelot: set up the bonding mask in a way that avoids a net_device Since this code should be called from pure switchdev as well as from DSA, we must find a way to determine the bonding mask not by looking directly at the net_device lowers of the bonding interface, since those could have different private structures. We keep a pointer to the bonding upper interface, if present, in struct ocelot_port. Then the bonding mask becomes the bitwise OR of all ports that have the same bonding upper interface. This adds a duplication of functionality with the current "lags" array, but the duplication will be short-lived, since further patches will remove the latter completely. Signed-off-by: Vladimir Oltean Reviewed-by: Alexandre Belloni Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 29 ++++++++++++++++++++++------- include/soc/mscc/ocelot.h | 2 ++ 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ef3f10f1e54f..127beedcccde 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -889,6 +889,24 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, } EXPORT_SYMBOL(ocelot_get_ts_info); +static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond) +{ + u32 mask = 0; + int port; + + for (port = 0; port < ocelot->num_phys_ports; port++) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + if (!ocelot_port) + continue; + + if (ocelot_port->bond == bond) + mask |= BIT(port); + } + + return mask; +} + static u32 ocelot_get_dsa_8021q_cpu_mask(struct ocelot *ocelot) { u32 mask = 0; @@ -1319,20 +1337,15 @@ int ocelot_port_lag_join(struct ocelot *ocelot, int port, struct net_device *bond, struct netdev_lag_upper_info *info) { - struct net_device *ndev; u32 bond_mask = 0; int lag, lp; if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH) return -EOPNOTSUPP; - rcu_read_lock(); - for_each_netdev_in_bond_rcu(bond, ndev) { - struct ocelot_port_private *priv = netdev_priv(ndev); + ocelot->ports[port]->bond = bond; - bond_mask |= BIT(priv->chip_port); - } - rcu_read_unlock(); + bond_mask = ocelot_get_bond_mask(ocelot, bond); lp = __ffs(bond_mask); @@ -1366,6 +1379,8 @@ void ocelot_port_lag_leave(struct ocelot *ocelot, int port, u32 port_cfg; int i; + ocelot->ports[port]->bond = NULL; + /* Remove port from any lag */ for (i = 0; i < ocelot->num_phys_ports; i++) ocelot->lags[i] &= ~BIT(port); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6a61c499a30d..e36a1ed29c01 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -611,6 +611,8 @@ struct ocelot_port { u8 *xmit_template; bool is_dsa_8021q_cpu; + + struct net_device *bond; }; struct ocelot { -- cgit v1.2.3 From 528d3f190c98c8f7d9581f68db4af021696727b2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 6 Feb 2021 00:02:17 +0200 Subject: net: mscc: ocelot: drop the use of the "lags" array We can now simplify the implementation by always using ocelot_get_bond_mask to look up the other ports that are offloading the same bonding interface as us. In ocelot_set_aggr_pgids, the code had a way to uniquely iterate through LAGs. We need to achieve the same behavior by marking each LAG as visited, which we do now by using a temporary 32-bit "visited" bitmask. This is ok and we do not need dynamic memory allocation, because we know that this switch architecture will not have more than 32 ports (the PGID port masks are 32-bit anyway). Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 95 ++++++++++++++++---------------------- include/soc/mscc/ocelot.h | 2 - 2 files changed, 39 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 5d765245c6d3..c906c449d2dd 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -957,21 +957,11 @@ void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) mask = GENMASK(ocelot->num_phys_ports - 1, 0); mask &= ~cpu_fwd_mask; } else if (ocelot->bridge_fwd_mask & BIT(port)) { - int lag; + struct net_device *bond = ocelot_port->bond; mask = ocelot->bridge_fwd_mask & ~BIT(port); - - for (lag = 0; lag < ocelot->num_phys_ports; lag++) { - unsigned long bond_mask = ocelot->lags[lag]; - - if (!bond_mask) - continue; - - if (bond_mask & BIT(port)) { - mask &= ~bond_mask; - break; - } - } + if (bond) + mask &= ~ocelot_get_bond_mask(ocelot, bond); } else { /* Standalone ports forward only to DSA tag_8021q CPU * ports (if those exist), or to the hardware CPU port @@ -1277,6 +1267,7 @@ EXPORT_SYMBOL(ocelot_port_bridge_leave); static void ocelot_set_aggr_pgids(struct ocelot *ocelot) { + unsigned long visited = GENMASK(ocelot->num_phys_ports - 1, 0); int i, port, lag; /* Reset destination and aggregation PGIDS */ @@ -1287,16 +1278,35 @@ static void ocelot_set_aggr_pgids(struct ocelot *ocelot) ocelot_write_rix(ocelot, GENMASK(ocelot->num_phys_ports - 1, 0), ANA_PGID_PGID, i); - /* Now, set PGIDs for each LAG */ + /* The visited ports bitmask holds the list of ports offloading any + * bonding interface. Initially we mark all these ports as unvisited, + * then every time we visit a port in this bitmask, we know that it is + * the lowest numbered port, i.e. the one whose logical ID == physical + * port ID == LAG ID. So we mark as visited all further ports in the + * bitmask that are offloading the same bonding interface. This way, + * we set up the aggregation PGIDs only once per bonding interface. + */ + for (port = 0; port < ocelot->num_phys_ports; port++) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + if (!ocelot_port || !ocelot_port->bond) + continue; + + visited &= ~BIT(port); + } + + /* Now, set PGIDs for each active LAG */ for (lag = 0; lag < ocelot->num_phys_ports; lag++) { + struct net_device *bond = ocelot->ports[lag]->bond; unsigned long bond_mask; int aggr_count = 0; u8 aggr_idx[16]; - bond_mask = ocelot->lags[lag]; - if (!bond_mask) + if (!bond || (visited & BIT(lag))) continue; + bond_mask = ocelot_get_bond_mask(ocelot, bond); + for_each_set_bit(port, &bond_mask, ocelot->num_phys_ports) { // Destination mask ocelot_write_rix(ocelot, bond_mask, @@ -1313,6 +1323,19 @@ static void ocelot_set_aggr_pgids(struct ocelot *ocelot) ac |= BIT(aggr_idx[i % aggr_count]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); } + + /* Mark all ports in the same LAG as visited to avoid applying + * the same config again. + */ + for (port = lag; port < ocelot->num_phys_ports; port++) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + if (!ocelot_port) + continue; + + if (ocelot_port->bond == bond) + visited |= BIT(port); + } } } @@ -1353,30 +1376,11 @@ int ocelot_port_lag_join(struct ocelot *ocelot, int port, struct net_device *bond, struct netdev_lag_upper_info *info) { - u32 bond_mask = 0; - int lag; - if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH) return -EOPNOTSUPP; ocelot->ports[port]->bond = bond; - bond_mask = ocelot_get_bond_mask(ocelot, bond); - - lag = __ffs(bond_mask); - - /* If the new port is the lowest one, use it as the logical port from - * now on - */ - if (port == lag) { - ocelot->lags[port] = bond_mask; - bond_mask &= ~BIT(port); - if (bond_mask) - ocelot->lags[__ffs(bond_mask)] = 0; - } else { - ocelot->lags[lag] |= BIT(port); - } - ocelot_setup_logical_port_ids(ocelot); ocelot_apply_bridge_fwd_mask(ocelot); ocelot_set_aggr_pgids(ocelot); @@ -1388,24 +1392,8 @@ EXPORT_SYMBOL(ocelot_port_lag_join); void ocelot_port_lag_leave(struct ocelot *ocelot, int port, struct net_device *bond) { - int i; - ocelot->ports[port]->bond = NULL; - /* Remove port from any lag */ - for (i = 0; i < ocelot->num_phys_ports; i++) - ocelot->lags[i] &= ~BIT(port); - - /* if it was the logical port of the lag, move the lag config to the - * next port - */ - if (ocelot->lags[port]) { - int n = __ffs(ocelot->lags[port]); - - ocelot->lags[n] = ocelot->lags[port]; - ocelot->lags[port] = 0; - } - ocelot_setup_logical_port_ids(ocelot); ocelot_apply_bridge_fwd_mask(ocelot); ocelot_set_aggr_pgids(ocelot); @@ -1587,11 +1575,6 @@ int ocelot_init(struct ocelot *ocelot) } } - ocelot->lags = devm_kcalloc(ocelot->dev, ocelot->num_phys_ports, - sizeof(u32), GFP_KERNEL); - if (!ocelot->lags) - return -ENOMEM; - ocelot->stats = devm_kcalloc(ocelot->dev, ocelot->num_phys_ports * ocelot->num_stats, sizeof(u64), GFP_KERNEL); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index e36a1ed29c01..089e552719e0 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -657,8 +657,6 @@ struct ocelot { enum ocelot_tag_prefix npi_inj_prefix; enum ocelot_tag_prefix npi_xtr_prefix; - u32 *lags; - struct list_head multicast; struct list_head pgids; -- cgit v1.2.3 From 23ca3b727ee6b432166391607b614d3a6beb6784 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 6 Feb 2021 00:02:19 +0200 Subject: net: mscc: ocelot: rebalance LAGs on link up/down events At present there is an issue when ocelot is offloading a bonding interface, but one of the links of the physical ports goes down. Traffic keeps being hashed towards that destination, and of course gets dropped on egress. Monitor the netdev notifier events emitted by the bonding driver for changes in the physical state of lower interfaces, to determine which ports are active and which ones are no longer. Then extend ocelot_get_bond_mask to return either the configured bonding interfaces, or the active ones, depending on a boolean argument. The code that does rebalancing only needs to do so among the active ports, whereas the bridge forwarding mask and the logical port IDs still need to look at the permanently bonded ports. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 41 ++++++++++++++++++++++++++-------- drivers/net/ethernet/mscc/ocelot.h | 1 + drivers/net/ethernet/mscc/ocelot_net.c | 30 +++++++++++++++++++++++++ include/soc/mscc/ocelot.h | 1 + 4 files changed, 64 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 380a5a661702..f8b85ab8be5d 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -889,7 +889,8 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, } EXPORT_SYMBOL(ocelot_get_ts_info); -static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond) +static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond, + bool only_active_ports) { u32 mask = 0; int port; @@ -900,8 +901,12 @@ static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond) if (!ocelot_port) continue; - if (ocelot_port->bond == bond) + if (ocelot_port->bond == bond) { + if (only_active_ports && !ocelot_port->lag_tx_active) + continue; + mask |= BIT(port); + } } return mask; @@ -960,8 +965,10 @@ void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) struct net_device *bond = ocelot_port->bond; mask = ocelot->bridge_fwd_mask & ~BIT(port); - if (bond) - mask &= ~ocelot_get_bond_mask(ocelot, bond); + if (bond) { + mask &= ~ocelot_get_bond_mask(ocelot, bond, + false); + } } else { /* Standalone ports forward only to DSA tag_8021q CPU * ports (if those exist), or to the hardware CPU port @@ -1298,20 +1305,20 @@ static void ocelot_set_aggr_pgids(struct ocelot *ocelot) /* Now, set PGIDs for each active LAG */ for (lag = 0; lag < ocelot->num_phys_ports; lag++) { struct net_device *bond = ocelot->ports[lag]->bond; - int num_ports_in_lag = 0; + int num_active_ports = 0; unsigned long bond_mask; u8 aggr_idx[16]; if (!bond || (visited & BIT(lag))) continue; - bond_mask = ocelot_get_bond_mask(ocelot, bond); + bond_mask = ocelot_get_bond_mask(ocelot, bond, true); for_each_set_bit(port, &bond_mask, ocelot->num_phys_ports) { // Destination mask ocelot_write_rix(ocelot, bond_mask, ANA_PGID_PGID, port); - aggr_idx[num_ports_in_lag++] = port; + aggr_idx[num_active_ports++] = port; } for_each_aggr_pgid(ocelot, i) { @@ -1319,7 +1326,11 @@ static void ocelot_set_aggr_pgids(struct ocelot *ocelot) ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; - ac |= BIT(aggr_idx[i % num_ports_in_lag]); + /* Don't do division by zero if there was no active + * port. Just make all aggregation codes zero. + */ + if (num_active_ports) + ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); } @@ -1356,7 +1367,8 @@ static void ocelot_setup_logical_port_ids(struct ocelot *ocelot) bond = ocelot_port->bond; if (bond) { - int lag = __ffs(ocelot_get_bond_mask(ocelot, bond)); + int lag = __ffs(ocelot_get_bond_mask(ocelot, bond, + false)); ocelot_rmw_gix(ocelot, ANA_PORT_PORT_CFG_PORTID_VAL(lag), @@ -1399,6 +1411,17 @@ void ocelot_port_lag_leave(struct ocelot *ocelot, int port, } EXPORT_SYMBOL(ocelot_port_lag_leave); +void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + ocelot_port->lag_tx_active = lag_tx_active; + + /* Rebalance the LAGs */ + ocelot_set_aggr_pgids(ocelot); +} +EXPORT_SYMBOL(ocelot_port_lag_change); + /* Configure the maximum SDU (L2 payload) on RX to the value specified in @sdu. * The length of VLAN tags is accounted for automatically via DEV_MAC_TAGS_CFG. * In the special case that it's the NPI port that we're configuring, the diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index 12dc74453076..b18f6644726a 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -114,6 +114,7 @@ int ocelot_port_lag_join(struct ocelot *ocelot, int port, struct netdev_lag_upper_info *info); void ocelot_port_lag_leave(struct ocelot *ocelot, int port, struct net_device *bond); +void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active); struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port); int ocelot_netdev_to_port(struct net_device *dev); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 0a4de949f4d9..8f12fa45b1b5 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1164,6 +1164,27 @@ ocelot_netdevice_lag_changeupper(struct net_device *dev, return NOTIFY_DONE; } +static int +ocelot_netdevice_changelowerstate(struct net_device *dev, + struct netdev_lag_lower_state_info *info) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + bool is_active = info->link_up && info->tx_enabled; + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + if (!ocelot_port->bond) + return NOTIFY_DONE; + + if (ocelot_port->lag_tx_active == is_active) + return NOTIFY_DONE; + + ocelot_port_lag_change(ocelot, port, is_active); + + return NOTIFY_OK; +} + static int ocelot_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -1181,6 +1202,15 @@ static int ocelot_netdevice_event(struct notifier_block *unused, break; } + case NETDEV_CHANGELOWERSTATE: { + struct netdev_notifier_changelowerstate_info *info = ptr; + + if (!ocelot_netdevice_dev_check(dev)) + break; + + return ocelot_netdevice_changelowerstate(dev, + info->lower_state_info); + } default: break; } diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 089e552719e0..6e806872cd24 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -613,6 +613,7 @@ struct ocelot_port { bool is_dsa_8021q_cpu; struct net_device *bond; + bool lag_tx_active; }; struct ocelot { -- cgit v1.2.3 From 8fe6832e96acbf9d5777fc0b13e3e680ff46ba11 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 6 Feb 2021 00:02:21 +0200 Subject: net: dsa: felix: propagate the LAG offload ops towards the ocelot lib The ocelot switch has been supporting LAG offload since its initial commit, however felix could not make use of that, due to lack of a LAG abstraction in DSA. Now that we have that, let's forward DSA's calls towards the ocelot library, who will deal with setting up the bonding. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 32 ++++++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot.h | 6 ------ include/soc/mscc/ocelot.h | 6 ++++++ 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 167463010b55..1bd5aea12b25 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -569,6 +569,35 @@ static void felix_bridge_leave(struct dsa_switch *ds, int port, ocelot_port_bridge_leave(ocelot, port, br); } +static int felix_lag_join(struct dsa_switch *ds, int port, + struct net_device *bond, + struct netdev_lag_upper_info *info) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_port_lag_join(ocelot, port, bond, info); +} + +static int felix_lag_leave(struct dsa_switch *ds, int port, + struct net_device *bond) +{ + struct ocelot *ocelot = ds->priv; + + ocelot_port_lag_leave(ocelot, port, bond); + + return 0; +} + +static int felix_lag_change(struct dsa_switch *ds, int port) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct ocelot *ocelot = ds->priv; + + ocelot_port_lag_change(ocelot, port, dp->lag_tx_enabled); + + return 0; +} + static int felix_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { @@ -1331,6 +1360,9 @@ const struct dsa_switch_ops felix_switch_ops = { .port_mdb_del = felix_mdb_del, .port_bridge_join = felix_bridge_join, .port_bridge_leave = felix_bridge_leave, + .port_lag_join = felix_lag_join, + .port_lag_leave = felix_lag_leave, + .port_lag_change = felix_lag_change, .port_stp_state_set = felix_bridge_stp_state_set, .port_vlan_filtering = felix_vlan_filtering, .port_vlan_add = felix_vlan_add, diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index b18f6644726a..c485795c606b 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -109,12 +109,6 @@ int ocelot_mact_learn(struct ocelot *ocelot, int port, unsigned int vid, enum macaccess_entry_type type); int ocelot_mact_forget(struct ocelot *ocelot, const unsigned char mac[ETH_ALEN], unsigned int vid); -int ocelot_port_lag_join(struct ocelot *ocelot, int port, - struct net_device *bond, - struct netdev_lag_upper_info *info); -void ocelot_port_lag_leave(struct ocelot *ocelot, int port, - struct net_device *bond); -void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active); struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port); int ocelot_netdev_to_port(struct net_device *dev); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6e806872cd24..d0d48e9620fb 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -798,6 +798,12 @@ int ocelot_port_mdb_add(struct ocelot *ocelot, int port, const struct switchdev_obj_port_mdb *mdb); int ocelot_port_mdb_del(struct ocelot *ocelot, int port, const struct switchdev_obj_port_mdb *mdb); +int ocelot_port_lag_join(struct ocelot *ocelot, int port, + struct net_device *bond, + struct netdev_lag_upper_info *info); +void ocelot_port_lag_leave(struct ocelot *ocelot, int port, + struct net_device *bond); +void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active); int ocelot_devlink_sb_register(struct ocelot *ocelot); void ocelot_devlink_sb_unregister(struct ocelot *ocelot); -- cgit v1.2.3 From 4331667fa14e6643859d0498b34281185eb8018b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 5 Feb 2021 14:56:39 +0800 Subject: ssb: Use true and false for bool variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following coccicheck warnings: ./include/linux/ssb/ssb_driver_gige.h:89:8-9: WARNING: return of 0/1 in function 'ssb_gige_one_dma_at_once' with return type bool. ./include/linux/ssb/ssb_driver_gige.h:79:8-9: WARNING: return of 0/1 in function 'ssb_gige_have_roboswitch' with return type bool. ./include/linux/ssb/ssb_driver_gige.h:182:8-9: WARNING: return of 0/1 in function 'ssb_gige_must_flush_posted_writes' with return type bool. ./include/linux/ssb/ssb_driver_gige.h:178:8-9: WARNING: return of 0/1 in function 'ssb_gige_one_dma_at_once' with return type bool. ./include/linux/ssb/ssb_driver_gige.h:174:8-9: WARNING: return of 0/1 in function 'ssb_gige_have_roboswitch' with return type bool. ./include/linux/ssb/ssb_driver_gige.h:170:8-9: WARNING: return of 0/1 in function 'ssb_gige_is_rgmii' with return type bool. ./include/linux/ssb/ssb_driver_gige.h:162:8-9: WARNING: return of 0/1 in function 'pdev_is_ssb_gige_core' with return type bool. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Michael BĂŒsch Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1612508199-92282-1-git-send-email-jiapeng.chong@linux.alibaba.com --- include/linux/ssb/ssb_driver_gige.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h index 31593b34608e..15ba0df1ee0d 100644 --- a/include/linux/ssb/ssb_driver_gige.h +++ b/include/linux/ssb/ssb_driver_gige.h @@ -76,7 +76,7 @@ static inline bool ssb_gige_have_roboswitch(struct pci_dev *pdev) if (dev) return !!(dev->dev->bus->sprom.boardflags_lo & SSB_GIGE_BFL_ROBOSWITCH); - return 0; + return false; } /* Returns whether we can only do one DMA at once. */ @@ -86,7 +86,7 @@ static inline bool ssb_gige_one_dma_at_once(struct pci_dev *pdev) if (dev) return ((dev->dev->bus->chip_id == 0x4785) && (dev->dev->bus->chip_rev < 2)); - return 0; + return false; } /* Returns whether we must flush posted writes. */ @@ -159,7 +159,7 @@ static inline void ssb_gige_exit(void) static inline bool pdev_is_ssb_gige_core(struct pci_dev *pdev) { - return 0; + return false; } static inline struct ssb_gige * pdev_to_ssb_gige(struct pci_dev *pdev) { @@ -167,19 +167,19 @@ static inline struct ssb_gige * pdev_to_ssb_gige(struct pci_dev *pdev) } static inline bool ssb_gige_is_rgmii(struct pci_dev *pdev) { - return 0; + return false; } static inline bool ssb_gige_have_roboswitch(struct pci_dev *pdev) { - return 0; + return false; } static inline bool ssb_gige_one_dma_at_once(struct pci_dev *pdev) { - return 0; + return false; } static inline bool ssb_gige_must_flush_posted_writes(struct pci_dev *pdev) { - return 0; + return false; } static inline int ssb_gige_get_macaddr(struct pci_dev *pdev, u8 *macaddr) { -- cgit v1.2.3 From 49fc251360a10e6bff0d886c9e3c62008a1c4caf Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 7 Feb 2021 10:22:49 +0200 Subject: rtnetlink: Add RTM_F_OFFLOAD_FAILED flag The flag indicates to user space that route offload failed. Previous patch set added the ability to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags are changed, but if the offload fails there is no indication to user-space. The flag will be used in subsequent patches by netdevsim and mlxsw to indicate to user space that route offload failed, so that users will have better visibility into the offload process. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index b841caa4657e..91e4ca064d61 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -319,6 +319,11 @@ enum rt_scope_t { #define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ #define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ #define RTM_F_TRAP 0x8000 /* route is trapping packets */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed, this value + * is chosen to avoid conflicts with + * other flags defined in + * include/uapi/linux/ipv6_route.h + */ /* Reserved table identifiers */ -- cgit v1.2.3 From 36c5100e859d93b3436ae24810612b05addb1e89 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 7 Feb 2021 10:22:50 +0200 Subject: IPv4: Add "offload failed" indication to routes After installing a route to the kernel, user space receives an acknowledgment, which means the route was installed in the kernel, but not necessarily in hardware. The asynchronous nature of route installation in hardware can lead to a routing daemon advertising a route before it was actually installed in hardware. This can result in packet loss or mis-routed packets until the route is installed in hardware. To avoid such cases, previous patch set added the ability to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags are changed, this behavior is controlled by sysctl. With the above mentioned behavior, it is possible to know from user-space if the route was offloaded, but if the offload fails there is no indication to user-space. Following a failure, a routing daemon will wait indefinitely for a notification that will never come. This patch adds an "offload_failed" indication to IPv4 routes, so that users will have better visibility into the offload process. 'struct fib_alias', and 'struct fib_rt_info' are extended with new field that indicates if route offload failed. Note that the new field is added using unused bit and therefore there is no need to increase structs size. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 ++ drivers/net/netdevsim/fib.c | 1 + include/net/ip_fib.h | 3 ++- net/ipv4/fib_lookup.h | 3 ++- net/ipv4/fib_semantics.c | 3 +++ net/ipv4/fib_trie.c | 7 ++++++- net/ipv4/route.c | 1 + 7 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index cf111e73f81e..ac9a174372cc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4963,6 +4963,7 @@ mlxsw_sp_fib4_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp, fri.type = fib4_entry->type; fri.offload = should_offload; fri.trap = !should_offload; + fri.offload_failed = false; fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri); } @@ -4985,6 +4986,7 @@ mlxsw_sp_fib4_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp, fri.type = fib4_entry->type; fri.offload = false; fri.trap = false; + fri.offload_failed = false; fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri); } diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 1779146926a5..ca19da169853 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -319,6 +319,7 @@ static void nsim_fib4_rt_hw_flags_set(struct net *net, fri.type = fib4_rt->type; fri.offload = false; fri.trap = trap; + fri.offload_failed = false; fib_alias_hw_flags_set(net, &fri); } diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 2ec062aaa978..a914f33f3ed5 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -213,7 +213,8 @@ struct fib_rt_info { u8 type; u8 offload:1, trap:1, - unused:6; + offload_failed:1, + unused:5; }; struct fib_entry_notifier_info { diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index aff454ef0fa3..b58db1ca4bfb 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -18,7 +18,8 @@ struct fib_alias { s16 fa_default; u8 offload:1, trap:1, - unused:6; + offload_failed:1, + unused:5; struct rcu_head rcu; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4c38facf91c0..a632b66bc13a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -521,6 +521,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, fri.type = fa->fa_type; fri.offload = fa->offload; fri.trap = fa->trap; + fri.offload_failed = fa->offload_failed; err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -1811,6 +1812,8 @@ offload: rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; + if (fri->offload_failed) + rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 60559b708158..80147caa9bfd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1047,11 +1047,13 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) if (!fa_match) goto out; - if (fa_match->offload == fri->offload && fa_match->trap == fri->trap) + if (fa_match->offload == fri->offload && fa_match->trap == fri->trap && + fa_match->offload_failed == fri->offload_failed) goto out; fa_match->offload = fri->offload; fa_match->trap = fri->trap; + fa_match->offload_failed = fri->offload_failed; if (!net->ipv4.sysctl_fib_notify_on_flag_change) goto out; @@ -1290,6 +1292,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; + new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); @@ -1350,6 +1353,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; + new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); @@ -2289,6 +2293,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, fri.type = fa->fa_type; fri.offload = fa->offload; fri.trap = fa->trap; + fri.offload_failed = fa->offload_failed; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index be31e2446470..02d81d79deeb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3304,6 +3304,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; + fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; -- cgit v1.2.3 From 0c5fcf9e249ee1d94cf872c99baf9cba7ff9ce27 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 7 Feb 2021 10:22:52 +0200 Subject: IPv6: Add "offload failed" indication to routes After installing a route to the kernel, user space receives an acknowledgment, which means the route was installed in the kernel, but not necessarily in hardware. The asynchronous nature of route installation in hardware can lead to a routing daemon advertising a route before it was actually installed in hardware. This can result in packet loss or mis-routed packets until the route is installed in hardware. To avoid such cases, previous patch set added the ability to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags are changed, this behavior is controlled by sysctl. With the above mentioned behavior, it is possible to know from user-space if the route was offloaded, but if the offload fails there is no indication to user-space. Following a failure, a routing daemon will wait indefinitely for a notification that will never come. This patch adds an "offload_failed" indication to IPv6 routes, so that users will have better visibility into the offload process. 'struct fib6_info' is extended with new field that indicates if route offload failed. Note that the new field is added using unused bit and therefore there is no need to increase struct size. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 4 ++-- drivers/net/netdevsim/fib.c | 2 +- include/net/ip6_fib.h | 5 +++-- net/ipv6/route.c | 8 ++++++-- 4 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index ac9a174372cc..a5f980b6940e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5008,7 +5008,7 @@ mlxsw_sp_fib6_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp, common); list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), mlxsw_sp_rt6->rt, - should_offload, !should_offload); + should_offload, !should_offload, false); } #else static void @@ -5030,7 +5030,7 @@ mlxsw_sp_fib6_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp, common); list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), mlxsw_sp_rt6->rt, - false, false); + false, false, false); } #else static void diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index ca19da169853..6b1ec3b4750c 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -595,7 +595,7 @@ static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) - fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap); + fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap, false); } #else static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1e262b23c68b..15b7fbe6b15c 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -195,7 +195,8 @@ struct fib6_info { fib6_destroying:1, offload:1, trap:1, - unused:2; + offload_failed:1, + unused:1; struct rcu_head rcu; struct nexthop *nh; @@ -539,7 +540,7 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, - bool offload, bool trap); + bool offload, bool trap, bool offload_failed); #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0d1784b0d65d..bc75b705f54b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5619,6 +5619,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_flags |= RTM_F_OFFLOAD; if (rt->trap) rtm->rtm_flags |= RTM_F_TRAP; + if (rt->offload_failed) + rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) @@ -6070,16 +6072,18 @@ errout: } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, - bool offload, bool trap) + bool offload, bool trap, bool offload_failed) { struct sk_buff *skb; int err; - if (f6i->offload == offload && f6i->trap == trap) + if (f6i->offload == offload && f6i->trap == trap && + f6i->offload_failed == offload_failed) return; f6i->offload = offload; f6i->trap = trap; + f6i->offload_failed = offload_failed; if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send -- cgit v1.2.3 From 29863d41bb6e1d969c62fdb15b0961806942960e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Feb 2021 11:34:09 -0800 Subject: net: implement threaded-able napi poll loop support This patch allows running each napi poll loop inside its own kernel thread. The kthread is created during netif_napi_add() if dev->threaded is set. And threaded mode is enabled in napi_enable(). We will provide a way to set dev->threaded and enable threaded mode without a device up/down in the following patch. Once that threaded mode is enabled and the kthread is started, napi_schedule() will wake-up such thread instead of scheduling the softirq. The threaded poll loop behaves quite likely the net_rx_action, but it does not have to manipulate local irqs and uses an explicit scheduling point based on netdev_budget. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Hannes Frederic Sowa Signed-off-by: Hannes Frederic Sowa Co-developed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski Signed-off-by: Wei Wang Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 21 +++------ net/core/dev.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e9e7ada07ea1..99fb4ec9573e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -347,6 +347,7 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; unsigned int napi_id; + struct task_struct *thread; }; enum { @@ -358,6 +359,7 @@ enum { NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ + NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ }; enum { @@ -369,6 +371,7 @@ enum { NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), + NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), }; enum gro_result { @@ -503,20 +506,7 @@ static inline bool napi_complete(struct napi_struct *n) */ void napi_disable(struct napi_struct *n); -/** - * napi_enable - enable NAPI scheduling - * @n: NAPI context - * - * Resume NAPI from being scheduled on this context. - * Must be paired with napi_disable. - */ -static inline void napi_enable(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - smp_mb__before_atomic(); - clear_bit(NAPI_STATE_SCHED, &n->state); - clear_bit(NAPI_STATE_NPSVC, &n->state); -} +void napi_enable(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running @@ -1827,6 +1817,8 @@ enum netdev_priv_flags { * * @wol_enabled: Wake-on-LAN is enabled * + * @threaded: napi threaded mode is enabled + * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. @@ -2145,6 +2137,7 @@ struct net_device { struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; + unsigned threaded:1; struct list_head net_notifier_list; diff --git a/net/core/dev.c b/net/core/dev.c index 59751a22d7c3..1e35f4f44f3b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -1494,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. + */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -4265,6 +4287,21 @@ int gro_normal_batch __read_mostly = 8; static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable(). Use READ_ONCE() to guarantee + * a complete read on napi->thread. Only call + * wake_up_process() when it's not NULL. + */ + thread = READ_ONCE(napi->thread); + if (thread) { + wake_up_process(thread); + return; + } + } + list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(NET_RX_SOFTIRQ); } @@ -6728,6 +6765,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). + */ + if (dev->threaded && napi_kthread_create(napi)) + dev->threaded = 0; } EXPORT_SYMBOL(netif_napi_add); @@ -6745,9 +6788,28 @@ void napi_disable(struct napi_struct *n) clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); + clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); +/** + * napi_enable - enable NAPI scheduling + * @n: NAPI context + * + * Resume NAPI from being scheduled on this context. + * Must be paired with napi_disable. + */ +void napi_enable(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + smp_mb__before_atomic(); + clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); + if (n->dev->threaded && n->thread) + set_bit(NAPI_STATE_THREADED, &n->state); +} +EXPORT_SYMBOL(napi_enable); + static void flush_gro_hash(struct napi_struct *napi) { int i; @@ -6773,6 +6835,11 @@ void __netif_napi_del(struct napi_struct *napi) flush_gro_hash(napi); napi->gro_bitmask = 0; + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; + } } EXPORT_SYMBOL(__netif_napi_del); @@ -6867,6 +6934,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) return work; } +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop() && !napi_disable_pending(napi)) { + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return -1; +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + void *have; + + while (!napi_thread_wait(napi)) { + for (;;) { + bool repoll = false; + + local_bh_disable(); + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + __kfree_skb_flush(); + local_bh_enable(); + + if (!repoll) + break; + + cond_resched(); + } + } + return 0; +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); -- cgit v1.2.3 From 5fdd2f0e5c64846bf3066689b73fc3b8dddd1c74 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Feb 2021 11:34:10 -0800 Subject: net: add sysfs attribute to control napi threaded mode This patch adds a new sysfs attribute to the network device class. Said attribute provides a per-device control to enable/disable the threaded mode for all the napi instances of the given network device, without the need for a device up/down. User sets it to 1 or 0 to enable or disable threaded mode. Note: when switching between threaded and the current softirq based mode for a napi instance, it will not immediately take effect if the napi is currently being polled. The mode switch will happen for the next time napi_schedule() is called. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Hannes Frederic Sowa Signed-off-by: Hannes Frederic Sowa Co-developed-by: Felix Fietkau Signed-off-by: Felix Fietkau Signed-off-by: Wei Wang Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 15 ++++++++++ include/linux/netdevice.h | 2 ++ net/core/dev.c | 48 +++++++++++++++++++++++++++++-- net/core/net-sysfs.c | 40 ++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 1f2002df5ba2..1419103d11f9 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org Description: 32-bit unsigned integer counting the number of times the link has been down + +What: /sys/class/net//threaded +Date: Jan 2021 +KernelVersion: 5.12 +Contact: netdev@vger.kernel.org +Description: + Boolean value to control the threaded mode per device. User could + set this value to enable/disable threaded mode for all napi + belonging to this device, without the need to do device up/down. + + Possible values: + == ================================== + 0 threaded mode disabled for this dev + 1 threaded mode enabled for this dev + == ================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99fb4ec9573e..1340327f7abf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } +int dev_set_threaded(struct net_device *dev, bool threaded); + /** * napi_disable - prevent NAPI from scheduling * @n: NAPI context diff --git a/net/core/dev.c b/net/core/dev.c index 1e35f4f44f3b..7647278e46f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4291,8 +4291,9 @@ static inline void ____napi_schedule(struct softnet_data *sd, if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in - * napi_enable(). Use READ_ONCE() to guarantee - * a complete read on napi->thread. Only call + * napi_enable()/dev_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); @@ -6738,6 +6739,49 @@ static void init_gro_hash(struct napi_struct *napi) napi->gro_bitmask = 0; } +int dev_set_threaded(struct net_device *dev, bool threaded) +{ + struct napi_struct *napi; + int err = 0; + + if (dev->threaded == threaded) + return 0; + + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = false; + break; + } + } + } + } + + dev->threaded = threaded; + + /* Make sure kthread is created before THREADED bit + * is set. + */ + smp_mb__before_atomic(); + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } + + return err; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 91afb0b6de69..307628fdf380 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); +static ssize_t threaded_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) + ret = sprintf(buf, fmt_dec, netdev->threaded); + + rtnl_unlock(); + return ret; +} + +static int modify_napi_threaded(struct net_device *dev, unsigned long val) +{ + int ret; + + if (list_empty(&dev->napi_list)) + return -EOPNOTSUPP; + + if (val != 0 && val != 1) + return -EOPNOTSUPP; + + ret = dev_set_threaded(dev, val); + + return ret; +} + +static ssize_t threaded_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, modify_napi_threaded); +} +static DEVICE_ATTR_RW(threaded); + static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, + &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); -- cgit v1.2.3 From 01f810ace9ed37255f27608a0864abebccf0aab3 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sat, 6 Feb 2021 20:10:24 -0500 Subject: bpf: Allow variable-offset stack access Before this patch, variable offset access to the stack was dissalowed for regular instructions, but was allowed for "indirect" accesses (i.e. helpers). This patch removes the restriction, allowing reading and writing to the stack through stack pointers with variable offsets. This makes stack-allocated buffers more usable in programs, and brings stack pointers closer to other types of pointers. The motivation is being able to use stack-allocated buffers for data manipulation. When the stack size limit is sufficient, allocating buffers on the stack is simpler than per-cpu arrays, or other alternatives. In unpriviledged programs, variable-offset reads and writes are disallowed (they were already disallowed for the indirect access case) because the speculative execution checking code doesn't support them. Additionally, when writing through a variable-offset stack pointer, if any pointers are in the accessible range, there's possilibities of later leaking pointers because the write cannot be tracked precisely. Writes with variable offset mark the whole range as initialized, even though we don't know which stack slots are actually written. This is in order to not reject future reads to these slots. Note that this doesn't affect writes done through helpers; like before, helpers need the whole stack range to be initialized to begin with. All the stack slots are in range are considered scalars after the write; variable-offset register spills are not tracked. For reads, all the stack slots in the variable range needs to be initialized (but see above about what writes do), otherwise the read is rejected. All register spilled in stack slots that might be read are marked as having been read, however reads through such pointers don't do register filling; the target register will always be either a scalar or a constant zero. Signed-off-by: Andrei Matei Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210207011027.676572-2-andreimatei1@gmail.com --- include/linux/bpf.h | 5 + include/linux/bpf_verifier.h | 3 +- kernel/bpf/verifier.c | 657 +++++++++++++++++++++++++++++++++---------- 3 files changed, 518 insertions(+), 147 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 321966fc35db..079162bbd387 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1290,6 +1290,11 @@ static inline bool bpf_allow_ptr_leaks(void) return perfmon_capable(); } +static inline bool bpf_allow_uninit_stack(void) +{ + return perfmon_capable(); +} + static inline bool bpf_allow_ptr_to_map_access(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dfe6f85d97dd..532c97836d0d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -195,7 +195,7 @@ struct bpf_func_state { * 0 = main function, 1 = first callee. */ u32 frameno; - /* subprog number == index within subprog_stack_depth + /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; @@ -404,6 +404,7 @@ struct bpf_verifier_env { u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool allow_uninit_stack; bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15694246f854..400d79e99fc8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2275,12 +2275,14 @@ static void save_register_state(struct bpf_func_state *state, state->stack[spi].slot_type[i] = STACK_SPILL; } -/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -2406,9 +2408,175 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. + * + * Spilled pointers in range are not marked as written because we don't know + * what's going to be actually written. This means that read propagation for + * future reads cannot be terminated by this write. + * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there's are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we're reject + * them, the error would be too confusing. + */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then + * so is the destination reg. Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. + */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. + */ +static int check_stack_read_fixed_off(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *reg_state, + int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -2416,11 +2584,6 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype; - if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", - off, size); - return -EACCES; - } stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -2431,9 +2594,9 @@ static int check_stack_read(struct bpf_verifier_env *env, verbose(env, "invalid size of register fill\n"); return -EACCES; } - if (value_regno >= 0) { - mark_reg_unknown(env, state->regs, value_regno); - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + if (dst_regno >= 0) { + mark_reg_unknown(env, state->regs, dst_regno); + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; @@ -2445,16 +2608,16 @@ static int check_stack_read(struct bpf_verifier_env *env, } } - if (value_regno >= 0) { + if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = *reg; + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { - /* If value_regno==-1, the caller is asking us whether + /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that @@ -2466,70 +2629,167 @@ static int check_stack_read(struct bpf_verifier_env *env, } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { - int zeros = 0; + u8 type; for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + type = stype[(slot - i) % BPF_REG_SIZE]; + if (type == STACK_MISC) continue; - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { - zeros++; + if (type == STACK_ZERO) continue; - } verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - if (value_regno >= 0) { - if (zeros == size) { - /* any size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[value_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; } -static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. */ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). + */ + if (dst_regno < 0 && var_off) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d\n", + verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } + /* Variable offset is prohibited for unprivileged mode for simplicity + * since it requires corresponding support in Spectre masking for stack + * ALU. See also retrieve_ptr_limit(). + */ + if (!env->bypass_spec_v1 && var_off) { + char tn_buf[48]; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + ptr_regno, tn_buf); return -EACCES; } - return 0; + if (!var_off) { + off += reg->var_off.value; + err = check_stack_read_fixed_off(env, state, off, size, + dst_regno); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. Note that dst_regno >= 0 on this + * branch. + */ + err = check_stack_read_var_off(env, ptr_regno, off, size, + dst_regno); + } + return err; +} + + +/* check_stack_write dispatches to check_stack_write_fixed_off or + * check_stack_write_var_off. + * + * 'ptr_regno' is the register used as a pointer into the stack. + * 'off' includes 'ptr_regno->off', but not its variable offset (if any). + * 'value_regno' is the register whose value we're writing to the stack. It can + * be -1, meaning that we're not writing from a register. + * + * The caller must ensure that the offset falls within the maximum stack size. + */ +static int check_stack_write(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + + if (tnum_is_const(reg->var_off)) { + off += reg->var_off.value; + err = check_stack_write_fixed_off(env, state, off, size, + value_regno, insn_idx); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. + */ + err = check_stack_write_var_off(env, state, + ptr_regno, off, size, + value_regno, insn_idx); + } + return err; } static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, @@ -2862,11 +3122,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } -static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); @@ -2985,8 +3240,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; - /* The stack spill tracking logic in check_stack_write() - * and check_stack_read() relies on stack accesses being + /* The stack spill tracking logic in check_stack_write_fixed_off() + * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; @@ -3402,6 +3657,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return 0; } +/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). */ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +} /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@ -3517,8 +3857,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. */ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err; @@ -3527,12 +3867,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -3699,49 +4039,53 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return 0; } -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. + */ +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = func(env, reg); + int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. + */ + bool clobber = false; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); return -EACCES; } - return 0; -} -/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. - */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, j, slot, spi; + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; + if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@ -3752,8 +4096,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -3765,28 +4109,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; } if (meta && meta->raw_mode) { @@ -3806,8 +4130,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; } @@ -3818,22 +4144,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; } err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -3882,8 +4210,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -5547,6 +5877,41 @@ do_sim: return !ret ? -EFAULT : 0; } +/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). + * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -5790,10 +6155,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@ -12129,6 +12493,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); -- cgit v1.2.3 From 1bcc51ac0731aab1b109b2cd5c3d495f1884e5ca Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 9 Feb 2021 14:37:49 +0800 Subject: net/sched: cls_flower: Reject invalid ct_state flags rules Reject the unsupported and invalid ct_state flags of cls flower rules. Fixes: e0ace68af2ac ("net/sched: cls_flower: Add matching on conntrack info") Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee95f42fb0ec..88f4bf0047e7 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -591,6 +591,8 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ + + __TCA_FLOWER_KEY_CT_FLAGS_MAX, }; enum { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 84f932532db7..46c1b3e9f66a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -30,6 +30,11 @@ #include +#define TCA_FLOWER_KEY_CT_FLAGS_MAX \ + ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1) +#define TCA_FLOWER_KEY_CT_FLAGS_MASK \ + (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@ -686,8 +691,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, - [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 }, - [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CT_STATE] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), + [TCA_FLOWER_KEY_CT_STATE_MASK] = + NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK), [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 }, @@ -1390,12 +1397,33 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_validate_ct_state(u16 state, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "no trk, so no other flag can be set"); + return -EINVAL; + } + + if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW && + state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) { + NL_SET_ERR_MSG_ATTR(extack, tb, + "new and est are mutually exclusive"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_ct(struct nlattr **tb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask, struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_CT_STATE]) { + int err; + if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) { NL_SET_ERR_MSG(extack, "Conntrack isn't enabled"); return -EOPNOTSUPP; @@ -1403,6 +1431,13 @@ static int fl_set_key_ct(struct nlattr **tb, fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE, &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK, sizeof(key->ct_state)); + + err = fl_validate_ct_state(mask->ct_state, + tb[TCA_FLOWER_KEY_CT_STATE_MASK], + extack); + if (err) + return err; + } if (tb[TCA_FLOWER_KEY_CT_ZONE]) { if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { -- cgit v1.2.3 From 5b74df80f301e872143fa716f3f4361b2e293e19 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 4 Jan 2021 09:38:57 +0200 Subject: net/mlx5: Delete device list leftover Device list is not stored in mlx5_priv anymore, so delete it as it's not used. Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 88197b87bd81..936302b2c141 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -588,7 +588,6 @@ struct mlx5_priv { /* end: alloc staff */ struct dentry *dbg_root; - struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; struct mlx5_adev **adev; -- cgit v1.2.3 From 700d4796ef59f5faf240d307839bd419e2b6bdff Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:26 -0800 Subject: bpf: Optimize program stats Move bpf_prog_stats from prog->aux into prog to avoid one extra load in critical path of program execution. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 -------- include/linux/filter.h | 14 +++++++++++--- kernel/bpf/core.c | 8 ++++---- kernel/bpf/syscall.c | 2 +- kernel/bpf/trampoline.c | 2 +- kernel/bpf/verifier.c | 2 +- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 079162bbd387..5a388955e6b6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -507,12 +506,6 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_ARGS 12 -struct bpf_prog_stats { - u64 cnt; - u64 nsecs; - struct u64_stats_sync syncp; -} __aligned(2 * sizeof(u64)); - struct btf_func_model { u8 ret_size; u8 nr_args; @@ -845,7 +838,6 @@ struct bpf_prog_aux { u32 linfo_idx; u32 num_exentries; struct exception_table_entry *extable; - struct bpf_prog_stats __percpu *stats; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5b3137d7b690..cecb03c9d251 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -539,6 +540,12 @@ struct bpf_binary_header { u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; +struct bpf_prog_stats { + u64 cnt; + u64 nsecs; + struct u64_stats_sync syncp; +} __aligned(2 * sizeof(u64)); + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ @@ -557,10 +564,11 @@ struct bpf_prog { u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; - struct bpf_prog_aux *aux; /* Auxiliary fields */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_prog_stats __percpu *stats; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ struct sock_filter insns[0]; struct bpf_insn insnsi[]; @@ -581,7 +589,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); struct bpf_prog_stats *__stats; \ u64 __start = sched_clock(); \ __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - __stats = this_cpu_ptr(prog->aux->stats); \ + __stats = this_cpu_ptr(prog->stats); \ u64_stats_update_begin(&__stats->syncp); \ __stats->cnt++; \ __stats->nsecs += sched_clock() - __start; \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5bbd4884ff7a..2cf71fd39c22 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -114,8 +114,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (!prog) return NULL; - prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); - if (!prog->aux->stats) { + prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->stats) { kfree(prog->aux); vfree(prog); return NULL; @@ -124,7 +124,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; - pstats = per_cpu_ptr(prog->aux->stats, cpu); + pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; @@ -249,10 +249,10 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); - free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); } + free_percpu(fp->stats); vfree(fp); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e5999d86c76e..f7df56a704de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1739,7 +1739,7 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, unsigned int start; u64 tnsecs, tcnt; - st = per_cpu_ptr(prog->aux->stats, cpu); + st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 35c5887d82ff..5be3beeedd74 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -412,7 +412,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) * Hence check that 'start' is not zero. */ start) { - stats = this_cpu_ptr(prog->aux->stats); + stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 400d79e99fc8..424c1ba0f52f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11253,7 +11253,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) -- cgit v1.2.3 From f2dd3b39467411c53703125a111f45b3672c1771 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:28 -0800 Subject: bpf: Compute program stats for sleepable programs Since sleepable programs don't migrate from the cpu the excution stats can be computed for them as well. Reuse the same infrastructure for both sleepable and non-sleepable programs. run_cnt -> the number of times the program was executed. run_time_ns -> the program execution time in nanoseconds including the off-cpu time when the program was sleeping. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210210033634.62081-4-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 31 ++++++++++++------------------- include/linux/bpf.h | 4 ++-- kernel/bpf/trampoline.c | 42 ++++++++++++++++++++++++++++-------------- 3 files changed, 42 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a3dc3bd154ac..d11b9bcebbea 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1742,15 +1742,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_enter_sleepable : + __bpf_prog_enter, prog)) return -EINVAL; - } else { - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); - } + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1770,18 +1767,14 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_exit_sleepable : + __bpf_prog_exit, prog)) return -EINVAL; - } else { - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; - } *pprog = prog; return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a388955e6b6..e1840ceaf55f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -563,8 +563,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -void notrace __bpf_prog_enter_sleepable(void); -void notrace __bpf_prog_exit_sleepable(void); +u64 notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); struct bpf_ksym { unsigned long start; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 89fc849ba271..48eb021e1421 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,56 +381,70 @@ out: mutex_unlock(&trampoline_mutex); } +#define NO_START_TIME 0 +static u64 notrace bpf_prog_start_time(void) +{ + u64 start = NO_START_TIME; + + if (static_branch_unlikely(&bpf_stats_enabled_key)) + start = sched_clock(); + return start; +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into - * call _bpf_prog_enter + * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit */ u64 notrace __bpf_prog_enter(void) __acquires(RCU) { - u64 start = 0; - rcu_read_lock(); migrate_disable(); - if (static_branch_unlikely(&bpf_stats_enabled_key)) - start = sched_clock(); - return start; + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) - __releases(RCU) +static void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter - * and disabled in __bpf_prog_exit. + /* static_key could be enabled in __bpf_prog_enter* + * and disabled in __bpf_prog_exit*. * And vice versa. - * Hence check that 'start' is not zero. + * Hence check that 'start' is valid. */ - start) { + start > NO_START_TIME) { stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) +{ + update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock(); } -void notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(void) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(void) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { + update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock_trace(); } -- cgit v1.2.3 From ca06f55b90020cd97f4cc6d52db95436162e7dcf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:29 -0800 Subject: bpf: Add per-program recursion prevention mechanism Since both sleepable and non-sleepable programs execute under migrate_disable add recursion prevention mechanism to both types of programs when they're executed via bpf trampoline. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-5-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 15 ++++++++++++++ include/linux/bpf.h | 6 +++--- include/linux/filter.h | 1 + kernel/bpf/core.c | 8 ++++++++ kernel/bpf/trampoline.c | 23 ++++++++++++++++++---- .../selftests/bpf/prog_tests/fexit_stress.c | 4 ++-- .../selftests/bpf/prog_tests/trampoline_count.c | 4 ++-- 7 files changed, 50 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d11b9bcebbea..79e7a0ec1da5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1740,8 +1740,11 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; + u8 *jmp_insn; int cnt = 0; + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); if (emit_call(&prog, p->aux->sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter, prog)) @@ -1749,6 +1752,14 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + /* if (__bpf_prog_enter*(prog) == 0) + * goto skip_exec_of_prog; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + /* emit 2 nops that will be replaced with JE insn */ + jmp_insn = prog; + emit_nops(&prog, 2); + /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); /* arg2: progs[i]->insnsi for interpreter */ @@ -1767,6 +1778,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + /* replace 2 nops with JE insn, since jmp target is known */ + jmp_insn[0] = X86_JE; + jmp_insn[1] = prog - jmp_insn - 2; + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: mov rsi, rbx <- start time in nsec */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1840ceaf55f..1c8ea682c007 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -529,7 +529,7 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 40 +#define BPF_MAX_TRAMP_PROGS 38 struct bpf_tramp_progs { struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; @@ -561,9 +561,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call); /* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(void); +u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -u64 notrace __bpf_prog_enter_sleepable(void); +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); struct bpf_ksym { diff --git a/include/linux/filter.h b/include/linux/filter.h index cecb03c9d251..6a06f3c69f4e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -565,6 +565,7 @@ struct bpf_prog { u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; struct bpf_prog_stats __percpu *stats; + int __percpu *active; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2cf71fd39c22..334070c4b8a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } + fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + if (!fp->active) { + vfree(fp); + kfree(aux); + return NULL; + } fp->pages = size / PAGE_SIZE; fp->aux = aux; @@ -116,6 +122,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { + free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; @@ -253,6 +260,7 @@ void __bpf_prog_free(struct bpf_prog *fp) kfree(fp->aux); } free_percpu(fp->stats); + free_percpu(fp->active); vfree(fp); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 48eb021e1421..89ef6320d19b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,13 +381,16 @@ out: mutex_unlock(&trampoline_mutex); } -#define NO_START_TIME 0 +#define NO_START_TIME 1 static u64 notrace bpf_prog_start_time(void) { u64 start = NO_START_TIME; - if (static_branch_unlikely(&bpf_stats_enabled_key)) + if (static_branch_unlikely(&bpf_stats_enabled_key)) { start = sched_clock(); + if (unlikely(!start)) + start = NO_START_TIME; + } return start; } @@ -397,12 +400,20 @@ static u64 notrace bpf_prog_start_time(void) * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit + * + * __bpf_prog_enter returns: + * 0 - skip execution of the bpf prog + * 1 - execute bpf prog + * [2..MAX_U64] - excute bpf prog and record execution time. + * This is start time. */ -u64 notrace __bpf_prog_enter(void) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog) __acquires(RCU) { rcu_read_lock(); migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + return 0; return bpf_prog_start_time(); } @@ -430,21 +441,25 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) __releases(RCU) { update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + return 0; return bpf_prog_start_time(); } void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c index 3b9dbf7433f0..7c9b62e971f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c @@ -2,8 +2,8 @@ /* Copyright (c) 2019 Facebook */ #include -/* x86-64 fits 55 JITed and 43 interpreted progs into half page */ -#define CNT 40 +/* that's kernel internal BPF_MAX_TRAMP_PROGS define */ +#define CNT 38 void test_fexit_stress(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 781c8d11604b..f3022d934e2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -4,7 +4,7 @@ #include #include -#define MAX_TRAMP_PROGS 40 +#define MAX_TRAMP_PROGS 38 struct inst { struct bpf_object *obj; @@ -52,7 +52,7 @@ void test_trampoline_count(void) struct bpf_link *link; char comm[16] = {}; - /* attach 'allowed' 40 trampoline programs */ + /* attach 'allowed' trampoline programs */ for (i = 0; i < MAX_TRAMP_PROGS; i++) { obj = bpf_object__open_file(object, NULL); if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { -- cgit v1.2.3 From 9ed9e9ba2337205311398a312796c213737bac35 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:31 -0800 Subject: bpf: Count the number of times recursion was prevented Add per-program counter for number of times recursion prevention mechanism was triggered and expose it via show_fdinfo and bpf_prog_info. Teach bpftool to print it. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-7-alexei.starovoitov@gmail.com --- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 14 ++++++++++---- kernel/bpf/trampoline.c | 18 ++++++++++++++++-- tools/bpf/bpftool/prog.c | 4 ++++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6a06f3c69f4e..3b00fc906ccd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,6 +543,7 @@ struct bpf_binary_header { struct bpf_prog_stats { u64 cnt; u64 nsecs; + u64 misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c001766adcbc..c547ad1ffe43 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4501,6 +4501,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f7df56a704de..c859bc46d06c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_stats *stats) { - u64 nsecs = 0, cnt = 0; + u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; - u64 tnsecs, tcnt; + u64 tnsecs, tcnt, tmisses; st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; tcnt = st->cnt; + tmisses = st->misses; } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; + misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; + stats->misses = misses; } #ifdef CONFIG_PROC_FS @@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" - "run_cnt:\t%llu\n", + "run_cnt:\t%llu\n" + "recursion_misses:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, - stats.cnt); + stats.cnt, + stats.misses); } #endif @@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; + info.recursion_misses = stats.misses; if (!bpf_capable()) { info.jited_prog_len = 0; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 89ef6320d19b..7bc3b3209224 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -394,6 +394,16 @@ static u64 notrace bpf_prog_start_time(void) return start; } +static void notrace inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->misses++; + u64_stats_update_end(&stats->syncp); +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into @@ -412,8 +422,10 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog) { rcu_read_lock(); migrate_disable(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); return 0; + } return bpf_prog_start_time(); } @@ -451,8 +463,10 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) rcu_read_lock_trace(); migrate_disable(); might_fault(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); return 0; + } return bpf_prog_start_time(); } diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 1fe3ba255bad..f2b915b20546 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -368,6 +368,8 @@ static void print_prog_header_json(struct bpf_prog_info *info) jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); } + if (info->recursion_misses) + jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses); } static void print_prog_json(struct bpf_prog_info *info, int fd) @@ -446,6 +448,8 @@ static void print_prog_header_plain(struct bpf_prog_info *info) if (info->run_time_ns) printf(" run_time_ns %lld run_cnt %lld", info->run_time_ns, info->run_cnt); + if (info->recursion_misses) + printf(" recursion_misses %lld", info->recursion_misses); printf("\n"); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c001766adcbc..c547ad1ffe43 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4501,6 +4501,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From e13e4536f0922a5bf8df92bb64964c9279fb4cdc Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 9 Feb 2021 12:59:55 +0200 Subject: devlink: Fix dmac_filter trap name, align to its documentation %s/dest_mac_filter/dmac_filter/g Fixes: e78ab164591f ("devlink: Add DMAC filter generic packet trap") Signed-off-by: Aya Levin Reported-by: Ido Schimmel Reviewed-by: Tariq Toukan Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 47b4b063401b..853420db5d32 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1090,7 +1090,7 @@ enum devlink_trap_group_generic_id { #define DEVLINK_TRAP_GENERIC_NAME_BLACKHOLE_NEXTHOP \ "blackhole_nexthop" #define DEVLINK_TRAP_GENERIC_NAME_DMAC_FILTER \ - "dest_mac_filter" + "dmac_filter" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From 4217a64e18a1647a0dbc68cb3169a5a06f054ec8 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 9 Feb 2021 17:38:52 +0100 Subject: net: phy: introduce phydev->port At the moment, PORT_MII is reported in the ethtool ops. This is odd because it is an interface between the MAC and the PHY and no external port. Some network card drivers will overwrite the port to twisted pair or fiber, though. Even worse, the MDI/MDIX setting is only used by ethtool if the port is twisted pair. Set the port to PORT_TP by default because most PHY drivers are copper ones. If there is fibre support and it is enabled, the PHY driver will set it to PORT_FIBRE. This will change reporting PORT_MII to either PORT_TP or PORT_FIBRE; except for the genphy fallback driver. Suggested-by: Andrew Lunn Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 2 ++ drivers/net/phy/dp83822.c | 3 +++ drivers/net/phy/dp83869.c | 4 ++++ drivers/net/phy/lxt.c | 1 + drivers/net/phy/marvell.c | 1 + drivers/net/phy/marvell10g.c | 2 ++ drivers/net/phy/micrel.c | 14 +++++++++++--- drivers/net/phy/phy.c | 2 +- drivers/net/phy/phy_device.c | 9 +++++++++ include/linux/phy.h | 2 ++ 10 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 3142ba768313..0472b3470c59 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -410,6 +410,8 @@ static int bcm54616s_probe(struct phy_device *phydev) */ if (!(val & BCM54616S_100FX_MODE)) phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX; + + phydev->port = PORT_FIBRE; } return 0; diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index fff371ca1086..be1224b4447b 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -554,6 +554,9 @@ static int dp83822_probe(struct phy_device *phydev) dp83822_of_init(phydev); + if (dp83822->fx_enabled) + phydev->port = PORT_FIBRE; + return 0; } diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index b30bc142d82e..755220c6451f 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -855,6 +855,10 @@ static int dp83869_probe(struct phy_device *phydev) if (ret) return ret; + if (dp83869->mode == DP83869_RGMII_100_BASE || + dp83869->mode == DP83869_RGMII_1000_BASE) + phydev->port = PORT_FIBRE; + return dp83869_config_init(phydev); } diff --git a/drivers/net/phy/lxt.c b/drivers/net/phy/lxt.c index 0ee23d29c0d4..bde3356a2f86 100644 --- a/drivers/net/phy/lxt.c +++ b/drivers/net/phy/lxt.c @@ -292,6 +292,7 @@ static int lxt973_probe(struct phy_device *phydev) phy_write(phydev, MII_BMCR, val); /* Remember that the port is in fiber mode. */ phydev->priv = lxt973_probe; + phydev->port = PORT_FIBRE; } else { phydev->priv = NULL; } diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index b523aa37ebf0..3238d0fbf437 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1552,6 +1552,7 @@ static int marvell_read_status_page(struct phy_device *phydev, int page) phydev->asym_pause = 0; phydev->speed = SPEED_UNKNOWN; phydev->duplex = DUPLEX_UNKNOWN; + phydev->port = fiber ? PORT_FIBRE : PORT_TP; if (phydev->autoneg == AUTONEG_ENABLE) err = marvell_read_status_page_an(phydev, fiber, status); diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 1901ba277413..b1bb9b8e1e4e 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -631,6 +631,7 @@ static int mv3310_read_status_10gbaser(struct phy_device *phydev) phydev->link = 1; phydev->speed = SPEED_10000; phydev->duplex = DUPLEX_FULL; + phydev->port = PORT_FIBRE; return 0; } @@ -690,6 +691,7 @@ static int mv3310_read_status_copper(struct phy_device *phydev) phydev->duplex = cssr1 & MV_PCS_CSSR1_DUPLEX_FULL ? DUPLEX_FULL : DUPLEX_HALF; + phydev->port = PORT_TP; phydev->mdix = cssr1 & MV_PCS_CSSR1_MDIX ? ETH_TP_MDI_X : ETH_TP_MDI; diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 494abf608b8f..7ec6f70d6a82 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -341,14 +341,19 @@ static int kszphy_config_init(struct phy_device *phydev) return kszphy_config_reset(phydev); } +static int ksz8041_fiber_mode(struct phy_device *phydev) +{ + struct device_node *of_node = phydev->mdio.dev.of_node; + + return of_property_read_bool(of_node, "micrel,fiber-mode"); +} + static int ksz8041_config_init(struct phy_device *phydev) { __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; - struct device_node *of_node = phydev->mdio.dev.of_node; - /* Limit supported and advertised modes in fiber mode */ - if (of_property_read_bool(of_node, "micrel,fiber-mode")) { + if (ksz8041_fiber_mode(phydev)) { phydev->dev_flags |= MICREL_PHY_FXEN; linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, mask); linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, mask); @@ -1176,6 +1181,9 @@ static int kszphy_probe(struct phy_device *phydev) } } + if (ksz8041_fiber_mode(phydev)) + phydev->port = PORT_FIBRE; + /* Support legacy board-file configuration */ if (phydev->dev_flags & MICREL_PHY_50MHZ_CLK) { priv->rmii_ref_clk_sel = true; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 9cb7e4dbf8f4..fdb914b5b857 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -308,7 +308,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, if (phydev->interface == PHY_INTERFACE_MODE_MOCA) cmd->base.port = PORT_BNC; else - cmd->base.port = PORT_MII; + cmd->base.port = phydev->port; cmd->base.transceiver = phy_is_internal(phydev) ? XCVR_INTERNAL : XCVR_EXTERNAL; cmd->base.phy_address = phydev->mdio.addr; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8447e56ba572..30a20a29ae05 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -606,6 +606,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, dev->pause = 0; dev->asym_pause = 0; dev->link = 0; + dev->port = PORT_TP; dev->interface = PHY_INTERFACE_MODE_GMII; dev->autoneg = AUTONEG_ENABLE; @@ -1403,6 +1404,14 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, phydev->state = PHY_READY; + /* Port is set to PORT_TP by default and the actual PHY driver will set + * it to different value depending on the PHY configuration. If we have + * the generic PHY driver we can't figure it out, thus set the old + * legacy PORT_MII value. + */ + if (using_genphy) + phydev->port = PORT_MII; + /* Initial carrier state is off as the phy is about to be * (re)initialized. */ diff --git a/include/linux/phy.h b/include/linux/phy.h index bc323fbdd21e..c130788306c8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -503,6 +503,7 @@ struct macsec_ops; * * @speed: Current link speed * @duplex: Current duplex + * @port: Current port * @pause: Current pause * @asym_pause: Current asymmetric pause * @supported: Combined MAC/PHY supported linkmodes @@ -581,6 +582,7 @@ struct phy_device { */ int speed; int duplex; + int port; int pause; int asym_pause; u8 master_slave_get; -- cgit v1.2.3 From dcf0cd1cc58b8e88793ad6531db9b3a47324ca09 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 9 Feb 2021 19:02:11 -0600 Subject: net: hsr: add offloading support Add support for offloading of HSR/PRP (IEC 62439-3) tag insertion tag removal, duplicate generation and forwarding. For HSR, insertion involves the switch adding a 6 byte HSR header after the 14 byte Ethernet header. For PRP it adds a 6 byte trailer. Tag removal involves automatically stripping the HSR/PRP header/trailer in the switch. This is possible when the switch also performs auto deduplication using the HSR/PRP header/trailer (making it no longer required). Forwarding involves automatically forwarding between redundant ports in an HSR. This is crucial because delay is accumulated as a frame passes through each node in the ring. Duplication involves the switch automatically sending a single frame from the CPU port to both redundant ports. This is required because the inserted HSR/PRP header/trailer must contain the same sequence number on the frames sent out both redundant ports. Export is_hsr_master so DSA can tell them apart from other devices in dsa_slave_changeupper. Signed-off-by: George McCollister Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/netdev-features.rst | 21 +++++++++++++++++++++ include/linux/if_hsr.h | 27 +++++++++++++++++++++++++++ include/linux/netdev_features.h | 9 +++++++++ net/ethtool/common.c | 4 ++++ net/hsr/hsr_device.c | 14 +++----------- net/hsr/hsr_device.h | 1 - net/hsr/hsr_forward.c | 27 ++++++++++++++++++++++++--- net/hsr/hsr_forward.h | 1 + net/hsr/hsr_framereg.c | 2 ++ net/hsr/hsr_main.c | 11 +++++++++++ net/hsr/hsr_main.h | 8 +------- net/hsr/hsr_slave.c | 10 ++++++---- 12 files changed, 109 insertions(+), 26 deletions(-) create mode 100644 include/linux/if_hsr.h (limited to 'include') diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index a2d7d7160e39..d7b15bb64deb 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -182,3 +182,24 @@ stricter than Hardware LRO. A packet stream merged by Hardware GRO must be re-segmentable by GSO or TSO back to the exact original packet stream. Hardware GRO is dependent on RXCSUM since every packet successfully merged by hardware must also have the checksum verified by hardware. + +* hsr-tag-ins-offload + +This should be set for devices which insert an HSR (High-availability Seamless +Redundancy) or PRP (Parallel Redundancy Protocol) tag automatically. + +* hsr-tag-rm-offload + +This should be set for devices which remove HSR (High-availability Seamless +Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically. + +* hsr-fwd-offload + +This should be set for devices which forward HSR (High-availability Seamless +Redundancy) frames from one port to another in hardware. + +* hsr-dup-offload + +This should be set for devices which duplicate outgoing HSR (High-availability +Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically +frames in hardware. diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h new file mode 100644 index 000000000000..38bbc537d4e4 --- /dev/null +++ b/include/linux/if_hsr.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IF_HSR_H_ +#define _LINUX_IF_HSR_H_ + +/* used to differentiate various protocols */ +enum hsr_version { + HSR_V0 = 0, + HSR_V1, + PRP_V1, +}; + +#if IS_ENABLED(CONFIG_HSR) +extern bool is_hsr_master(struct net_device *dev); +extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver); +#else +static inline bool is_hsr_master(struct net_device *dev) +{ + return false; +} +static inline int hsr_get_version(struct net_device *dev, + enum hsr_version *ver) +{ + return -EINVAL; +} +#endif /* CONFIG_HSR */ + +#endif /*_LINUX_IF_HSR_H_*/ diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index c06d6aaba9df..3de38d6a0aea 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -86,6 +86,11 @@ enum { NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */ NETIF_F_GRO_UDP_FWD_BIT, /* Allow UDP GRO for forwarding */ + NETIF_F_HW_HSR_TAG_INS_BIT, /* Offload HSR tag insertion */ + NETIF_F_HW_HSR_TAG_RM_BIT, /* Offload HSR tag removal */ + NETIF_F_HW_HSR_FWD_BIT, /* Offload HSR forwarding */ + NETIF_F_HW_HSR_DUP_BIT, /* Offload HSR duplication */ + /* * Add your fresh new feature above and remember to update * netdev_features_strings[] in net/core/ethtool.c and maybe @@ -159,6 +164,10 @@ enum { #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST) #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC) #define NETIF_F_GRO_UDP_FWD __NETIF_F(GRO_UDP_FWD) +#define NETIF_F_HW_HSR_TAG_INS __NETIF_F(HW_HSR_TAG_INS) +#define NETIF_F_HW_HSR_TAG_RM __NETIF_F(HW_HSR_TAG_RM) +#define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD) +#define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP) /* Finds the next feature with the highest number of the range of start till 0. */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 835b9bba3e7e..c6a383dfd6c2 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -69,6 +69,10 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload", [NETIF_F_GRO_UDP_FWD_BIT] = "rx-udp-gro-forwarding", + [NETIF_F_HW_HSR_TAG_INS_BIT] = "hsr-tag-ins-offload", + [NETIF_F_HW_HSR_TAG_RM_BIT] = "hsr-tag-rm-offload", + [NETIF_F_HW_HSR_FWD_BIT] = "hsr-fwd-offload", + [NETIF_F_HW_HSR_DUP_BIT] = "hsr-dup-offload", }; const char diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index ec6a68b403d5..7444ec6e298e 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -417,6 +417,7 @@ static struct hsr_proto_ops hsr_ops = { .send_sv_frame = send_hsr_supervision_frame, .create_tagged_frame = hsr_create_tagged_frame, .get_untagged_frame = hsr_get_untagged_frame, + .drop_frame = hsr_drop_frame, .fill_frame_info = hsr_fill_frame_info, .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, }; @@ -464,10 +465,11 @@ void hsr_dev_setup(struct net_device *dev) /* Return true if dev is a HSR master; return false otherwise. */ -inline bool is_hsr_master(struct net_device *dev) +bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } +EXPORT_SYMBOL(is_hsr_master); /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { @@ -520,16 +522,6 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr->prot_version = protocol_version; - /* FIXME: should I modify the value of these? - * - * - hsr_dev->flags - i.e. - * IFF_MASTER/SLAVE? - * - hsr_dev->priv_flags - i.e. - * IFF_EBRIDGE? - * IFF_TX_SKB_SHARING? - * IFF_HSR_MASTER/SLAVE? - */ - /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 868373822ee4..9060c92168f9 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -19,6 +19,5 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec, u8 protocol_version, struct netlink_ext_ack *extack); void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); -bool is_hsr_master(struct net_device *dev); int hsr_get_max_mtu(struct hsr_priv *hsr); #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index d32cd87d5c5b..ed82a470b6e1 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -249,6 +249,8 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, /* set the lane id properly */ hsr_set_path_id(hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); + } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { + return skb_clone(frame->skb_std, GFP_ATOMIC); } /* Create the new skb with enough headroom to fit the HSR tag */ @@ -291,6 +293,8 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, return NULL; } return skb_clone(frame->skb_prp, GFP_ATOMIC); + } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { + return skb_clone(frame->skb_std, GFP_ATOMIC); } skb = skb_copy_expand(frame->skb_std, 0, @@ -343,6 +347,14 @@ bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) port->type == HSR_PT_SLAVE_A)); } +bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) +{ + if (port->dev->features & NETIF_F_HW_HSR_FWD) + return prp_drop_frame(frame, port); + + return false; +} + /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before @@ -359,6 +371,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame) { struct hsr_port *port; struct sk_buff *skb; + bool sent = false; hsr_for_each_port(frame->port_rcv->hsr, port) { struct hsr_priv *hsr = port->hsr; @@ -374,6 +387,12 @@ static void hsr_forward_do(struct hsr_frame_info *frame) if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; + /* If hardware duplicate generation is enabled, only send out + * one port. + */ + if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent) + continue; + /* Don't send frame over port where it has been sent before. * Also fro SAN, this shouldn't be done. */ @@ -405,10 +424,12 @@ static void hsr_forward_do(struct hsr_frame_info *frame) } skb->dev = port->dev; - if (port->type == HSR_PT_MASTER) + if (port->type == HSR_PT_MASTER) { hsr_deliver_master(skb, port->dev, frame->node_src); - else - hsr_xmit(skb, port, frame); + } else { + if (!hsr_xmit(skb, port, frame)) + sent = true; + } } } diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 618140d484ad..b6acaafa83fc 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -23,6 +23,7 @@ struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port); bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port); +bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port); void prp_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame); void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 5c97de459905..f9a8cc82ae2e 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -277,6 +277,8 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; + else if (frame->skb_std) + skb = frame->skb_std; if (!skb) return; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 2fd1976e5b1c..f7e284f23b1f 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -131,6 +131,17 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) return NULL; } +int hsr_get_version(struct net_device *dev, enum hsr_version *ver) +{ + struct hsr_priv *hsr; + + hsr = netdev_priv(dev); + *ver = hsr->prot_version; + + return 0; +} +EXPORT_SYMBOL(hsr_get_version); + static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index a9c30a608e35..a169808ee78a 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -13,6 +13,7 @@ #include #include #include +#include /* Time constants as specified in the HSR specification (IEC-62439-3 2010) * Table 8. @@ -171,13 +172,6 @@ struct hsr_port { enum hsr_port_type type; }; -/* used by driver internally to differentiate various protocols */ -enum hsr_version { - HSR_V0 = 0, - HSR_V1, - PRP_V1, -}; - struct hsr_frame_info; struct hsr_node; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 36d5fcf09c61..c5227d42faf5 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -48,12 +48,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) goto finish_consume; } - /* For HSR, only tagged frames are expected, but for PRP - * there could be non tagged frames as well from Single - * attached nodes (SANs). + /* For HSR, only tagged frames are expected (unless the device offloads + * HSR tag removal), but for PRP there could be non tagged frames as + * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; - if (hsr->proto_ops->invalid_dan_ingress_frame && + + if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; -- cgit v1.2.3 From 18596f504a3e56c4f8e132b2a437cbe23a3f4635 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 9 Feb 2021 19:02:12 -0600 Subject: net: dsa: add support for offloading HSR Add support for offloading of HSR/PRP (IEC 62439-3) tag insertion tag removal, duplicate generation and forwarding on DSA switches. Add DSA_NOTIFIER_HSR_JOIN and DSA_NOTIFIER_HSR_LEAVE which trigger calls to .port_hsr_join and .port_hsr_leave in the DSA driver for the switch. The DSA switch driver should then set netdev feature flags for the HSR/PRP operation that it offloads. NETIF_F_HW_HSR_TAG_INS NETIF_F_HW_HSR_TAG_RM NETIF_F_HW_HSR_FWD NETIF_F_HW_HSR_DUP Signed-off-by: George McCollister Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 13 +++++++++++++ net/dsa/dsa_priv.h | 11 +++++++++++ net/dsa/port.c | 34 ++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 14 ++++++++++++++ net/dsa/switch.c | 24 ++++++++++++++++++++++++ 5 files changed, 96 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 60acb9fca124..d8de23ce7221 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -172,6 +172,10 @@ struct dsa_switch_tree { list_for_each_entry((_dp), &(_dst)->ports, list) \ if ((_dp)->lag_dev == (_lag)) +#define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ + list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ + if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) + static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, unsigned int id) { @@ -264,6 +268,7 @@ struct dsa_port { struct phylink_config pl_config; struct net_device *lag_dev; bool lag_tx_enabled; + struct net_device *hsr_dev; struct list_head list; @@ -769,6 +774,14 @@ struct dsa_switch_ops { struct netdev_lag_upper_info *info); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct net_device *lag); + + /* + * HSR integration + */ + int (*port_hsr_join)(struct dsa_switch *ds, int port, + struct net_device *hsr); + int (*port_hsr_leave)(struct dsa_switch *ds, int port, + struct net_device *hsr); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 8a1bcb2b4208..7060f128386b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -20,6 +20,8 @@ enum { DSA_NOTIFIER_BRIDGE_LEAVE, DSA_NOTIFIER_FDB_ADD, DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_HSR_JOIN, + DSA_NOTIFIER_HSR_LEAVE, DSA_NOTIFIER_LAG_CHANGE, DSA_NOTIFIER_LAG_JOIN, DSA_NOTIFIER_LAG_LEAVE, @@ -100,6 +102,13 @@ struct dsa_switchdev_event_work { u16 vid; }; +/* DSA_NOTIFIER_HSR_* */ +struct dsa_notifier_hsr_info { + struct net_device *hsr; + int sw_index; + int port; +}; + struct dsa_slave_priv { /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, @@ -183,6 +192,8 @@ int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); +int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); +void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; static inline bool dsa_port_offloads_netdev(struct dsa_port *dp, diff --git a/net/dsa/port.c b/net/dsa/port.c index 5e079a61528e..b93bda463026 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -868,3 +868,37 @@ int dsa_port_get_phy_sset_count(struct dsa_port *dp) return ret; } EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count); + +int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr) +{ + struct dsa_notifier_hsr_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .hsr = hsr, + }; + int err; + + dp->hsr_dev = hsr; + + err = dsa_port_notify(dp, DSA_NOTIFIER_HSR_JOIN, &info); + if (err) + dp->hsr_dev = NULL; + + return err; +} + +void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) +{ + struct dsa_notifier_hsr_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .hsr = hsr, + }; + int err; + + dp->hsr_dev = NULL; + + err = dsa_port_notify(dp, DSA_NOTIFIER_HSR_LEAVE, &info); + if (err) + pr_err("DSA: failed to notify DSA_NOTIFIER_HSR_LEAVE\n"); +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 431bdbdd8473..a95e78d59740 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1938,6 +1939,19 @@ static int dsa_slave_changeupper(struct net_device *dev, dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } + } else if (is_hsr_master(info->upper_dev)) { + if (info->linking) { + err = dsa_port_hsr_join(dp, info->upper_dev); + if (err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(info->info.extack, + "Offloading not supported"); + err = 0; + } + err = notifier_from_errno(err); + } else { + dsa_port_hsr_leave(dp, info->upper_dev); + err = NOTIFY_OK; + } } return err; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5026e4143663..1906179e59f7 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -166,6 +166,24 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); } +static int dsa_switch_hsr_join(struct dsa_switch *ds, + struct dsa_notifier_hsr_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_hsr_join) + return ds->ops->port_hsr_join(ds, info->port, info->hsr); + + return -EOPNOTSUPP; +} + +static int dsa_switch_hsr_leave(struct dsa_switch *ds, + struct dsa_notifier_hsr_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_hsr_leave) + return ds->ops->port_hsr_leave(ds, info->port, info->hsr); + + return -EOPNOTSUPP; +} + static int dsa_switch_lag_change(struct dsa_switch *ds, struct dsa_notifier_lag_info *info) { @@ -371,6 +389,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_FDB_DEL: err = dsa_switch_fdb_del(ds, info); break; + case DSA_NOTIFIER_HSR_JOIN: + err = dsa_switch_hsr_join(ds, info); + break; + case DSA_NOTIFIER_HSR_LEAVE: + err = dsa_switch_hsr_leave(ds, info); + break; case DSA_NOTIFIER_LAG_CHANGE: err = dsa_switch_lag_change(ds, info); break; -- cgit v1.2.3 From 3d368ab87cf6681f928de1ddf804d69600671bb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Feb 2021 06:41:44 -0800 Subject: net: initialize net->net_cookie at netns setup It is simpler to make net->net_cookie a plain u64 written once in setup_net() instead of looping and using atomic64 helpers. Lorenz Bauer wants to add SO_NETNS_COOKIE socket option and this patch would makes his patch series simpler. Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Cc: Lorenz Bauer Acked-by: Daniel Borkmann Tested-by: Lorenz Bauer Signed-off-by: David S. Miller --- include/net/net_namespace.h | 4 +--- net/core/filter.c | 8 +++----- net/core/net_namespace.c | 19 +++---------------- 3 files changed, 7 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 29567875f428..dcaee24a4d87 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -165,7 +165,7 @@ struct net { struct netns_xfrm xfrm; #endif - atomic64_t net_cookie; /* written once */ + u64 net_cookie; /* written once */ #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; @@ -224,8 +224,6 @@ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); struct net *get_net_ns_by_fd(int fd); -u64 __net_gen_cookie(struct net *net); - #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); void ipx_unregister_sysctl(void); diff --git a/net/core/filter.c b/net/core/filter.c index 9ab94e90d660..74bd401bf483 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4645,11 +4645,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { static u64 __bpf_get_netns_cookie(struct sock *sk) { -#ifdef CONFIG_NET_NS - return __net_gen_cookie(sk ? sk->sk_net.net : &init_net); -#else - return 0; -#endif + const struct net *net = sk ? sock_net(sk) : &init_net; + + return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2ef3b4557f40..43b6ac4c4439 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -72,18 +72,6 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; DEFINE_COOKIE(net_cookie); -u64 __net_gen_cookie(struct net *net) -{ - while (1) { - u64 res = atomic64_read(&net->net_cookie); - - if (res) - return res; - res = gen_cookie_next(&net_cookie); - atomic64_cmpxchg(&net->net_cookie, 0, res); - } -} - static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -332,6 +320,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); + preempt_disable(); + net->net_cookie = gen_cookie_next(&net_cookie); + preempt_enable(); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); @@ -1103,10 +1094,6 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - preempt_disable(); - __net_gen_cookie(&init_net); - preempt_enable(); - down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); -- cgit v1.2.3 From f2fa0e5e9f31dd90741f1151043ca1eaa4086690 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 11 Feb 2021 11:16:13 +0100 Subject: xen/events: link interdomain events to associated xenbus device In order to support the possibility of per-device event channel settings (e.g. lateeoi spurious event thresholds) add a xenbus device pointer to struct irq_info() and modify the related event channel binding interfaces to take the pointer to the xenbus device as a parameter instead of the domain id of the other side. While at it remove the stale prototype of bind_evtchn_to_irq_lateeoi(). Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Reviewed-by: Wei Liu Reviewed-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/net/xen-netback/interface.c | 16 +++++++-------- drivers/xen/events/events_base.c | 41 ++++++++++++++++++++++--------------- drivers/xen/pvcalls-back.c | 4 ++-- drivers/xen/xen-pciback/xenbus.c | 2 +- drivers/xen/xen-scsiback.c | 2 +- include/xen/events.h | 7 ++++--- 7 files changed, 41 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 9860d4842f36..c2aaf690352c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -245,7 +245,7 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, if (req_prod - rsp_prod > size) goto fail; - err = bind_interdomain_evtchn_to_irqhandler_lateeoi(blkif->domid, + err = bind_interdomain_evtchn_to_irqhandler_lateeoi(blkif->be->dev, evtchn, xen_blkif_be_int, 0, "blkif-backend", ring); if (err < 0) goto fail; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 8e36542fe24d..193b723fe3bd 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -630,13 +630,13 @@ int xenvif_connect_ctrl(struct xenvif *vif, grant_ref_t ring_ref, unsigned int evtchn) { struct net_device *dev = vif->dev; + struct xenbus_device *xendev = xenvif_to_xenbus_device(vif); void *addr; struct xen_netif_ctrl_sring *shared; RING_IDX rsp_prod, req_prod; int err; - err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif), - &ring_ref, 1, &addr); + err = xenbus_map_ring_valloc(xendev, &ring_ref, 1, &addr); if (err) goto err; @@ -650,7 +650,7 @@ int xenvif_connect_ctrl(struct xenvif *vif, grant_ref_t ring_ref, if (req_prod - rsp_prod > RING_SIZE(&vif->ctrl)) goto err_unmap; - err = bind_interdomain_evtchn_to_irq_lateeoi(vif->domid, evtchn); + err = bind_interdomain_evtchn_to_irq_lateeoi(xendev, evtchn); if (err < 0) goto err_unmap; @@ -673,8 +673,7 @@ err_deinit: vif->ctrl_irq = 0; err_unmap: - xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(vif), - vif->ctrl.sring); + xenbus_unmap_ring_vfree(xendev, vif->ctrl.sring); vif->ctrl.sring = NULL; err: @@ -719,6 +718,7 @@ int xenvif_connect_data(struct xenvif_queue *queue, unsigned int tx_evtchn, unsigned int rx_evtchn) { + struct xenbus_device *dev = xenvif_to_xenbus_device(queue->vif); struct task_struct *task; int err; @@ -755,7 +755,7 @@ int xenvif_connect_data(struct xenvif_queue *queue, if (tx_evtchn == rx_evtchn) { /* feature-split-event-channels == 0 */ err = bind_interdomain_evtchn_to_irqhandler_lateeoi( - queue->vif->domid, tx_evtchn, xenvif_interrupt, 0, + dev, tx_evtchn, xenvif_interrupt, 0, queue->name, queue); if (err < 0) goto err; @@ -766,7 +766,7 @@ int xenvif_connect_data(struct xenvif_queue *queue, snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name), "%s-tx", queue->name); err = bind_interdomain_evtchn_to_irqhandler_lateeoi( - queue->vif->domid, tx_evtchn, xenvif_tx_interrupt, 0, + dev, tx_evtchn, xenvif_tx_interrupt, 0, queue->tx_irq_name, queue); if (err < 0) goto err; @@ -776,7 +776,7 @@ int xenvif_connect_data(struct xenvif_queue *queue, snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name), "%s-rx", queue->name); err = bind_interdomain_evtchn_to_irqhandler_lateeoi( - queue->vif->domid, rx_evtchn, xenvif_rx_interrupt, 0, + dev, rx_evtchn, xenvif_rx_interrupt, 0, queue->rx_irq_name, queue); if (err < 0) goto err; diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index e850f79351cb..b249f2d6b0cc 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include "events_internal.h" @@ -115,6 +116,7 @@ struct irq_info { unsigned char flags; uint16_t domid; } pirq; + struct xenbus_device *interdomain; } u; }; @@ -313,11 +315,16 @@ static int xen_irq_info_common_setup(struct irq_info *info, } static int xen_irq_info_evtchn_setup(unsigned irq, - evtchn_port_t evtchn) + evtchn_port_t evtchn, + struct xenbus_device *dev) { struct irq_info *info = info_for_irq(irq); + int ret; - return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); + ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); + info->u.interdomain = dev; + + return ret; } static int xen_irq_info_ipi_setup(unsigned cpu, @@ -1116,7 +1123,8 @@ int xen_pirq_from_irq(unsigned irq) } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); -static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip) +static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, + struct xenbus_device *dev) { int irq; int ret; @@ -1136,7 +1144,7 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip) irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "event"); - ret = xen_irq_info_evtchn_setup(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn, dev); if (ret < 0) { __unbind_from_irq(irq); irq = ret; @@ -1163,7 +1171,7 @@ out: int bind_evtchn_to_irq(evtchn_port_t evtchn) { - return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip); + return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL); } EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); @@ -1212,27 +1220,27 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; } -static int bind_interdomain_evtchn_to_irq_chip(unsigned int remote_domain, +static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, evtchn_port_t remote_port, struct irq_chip *chip) { struct evtchn_bind_interdomain bind_interdomain; int err; - bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_dom = dev->otherend_id; bind_interdomain.remote_port = remote_port; err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain); return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port, - chip); + chip, dev); } -int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain, +int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, evtchn_port_t remote_port) { - return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + return bind_interdomain_evtchn_to_irq_chip(dev, remote_port, &xen_lateeoi_chip); } EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); @@ -1345,7 +1353,7 @@ static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn, { int irq, retval; - irq = bind_evtchn_to_irq_chip(evtchn, chip); + irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL); if (irq < 0) return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); @@ -1380,14 +1388,13 @@ int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi); static int bind_interdomain_evtchn_to_irqhandler_chip( - unsigned int remote_domain, evtchn_port_t remote_port, + struct xenbus_device *dev, evtchn_port_t remote_port, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id, struct irq_chip *chip) { int irq, retval; - irq = bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, - chip); + irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip); if (irq < 0) return irq; @@ -1400,14 +1407,14 @@ static int bind_interdomain_evtchn_to_irqhandler_chip( return irq; } -int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain, +int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev, evtchn_port_t remote_port, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { - return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain, + return bind_interdomain_evtchn_to_irqhandler_chip(dev, remote_port, handler, irqflags, devname, dev_id, &xen_lateeoi_chip); } @@ -1679,7 +1686,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - (void)xen_irq_info_evtchn_setup(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index a7d293fa8d14..b47fd8435061 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -348,7 +348,7 @@ static struct sock_mapping *pvcalls_new_active_socket( map->bytes = page; ret = bind_interdomain_evtchn_to_irqhandler_lateeoi( - fedata->dev->otherend_id, evtchn, + fedata->dev, evtchn, pvcalls_back_conn_event, 0, "pvcalls-backend", map); if (ret < 0) goto out; @@ -948,7 +948,7 @@ static int backend_connect(struct xenbus_device *dev) goto error; } - err = bind_interdomain_evtchn_to_irq_lateeoi(dev->otherend_id, evtchn); + err = bind_interdomain_evtchn_to_irq_lateeoi(dev, evtchn); if (err < 0) goto error; fedata->irq = err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index e7c692cfb2cf..5188f02e75fb 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -124,7 +124,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, pdev->sh_info = vaddr; err = bind_interdomain_evtchn_to_irqhandler_lateeoi( - pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, + pdev->xdev, remote_evtchn, xen_pcibk_handle_event, 0, DRV_NAME, pdev); if (err < 0) { xenbus_dev_fatal(pdev->xdev, err, diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 862162dca33c..8b59897b2df9 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -799,7 +799,7 @@ static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, sring = (struct vscsiif_sring *)area; BACK_RING_INIT(&info->ring, sring, PAGE_SIZE); - err = bind_interdomain_evtchn_to_irq_lateeoi(info->domid, evtchn); + err = bind_interdomain_evtchn_to_irq_lateeoi(info->dev, evtchn); if (err < 0) goto unmap_page; diff --git a/include/xen/events.h b/include/xen/events.h index 8ec418e30c7f..c204262d9fc2 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -12,10 +12,11 @@ #include #include +struct xenbus_device; + unsigned xen_evtchn_nr_channels(void); int bind_evtchn_to_irq(evtchn_port_t evtchn); -int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn); int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, irq_handler_t handler, unsigned long irqflags, const char *devname, @@ -35,9 +36,9 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, unsigned long irqflags, const char *devname, void *dev_id); -int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain, +int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, evtchn_port_t remote_port); -int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain, +int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev, evtchn_port_t remote_port, irq_handler_t handler, unsigned long irqflags, -- cgit v1.2.3 From 4c236d5dc8b86222dc155cd68e7934624264150f Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Thu, 11 Feb 2021 21:28:27 +0530 Subject: octeontx2-pf: cn10k: Use LMTST lines for NPA/NIX operations This patch adds support to use new LMTST lines for NPA batch free and burst SQE flush. Adds new dev_hw_ops structure to hold platform specific functions and create new files cn10k.c and cn10k.h. Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- .../net/ethernet/marvell/octeontx2/nic/Makefile | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c | 181 +++++++++++++++++++++ drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h | 17 ++ .../ethernet/marvell/octeontx2/nic/otx2_common.c | 84 ++++------ .../ethernet/marvell/octeontx2/nic/otx2_common.h | 68 +++++++- .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 36 +--- .../net/ethernet/marvell/octeontx2/nic/otx2_reg.h | 1 + .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 39 ++--- .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.h | 7 + .../net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 28 +--- include/linux/soc/marvell/octeontx2/asm.h | 8 + 11 files changed, 336 insertions(+), 135 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h (limited to 'include') diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile index 29c82b94b2dc..745aa8a19499 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_OCTEONTX2_PF) += rvu_nicpf.o obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \ - otx2_ptp.o otx2_flows.o + otx2_ptp.o otx2_flows.o cn10k.o rvu_nicvf-y := otx2_vf.o ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c new file mode 100644 index 000000000000..d6ca809edaed --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell OcteonTx2 RVU Physcial Function ethernet driver + * + * Copyright (C) 2020 Marvell. + */ + +#include "cn10k.h" +#include "otx2_reg.h" +#include "otx2_struct.h" + +static struct dev_hw_ops otx2_hw_ops = { + .sq_aq_init = otx2_sq_aq_init, + .sqe_flush = otx2_sqe_flush, + .aura_freeptr = otx2_aura_freeptr, + .refill_pool_ptrs = otx2_refill_pool_ptrs, +}; + +static struct dev_hw_ops cn10k_hw_ops = { + .sq_aq_init = cn10k_sq_aq_init, + .sqe_flush = cn10k_sqe_flush, + .aura_freeptr = cn10k_aura_freeptr, + .refill_pool_ptrs = cn10k_refill_pool_ptrs, +}; + +int cn10k_pf_lmtst_init(struct otx2_nic *pf) +{ + int size, num_lines; + u64 base; + + if (!test_bit(CN10K_LMTST, &pf->hw.cap_flag)) { + pf->hw_ops = &otx2_hw_ops; + return 0; + } + + pf->hw_ops = &cn10k_hw_ops; + base = pci_resource_start(pf->pdev, PCI_MBOX_BAR_NUM) + + (MBOX_SIZE * (pf->total_vfs + 1)); + + size = pci_resource_len(pf->pdev, PCI_MBOX_BAR_NUM) - + (MBOX_SIZE * (pf->total_vfs + 1)); + + pf->hw.lmt_base = ioremap(base, size); + + if (!pf->hw.lmt_base) { + dev_err(pf->dev, "Unable to map PF LMTST region\n"); + return -ENOMEM; + } + + /* FIXME: Get the num of LMTST lines from LMT table */ + pf->tot_lmt_lines = size / LMT_LINE_SIZE; + num_lines = (pf->tot_lmt_lines - NIX_LMTID_BASE) / + pf->hw.tx_queues; + /* Number of LMT lines per SQ queues */ + pf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines; + + pf->nix_lmt_size = pf->nix_lmt_lines * LMT_LINE_SIZE; + return 0; +} + +int cn10k_vf_lmtst_init(struct otx2_nic *vf) +{ + int size, num_lines; + + if (!test_bit(CN10K_LMTST, &vf->hw.cap_flag)) { + vf->hw_ops = &otx2_hw_ops; + return 0; + } + + vf->hw_ops = &cn10k_hw_ops; + size = pci_resource_len(vf->pdev, PCI_MBOX_BAR_NUM); + vf->hw.lmt_base = ioremap_wc(pci_resource_start(vf->pdev, + PCI_MBOX_BAR_NUM), + size); + if (!vf->hw.lmt_base) { + dev_err(vf->dev, "Unable to map VF LMTST region\n"); + return -ENOMEM; + } + + vf->tot_lmt_lines = size / LMT_LINE_SIZE; + /* LMTST lines per SQ */ + num_lines = (vf->tot_lmt_lines - NIX_LMTID_BASE) / + vf->hw.tx_queues; + vf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines; + vf->nix_lmt_size = vf->nix_lmt_lines * LMT_LINE_SIZE; + return 0; +} +EXPORT_SYMBOL(cn10k_vf_lmtst_init); + +int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura) +{ + struct nix_cn10k_aq_enq_req *aq; + struct otx2_nic *pfvf = dev; + struct otx2_snd_queue *sq; + + sq = &pfvf->qset.sq[qidx]; + sq->lmt_addr = (__force u64 *)((u64)pfvf->hw.nix_lmt_base + + (qidx * pfvf->nix_lmt_size)); + + /* Get memory to put this msg */ + aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox); + if (!aq) + return -ENOMEM; + + aq->sq.cq = pfvf->hw.rx_queues + qidx; + aq->sq.max_sqe_size = NIX_MAXSQESZ_W16; /* 128 byte */ + aq->sq.cq_ena = 1; + aq->sq.ena = 1; + /* Only one SMQ is allocated, map all SQ's to that SMQ */ + aq->sq.smq = pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0]; + /* FIXME: set based on NIX_AF_DWRR_RPM_MTU*/ + aq->sq.smq_rr_weight = OTX2_MAX_MTU; + aq->sq.default_chan = pfvf->hw.tx_chan_base; + aq->sq.sqe_stype = NIX_STYPE_STF; /* Cache SQB */ + aq->sq.sqb_aura = sqb_aura; + aq->sq.sq_int_ena = NIX_SQINT_BITS; + aq->sq.qint_idx = 0; + /* Due pipelining impact minimum 2000 unused SQ CQE's + * need to maintain to avoid CQ overflow. + */ + aq->sq.cq_limit = ((SEND_CQ_SKID * 256) / (pfvf->qset.sqe_cnt)); + + /* Fill AQ info */ + aq->qidx = qidx; + aq->ctype = NIX_AQ_CTYPE_SQ; + aq->op = NIX_AQ_INSTOP_INIT; + + return otx2_sync_mbox_msg(&pfvf->mbox); +} + +#define NPA_MAX_BURST 16 +void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) +{ + struct otx2_nic *pfvf = dev; + u64 ptrs[NPA_MAX_BURST]; + int num_ptrs = 1; + dma_addr_t bufptr; + + /* Refill pool with new buffers */ + while (cq->pool_ptrs) { + if (otx2_alloc_buffer(pfvf, cq, &bufptr)) { + if (num_ptrs--) + __cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs, + num_ptrs, + cq->rbpool->lmt_addr); + break; + } + cq->pool_ptrs--; + ptrs[num_ptrs] = (u64)bufptr + OTX2_HEAD_ROOM; + num_ptrs++; + if (num_ptrs == NPA_MAX_BURST || cq->pool_ptrs == 0) { + __cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs, + num_ptrs, + cq->rbpool->lmt_addr); + num_ptrs = 1; + } + } +} + +void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx) +{ + struct otx2_nic *pfvf = dev; + int lmt_id = NIX_LMTID_BASE + (qidx * pfvf->nix_lmt_lines); + u64 val = 0, tar_addr = 0; + + /* FIXME: val[0:10] LMT_ID. + * [12:15] no of LMTST - 1 in the burst. + * [19:63] data size of each LMTST in the burst except first. + */ + val = (lmt_id & 0x7FF); + /* Target address for LMTST flush tells HW how many 128bit + * words are present. + * tar_addr[6:4] size of first LMTST - 1 in units of 128b. + */ + tar_addr |= sq->io_addr | (((size / 16) - 1) & 0x7) << 4; + dma_wmb(); + memcpy(sq->lmt_addr, sq->sqe_base, size); + cn10k_lmt_flush(val, tar_addr); + + sq->head++; + sq->head &= (sq->sqe_cnt - 1); +} diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h new file mode 100644 index 000000000000..e0bc595cbb78 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Marvell OcteonTx2 RVU Ethernet driver + * + * Copyright (C) 2020 Marvell. + */ + +#ifndef CN10K_H +#define CN10K_H + +#include "otx2_common.h" + +void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); +int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); +int cn10k_pf_lmtst_init(struct otx2_nic *pf); +int cn10k_vf_lmtst_init(struct otx2_nic *vf); +#endif /* CN10K_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index dbbdc3453f1a..2779802eed84 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -15,6 +15,7 @@ #include "otx2_reg.h" #include "otx2_common.h" #include "otx2_struct.h" +#include "cn10k.h" static void otx2_nix_rq_op_stats(struct queue_stats *stats, struct otx2_nic *pfvf, int qidx) @@ -526,6 +527,26 @@ static int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, return ret; } +int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, + dma_addr_t *dma) +{ + if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma))) { + struct refill_work *work; + struct delayed_work *dwork; + + work = &pfvf->refill_wrk[cq->cq_idx]; + dwork = &work->pool_refill_work; + /* Schedule a task if no other task is running */ + if (!cq->refill_task_sched) { + cq->refill_task_sched = true; + schedule_delayed_work(dwork, + msecs_to_jiffies(100)); + } + return -ENOMEM; + } + return 0; +} + void otx2_tx_timeout(struct net_device *netdev, unsigned int txq) { struct otx2_nic *pfvf = netdev_priv(netdev); @@ -728,9 +749,6 @@ void otx2_sqb_flush(struct otx2_nic *pfvf) #define RQ_PASS_LVL_AURA (255 - ((95 * 256) / 100)) /* RED when 95% is full */ #define RQ_DROP_LVL_AURA (255 - ((99 * 256) / 100)) /* Drop when 99% is full */ -/* Send skid of 2000 packets required for CQ size of 4K CQEs. */ -#define SEND_CQ_SKID 2000 - static int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura) { struct otx2_qset *qset = &pfvf->qset; @@ -764,45 +782,14 @@ static int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura) return otx2_sync_mbox_msg(&pfvf->mbox); } -static int cn10k_sq_aq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) -{ - struct nix_cn10k_aq_enq_req *aq; - - /* Get memory to put this msg */ - aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox); - if (!aq) - return -ENOMEM; - - aq->sq.cq = pfvf->hw.rx_queues + qidx; - aq->sq.max_sqe_size = NIX_MAXSQESZ_W16; /* 128 byte */ - aq->sq.cq_ena = 1; - aq->sq.ena = 1; - /* Only one SMQ is allocated, map all SQ's to that SMQ */ - aq->sq.smq = pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0]; - /* FIXME: set based on NIX_AF_DWRR_RPM_MTU*/ - aq->sq.smq_rr_weight = OTX2_MAX_MTU; - aq->sq.default_chan = pfvf->hw.tx_chan_base; - aq->sq.sqe_stype = NIX_STYPE_STF; /* Cache SQB */ - aq->sq.sqb_aura = sqb_aura; - aq->sq.sq_int_ena = NIX_SQINT_BITS; - aq->sq.qint_idx = 0; - /* Due pipelining impact minimum 2000 unused SQ CQE's - * need to maintain to avoid CQ overflow. - */ - aq->sq.cq_limit = ((SEND_CQ_SKID * 256) / (pfvf->qset.sqe_cnt)); - - /* Fill AQ info */ - aq->qidx = qidx; - aq->ctype = NIX_AQ_CTYPE_SQ; - aq->op = NIX_AQ_INSTOP_INIT; - - return otx2_sync_mbox_msg(&pfvf->mbox); -} - -static int otx2_sq_aq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) +int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura) { + struct otx2_nic *pfvf = dev; + struct otx2_snd_queue *sq; struct nix_aq_enq_req *aq; + sq = &pfvf->qset.sq[qidx]; + sq->lmt_addr = (__force u64 *)(pfvf->reg_base + LMT_LF_LMTLINEX(qidx)); /* Get memory to put this msg */ aq = otx2_mbox_alloc_msg_nix_aq_enq(&pfvf->mbox); if (!aq) @@ -873,16 +860,12 @@ static int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) sq->sqe_thresh = ((sq->num_sqbs * sq->sqe_per_sqb) * 10) / 100; sq->aura_id = sqb_aura; sq->aura_fc_addr = pool->fc_addr->base; - sq->lmt_addr = (__force u64 *)(pfvf->reg_base + LMT_LF_LMTLINEX(qidx)); sq->io_addr = (__force u64)otx2_get_regaddr(pfvf, NIX_LF_OP_SENDX(0)); sq->stats.bytes = 0; sq->stats.pkts = 0; - if (is_dev_otx2(pfvf->pdev)) - return otx2_sq_aq_init(pfvf, qidx, sqb_aura); - else - return cn10k_sq_aq_init(pfvf, qidx, sqb_aura); + return pfvf->hw_ops->sq_aq_init(pfvf, qidx, sqb_aura); } @@ -987,7 +970,7 @@ static void otx2_pool_refill_task(struct work_struct *work) } return; } - otx2_aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM); + pfvf->hw_ops->aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM); cq->pool_ptrs--; } cq->refill_task_sched = false; @@ -1231,6 +1214,11 @@ static int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, pool->rbsize = buf_size; + /* Set LMTST addr for NPA batch free */ + if (test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) + pool->lmt_addr = (__force u64 *)((u64)pfvf->hw.npa_lmt_base + + (pool_id * LMT_LINE_SIZE)); + /* Initialize this pool's context via AF */ aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox); if (!aq) { @@ -1319,7 +1307,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf) for (ptr = 0; ptr < num_sqbs; ptr++) { if (otx2_alloc_rbuf(pfvf, pool, &bufptr)) return -ENOMEM; - otx2_aura_freeptr(pfvf, pool_id, bufptr); + pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr); sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr; } } @@ -1369,8 +1357,8 @@ int otx2_rq_aura_pool_init(struct otx2_nic *pfvf) for (ptr = 0; ptr < num_ptrs; ptr++) { if (otx2_alloc_rbuf(pfvf, pool, &bufptr)) return -ENOMEM; - otx2_aura_freeptr(pfvf, pool_id, - bufptr + OTX2_HEAD_ROOM); + pfvf->hw_ops->aura_freeptr(pfvf, pool_id, + bufptr + OTX2_HEAD_ROOM); } } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 9ac9b420dd95..51aaa6ae0fd3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -50,6 +50,9 @@ enum arua_mapped_qtypes { #define NIX_LF_ERR_VEC 0x81 #define NIX_LF_POISON_VEC 0x82 +/* Send skid of 2000 packets required for CQ size of 4K CQEs. */ +#define SEND_CQ_SKID 2000 + /* RSS configuration */ struct otx2_rss_ctx { u8 ind_tbl[MAX_RSS_INDIR_TBL_SIZE]; @@ -275,9 +278,18 @@ struct otx2_flow_config { struct list_head flow_list; }; +struct dev_hw_ops { + int (*sq_aq_init)(void *dev, u16 qidx, u16 sqb_aura); + void (*sqe_flush)(void *dev, struct otx2_snd_queue *sq, + int size, int qidx); + void (*refill_pool_ptrs)(void *dev, struct otx2_cq_queue *cq); + void (*aura_freeptr)(void *dev, int aura, u64 buf); +}; + struct otx2_nic { void __iomem *reg_base; struct net_device *netdev; + struct dev_hw_ops *hw_ops; void *iommu_domain; u16 max_frs; u16 rbsize; /* Receive buffer size */ @@ -507,10 +519,51 @@ static inline u64 otx2_atomic64_add(u64 incr, u64 *ptr) } #else -#define otx2_write128(lo, hi, addr) +#define otx2_write128(lo, hi, addr) writeq((hi) | (lo), addr) #define otx2_atomic64_add(incr, ptr) ({ *ptr += incr; }) #endif +static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura, + u64 *ptrs, u64 num_ptrs, + u64 *lmt_addr) +{ + u64 size = 0, count_eot = 0; + u64 tar_addr, val = 0; + + tar_addr = (__force u64)otx2_get_regaddr(pfvf, NPA_LF_AURA_BATCH_FREE0); + /* LMTID is same as AURA Id */ + val = (aura & 0x7FF) | BIT_ULL(63); + /* Set if [127:64] of last 128bit word has a valid pointer */ + count_eot = (num_ptrs % 2) ? 0ULL : 1ULL; + /* Set AURA ID to free pointer */ + ptrs[0] = (count_eot << 32) | (aura & 0xFFFFF); + /* Target address for LMTST flush tells HW how many 128bit + * words are valid from NPA_LF_AURA_BATCH_FREE0. + * + * tar_addr[6:4] is LMTST size-1 in units of 128b. + */ + if (num_ptrs > 2) { + size = (sizeof(u64) * num_ptrs) / 16; + if (!count_eot) + size++; + tar_addr |= ((size - 1) & 0x7) << 4; + } + memcpy(lmt_addr, ptrs, sizeof(u64) * num_ptrs); + /* Perform LMTST flush */ + cn10k_lmt_flush(val, tar_addr); +} + +static inline void cn10k_aura_freeptr(void *dev, int aura, u64 buf) +{ + struct otx2_nic *pfvf = dev; + struct otx2_pool *pool; + u64 ptrs[2]; + + pool = &pfvf->qset.pool[aura]; + ptrs[1] = buf; + __cn10k_aura_freeptr(pfvf, aura, ptrs, 2, pool->lmt_addr); +} + /* Alloc pointer from pool/aura */ static inline u64 otx2_aura_allocptr(struct otx2_nic *pfvf, int aura) { @@ -522,11 +575,12 @@ static inline u64 otx2_aura_allocptr(struct otx2_nic *pfvf, int aura) } /* Free pointer to a pool/aura */ -static inline void otx2_aura_freeptr(struct otx2_nic *pfvf, - int aura, u64 buf) +static inline void otx2_aura_freeptr(void *dev, int aura, u64 buf) { - otx2_write128(buf, (u64)aura | BIT_ULL(63), - otx2_get_regaddr(pfvf, NPA_LF_AURA_OP_FREE0)); + struct otx2_nic *pfvf = dev; + void __iomem *addr = otx2_get_regaddr(pfvf, NPA_LF_AURA_OP_FREE0); + + otx2_write128(buf, (u64)aura | BIT_ULL(63), addr); } static inline int otx2_get_pool_idx(struct otx2_nic *pfvf, int type, int idx) @@ -681,6 +735,10 @@ void otx2_ctx_disable(struct mbox *mbox, int type, bool npa); int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable); void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); +int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); +int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); +int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, + dma_addr_t *dma); /* RSS configuration APIs*/ int otx2_rss_init(struct otx2_nic *pfvf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index afafaec88c8b..f87cfcfc2832 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -22,6 +22,7 @@ #include "otx2_txrx.h" #include "otx2_struct.h" #include "otx2_ptp.h" +#include "cn10k.h" #include #define DRV_NAME "rvu_nicpf" @@ -46,39 +47,6 @@ enum { static int otx2_config_hw_tx_tstamp(struct otx2_nic *pfvf, bool enable); static int otx2_config_hw_rx_tstamp(struct otx2_nic *pfvf, bool enable); -static int cn10k_lmtst_init(struct otx2_nic *pf) -{ - int size, num_lines; - u64 base; - - if (!test_bit(CN10K_LMTST, &pf->hw.cap_flag)) - return 0; - - base = pci_resource_start(pf->pdev, PCI_MBOX_BAR_NUM) + - (MBOX_SIZE * (pf->total_vfs + 1)); - - size = pci_resource_len(pf->pdev, PCI_MBOX_BAR_NUM) - - (MBOX_SIZE * (pf->total_vfs + 1)); - - pf->hw.lmt_base = ioremap(base, size); - - if (!pf->hw.lmt_base) { - dev_err(pf->dev, "Unable to map PF LMTST region\n"); - return -ENOMEM; - } - - /* FIXME: Get the num of LMTST lines from LMT table */ - pf->tot_lmt_lines = size / LMT_LINE_SIZE; - num_lines = (pf->tot_lmt_lines - NIX_LMTID_BASE) / - pf->hw.tx_queues; - /* Number of LMT lines per SQ queues */ - pf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines; - - pf->nix_lmt_size = pf->nix_lmt_lines * LMT_LINE_SIZE; - - return 0; -} - static int otx2_change_mtu(struct net_device *netdev, int new_mtu) { bool if_up = netif_running(netdev); @@ -2404,7 +2372,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_detach_rsrc; - err = cn10k_lmtst_init(pf); + err = cn10k_pf_lmtst_init(pf); if (err) goto err_detach_rsrc; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h index 1e052d76a580..21b811c6ee0f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h @@ -94,6 +94,7 @@ #define NPA_LF_QINTX_INT_W1S(a) (NPA_LFBASE | 0x318 | (a) << 12) #define NPA_LF_QINTX_ENA_W1S(a) (NPA_LFBASE | 0x320 | (a) << 12) #define NPA_LF_QINTX_ENA_W1C(a) (NPA_LFBASE | 0x330 | (a) << 12) +#define NPA_LF_AURA_BATCH_FREE0 (NPA_LFBASE | 0x400) /* NIX LF registers */ #define NIX_LFBASE (BLKTYPE_NIX << RVU_FUNC_BLKADDR_SHIFT) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 68f80e75c8c7..59a7bd88d907 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -17,6 +17,7 @@ #include "otx2_struct.h" #include "otx2_txrx.h" #include "otx2_ptp.h" +#include "cn10k.h" #define CQE_ADDR(CQ, idx) ((CQ)->cqe_base + ((CQ)->cqe_size * (idx))) @@ -199,7 +200,8 @@ static void otx2_free_rcv_seg(struct otx2_nic *pfvf, struct nix_cqe_rx_s *cqe, sg = (struct nix_rx_sg_s *)start; seg_addr = &sg->seg_addr; for (seg = 0; seg < sg->segs; seg++, seg_addr++) - otx2_aura_freeptr(pfvf, qidx, *seg_addr & ~0x07ULL); + pfvf->hw_ops->aura_freeptr(pfvf, qidx, + *seg_addr & ~0x07ULL); start += sizeof(*sg); } } @@ -304,7 +306,6 @@ static int otx2_rx_napi_handler(struct otx2_nic *pfvf, { struct nix_cqe_rx_s *cqe; int processed_cqe = 0; - dma_addr_t bufptr; while (likely(processed_cqe < budget)) { cqe = (struct nix_cqe_rx_s *)CQE_ADDR(cq, cq->cq_head); @@ -330,28 +331,23 @@ static int otx2_rx_napi_handler(struct otx2_nic *pfvf, if (unlikely(!cq->pool_ptrs)) return 0; - /* Refill pool with new buffers */ + pfvf->hw_ops->refill_pool_ptrs(pfvf, cq); + + return processed_cqe; +} + +void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) +{ + struct otx2_nic *pfvf = dev; + dma_addr_t bufptr; + while (cq->pool_ptrs) { - if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, &bufptr))) { - struct refill_work *work; - struct delayed_work *dwork; - - work = &pfvf->refill_wrk[cq->cq_idx]; - dwork = &work->pool_refill_work; - /* Schedule a task if no other task is running */ - if (!cq->refill_task_sched) { - cq->refill_task_sched = true; - schedule_delayed_work(dwork, - msecs_to_jiffies(100)); - } + if (otx2_alloc_buffer(pfvf, cq, &bufptr)) break; - } otx2_aura_freeptr(pfvf, cq->cq_idx, bufptr + OTX2_HEAD_ROOM); cq->pool_ptrs--; } - - return processed_cqe; } static int otx2_tx_napi_handler(struct otx2_nic *pfvf, @@ -438,7 +434,8 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) return workdone; } -static void otx2_sqe_flush(struct otx2_snd_queue *sq, int size) +void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, + int size, int qidx) { u64 status; @@ -796,7 +793,7 @@ static void otx2_sq_append_tso(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, sqe_hdr->sizem1 = (offset / 16) - 1; /* Flush SQE to HW */ - otx2_sqe_flush(sq, offset); + pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx); } } @@ -915,7 +912,7 @@ bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq, netdev_tx_sent_queue(txq, skb->len); /* Flush SQE to HW */ - otx2_sqe_flush(sq, offset); + pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx); return true; } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index 73af15685657..d2b26b3357f3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -114,6 +114,7 @@ struct otx2_cq_poll { struct otx2_pool { struct qmem *stack; struct qmem *fc_addr; + u64 *lmt_addr; u16 rbsize; }; @@ -156,4 +157,10 @@ static inline u64 otx2_iova_to_phys(void *iommu_domain, dma_addr_t dma_addr) int otx2_napi_handler(struct napi_struct *napi, int budget); bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq, struct sk_buff *skb, u16 qidx); +void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, + int size, int qidx); +void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, + int size, int qidx); +void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); #endif /* OTX2_TXRX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 9ed850b75d59..31e03253e612 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -7,6 +7,7 @@ #include "otx2_common.h" #include "otx2_reg.h" +#include "cn10k.h" #define DRV_NAME "rvu_nicvf" #define DRV_STRING "Marvell RVU NIC Virtual Function Driver" @@ -27,31 +28,6 @@ enum { RVU_VF_INT_VEC_MBOX = 0x0, }; -static int cn10k_lmtst_init(struct otx2_nic *vf) -{ - int size, num_lines; - - if (!test_bit(CN10K_LMTST, &vf->hw.cap_flag)) - return 0; - - size = pci_resource_len(vf->pdev, PCI_MBOX_BAR_NUM); - vf->hw.lmt_base = ioremap_wc(pci_resource_start(vf->pdev, - PCI_MBOX_BAR_NUM), - size); - if (!vf->hw.lmt_base) { - dev_err(vf->dev, "Unable to map VF LMTST region\n"); - return -ENOMEM; - } - - vf->tot_lmt_lines = size / LMT_LINE_SIZE; - /* LMTST lines per SQ */ - num_lines = (vf->tot_lmt_lines - NIX_LMTID_BASE) / - vf->hw.tx_queues; - vf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines; - vf->nix_lmt_size = vf->nix_lmt_lines * LMT_LINE_SIZE; - return 0; -} - static void otx2vf_process_vfaf_mbox_msg(struct otx2_nic *vf, struct mbox_msghdr *msg) { @@ -585,7 +561,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_detach_rsrc; - err = cn10k_lmtst_init(vf); + err = cn10k_vf_lmtst_init(vf); if (err) goto err_detach_rsrc; diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h index ae2279fe830a..28c04d918f0f 100644 --- a/include/linux/soc/marvell/octeontx2/asm.h +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -22,8 +22,16 @@ : [rs]"r" (ioaddr)); \ (result); \ }) +#define cn10k_lmt_flush(val, addr) \ +({ \ + __asm__ volatile(".cpu generic+lse\n" \ + "steor %x[rf],[%[rs]]" \ + : [rf]"+r"(val) \ + : [rs]"r"(addr)); \ +}) #else #define otx2_lmt_flush(ioaddr) ({ 0; }) +#define cn10k_lmt_flush(val, addr) ({ addr = val; }) #endif #endif /* __SOC_OTX2_ASM_H */ -- cgit v1.2.3 From 07881ccbf40cc7893869f3f170301889ddca54ac Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 10 Feb 2021 12:14:02 +0100 Subject: bpf: Be less specific about socket cookies guarantees Since "92acdc58ab11 bpf, net: Rework cookie generator as per-cpu one" socket cookies are not guaranteed to be non-decreasing. The bpf_get_socket_cookie helper descriptions are currently specifying that cookies are non-decreasing but we don't want users to rely on that. Reported-by: Daniel Borkmann Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210210111406.785541-1-revest@chromium.org --- include/uapi/linux/bpf.h | 8 ++++---- tools/include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c547ad1ffe43..dbf10bf08582 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1656,22 +1656,22 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c547ad1ffe43..dbf10bf08582 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1656,22 +1656,22 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return -- cgit v1.2.3 From c5dbb89fc2ac013afe67b9e4fcb3743c02b567cd Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 10 Feb 2021 12:14:03 +0100 Subject: bpf: Expose bpf_get_socket_cookie to tracing programs This needs a new helper that: - can work in a sleepable context (using sock_gen_cookie) - takes a struct sock pointer and checks that it's not NULL Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210111406.785541-2-revest@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 12 ++++++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1c8ea682c007..cccaef1088ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1885,6 +1885,7 @@ extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; +extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dbf10bf08582..07cc2e404291 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1673,6 +1673,14 @@ union bpf_attr { * Return * A 8-byte long unique number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a..845b2168e006 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1760,6 +1760,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index e15d4741719a..57aaed478362 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4631,6 +4631,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dbf10bf08582..07cc2e404291 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1673,6 +1673,14 @@ union bpf_attr { * Return * A 8-byte long unique number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket -- cgit v1.2.3 From 3b23a32a63219f51a5298bc55a65ecee866e79d0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 11 Feb 2021 11:34:10 -0800 Subject: net: fix dev_ifsioc_locked() race condition dev_ifsioc_locked() is called with only RCU read lock, so when there is a parallel writer changing the mac address, it could get a partially updated mac address, as shown below: Thread 1 Thread 2 // eth_commit_mac_addr_change() memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); // dev_ifsioc_locked() memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,...); Close this race condition by guarding them with a RW semaphore, like netdev_get_name(). We can not use seqlock here as it does not allow blocking. The writers already take RTNL anyway, so this does not affect the slow path. To avoid bothering existing dev_set_mac_address() callers in drivers, introduce a new wrapper just for user-facing callers on ioctl and rtnetlink paths. Note, bonding also changes slave mac addresses but that requires a separate patch due to the complexity of bonding code. Fixes: 3710becf8a58 ("net: RCU locking for simple ioctl()") Reported-by: "Gong, Sishuai" Cc: Eric Dumazet Cc: Jakub Kicinski Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/tap.c | 7 +++---- drivers/net/tun.c | 5 ++--- include/linux/netdevice.h | 3 +++ net/core/dev.c | 42 ++++++++++++++++++++++++++++++++++++++++++ net/core/dev_ioctl.c | 20 +++++++------------- net/core/rtnetlink.c | 2 +- 6 files changed, 58 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index ff4aa35979a1..8e3a28ba6b28 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1090,10 +1090,9 @@ static long tap_ioctl(struct file *file, unsigned int cmd, return -ENOLINK; } ret = 0; - u = tap->dev->type; + dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || - copy_to_user(&ifr->ifr_hwaddr.sa_data, tap->dev->dev_addr, ETH_ALEN) || - put_user(u, &ifr->ifr_hwaddr.sa_family)) + copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); @@ -1108,7 +1107,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, rtnl_unlock(); return -ENOLINK; } - ret = dev_set_mac_address(tap->dev, &sa, NULL); + ret = dev_set_mac_address_user(tap->dev, &sa, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 62690baa19bc..fc86da7f1628 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3107,15 +3107,14 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case SIOCGIFHWADDR: /* Get hw address */ - memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN); - ifr.ifr_hwaddr.sa_family = tun->dev->type; + dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ - ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr, NULL); + ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a20310ff5083..bfadf3b82f9c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3902,6 +3902,9 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack); +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); diff --git a/net/core/dev.c b/net/core/dev.c index 321d41a110e7..ce6291bc2e16 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8937,6 +8937,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); +static DECLARE_RWSEM(dev_addr_sem); + +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + ret = dev_set_mac_address(dev, sa, extack); + up_write(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_get_mac_address); + /** * dev_change_carrier - Change device carrier * @dev: device diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index db8a0ff86f36..478d032f34ac 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -123,17 +123,6 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm ifr->ifr_mtu = dev->mtu; return 0; - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, - sizeof(ifr->ifr_hwaddr.sa_data)); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof(ifr->ifr_hwaddr.sa_data), - (size_t)dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - case SIOCGIFSLAVE: err = -EINVAL; break; @@ -274,7 +263,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -418,6 +407,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c */ switch (cmd) { + case SIOCGIFHWADDR: + dev_load(net, ifr->ifr_name); + ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + if (colon) + *colon = ':'; + return ret; /* * These ioctl calls: * - can be done by all. @@ -427,7 +422,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: - case SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c313aaf2bce1..0edc0b2baaa4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2660,7 +2660,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa, extack); + err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; -- cgit v1.2.3 From 3c5a2fd042d0bfac71a2dfb99515723d318df47b Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 11 Feb 2021 13:21:07 -0800 Subject: tcp: Sanitize CMSG flags and reserved args in tcp_zerocopy_receive. Explicitly define reserved field and require it and any subsequent fields to be zero-valued for now. Additionally, limit the valid CMSG flags that tcp_zerocopy_receive accepts. Fixes: 7eeba1706eba ("tcp: Add receive timestamp support for receive zerocopy.") Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Suggested-by: David Ahern Suggested-by: Leon Romanovsky Suggested-by: Jakub Kicinski Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 2 +- net/ipv4/tcp.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 42fc5a640df4..8fc09e8638b3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -357,6 +357,6 @@ struct tcp_zerocopy_receive { __u64 msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags; - /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ + __u32 reserved; /* set to 0 for now */ }; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e1a17c6b473c..9896ca10bb34 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2030,6 +2030,7 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, err); } +#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); static void tcp_zc_finalize_rx_tstamp(struct sock *sk, @@ -4152,13 +4153,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; if (len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; - if (len > sizeof(zc)) { + if (unlikely(len > sizeof(zc))) { + err = check_zeroed_user(optval + sizeof(zc), + len - sizeof(zc)); + if (err < 1) + return err == 0 ? -EINVAL : err; len = sizeof(zc); if (put_user(len, optlen)) return -EFAULT; } if (copy_from_user(&zc, optval, len)) return -EFAULT; + if (zc.reserved) + return -EINVAL; + if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) + return -EINVAL; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); release_sock(sk); -- cgit v1.2.3 From 4e1beecc3b586e5d05401c0a9e456f96aab0e5a4 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 11 Feb 2021 13:35:51 +0200 Subject: net/sock: Add kernel config SOCK_RX_QUEUE_MAPPING Use a new config SOCK_RX_QUEUE_MAPPING to compile-in the socket RX queue field and logic, instead of the XPS config. This breaks dependency in XPS, and allows selecting it from non-XPS use cases, as we do in the next patch. In addition, use the new flag to wrap the logic in sk_rx_queue_get() and protect access to the sk_rx_queue_mapping field, while keeping the function exposed unconditionally, just like sk_rx_queue_set() and sk_rx_queue_clear(). Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Signed-off-by: David S. Miller --- include/net/sock.h | 12 ++++++------ net/Kconfig | 4 ++++ net/core/filter.c | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 690e496a0e79..855c068c6c86 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -226,7 +226,7 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { @@ -356,7 +356,7 @@ struct sock { #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif @@ -1838,7 +1838,7 @@ static inline int sk_tx_queue_get(const struct sock *sk) static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); @@ -1852,20 +1852,20 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) static inline void sk_rx_queue_clear(struct sock *sk) { -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; #endif } -#ifdef CONFIG_XPS static inline int sk_rx_queue_get(const struct sock *sk) { +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) return sk->sk_rx_queue_mapping; +#endif return -1; } -#endif static inline void sk_set_socket(struct sock *sk, struct socket *sock) { diff --git a/net/Kconfig b/net/Kconfig index f4c32d982af6..8cea808ad9e8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -256,9 +256,13 @@ config RFS_ACCEL select CPU_RMAP default y +config SOCK_RX_QUEUE_MAPPING + bool + config XPS bool depends on SMP + select SOCK_RX_QUEUE_MAPPING default y config HWBM diff --git a/net/core/filter.c b/net/core/filter.c index 74bd401bf483..3b728ab79a61 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8814,7 +8814,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, -- cgit v1.2.3 From b6db0f899a16a23f5a9ea6c8b0fafc7bbd38e03d Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 4 Feb 2021 06:46:10 -0800 Subject: cfg80211/mac80211: Support disabling HE mode Allow user to disable HE mode, similar to how VHT and HT can be disabled. Useful for testing. Signed-off-by: Ben Greear Link: https://lore.kernel.org/r/20210204144610.25971-1-greearb@candelatech.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 2 ++ net/mac80211/mlme.c | 3 +++ net/wireless/nl80211.c | 7 +++++++ 4 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4cdd75449d73..911fae42b0c0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2583,12 +2583,14 @@ struct cfg80211_auth_request { * authentication capability. Drivers can offload authentication to * userspace if this flag is set. Only applicable for cfg80211_connect() * request (connect callback). + * @ASSOC_REQ_DISABLE_HE: Disable HE */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), ASSOC_REQ_DISABLE_VHT = BIT(1), ASSOC_REQ_USE_RRM = BIT(2), CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), + ASSOC_REQ_DISABLE_HE = BIT(4), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 40832d13c2f1..5188fe581efc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3045,6 +3045,8 @@ enum nl80211_attrs { NL80211_ATTR_SAR_SPEC, + NL80211_ATTR_DISABLE_HE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0e4d950cf907..2e33a1263518 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5754,6 +5754,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (req->flags & ASSOC_REQ_DISABLE_VHT) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + if (req->flags & ASSOC_REQ_DISABLE_HE) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + err = ieee80211_prep_connection(sdata, req->bss, true, override); if (err) goto err_clear; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3b45a9593e71..521d36bb0803 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -752,6 +752,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NL80211_SAE_PWE_BOTH), [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT }, [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy), + [NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -10019,6 +10020,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT])) req.flags |= ASSOC_REQ_DISABLE_VHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) + req.flags |= ASSOC_REQ_DISABLE_HE; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -10802,6 +10806,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT])) connect.flags |= ASSOC_REQ_DISABLE_VHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) + connect.flags |= ASSOC_REQ_DISABLE_HE; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), -- cgit v1.2.3 From 735a48481cca453525d9199772f9c3733a47cff4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Feb 2021 10:50:23 +0100 Subject: nl80211: add documentation for HT/VHT/HE disable attributes These were missed earlier, add the necessary documentation and, while at it, clarify it. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20210212105023.895c3389f063.I46dea3bfc64385bc6f600c50d294007510994f8f@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5188fe581efc..ac78da99fccd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1963,8 +1963,15 @@ enum nl80211_commands { * @NL80211_ATTR_PROBE_RESP: Probe Response template data. Contains the entire * probe-response frame. The DA field in the 802.11 header is zero-ed out, * to be filled by the FW. - * @NL80211_ATTR_DISABLE_HT: Force HT capable interfaces to disable - * this feature. Currently, only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_HT: Force HT capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_VHT: Force VHT capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_HE: Force HE capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. * @NL80211_ATTR_HT_CAPABILITY_MASK: Specify which bits of the * ATTR_HT_CAPABILITY to which attention should be paid. * Currently, only mac80211 NICs support this feature. -- cgit v1.2.3 From e1850ea9bd9eca3656820b4875967d6f9c11c237 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Feb 2021 14:38:19 +0100 Subject: bpf: bpf_fib_lookup return MTU value as output when looked up The BPF-helpers for FIB lookup (bpf_xdp_fib_lookup and bpf_skb_fib_lookup) can perform MTU check and return BPF_FIB_LKUP_RET_FRAG_NEEDED. The BPF-prog don't know the MTU value that caused this rejection. If the BPF-prog wants to implement PMTU (Path MTU Discovery) (rfc1191) it need to know this MTU value for the ICMP packet. Patch change lookup and result struct bpf_fib_lookup, to contain this MTU value as output via a union with 'tot_len' as this is the value used for the MTU lookup. V5: - Fixed uninit value spotted by Dan Carpenter. - Name struct output member mtu_result Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161287789952.790810.13134700381067698781.stgit@firesoul --- include/uapi/linux/bpf.h | 11 +++++++++-- net/core/filter.c | 22 +++++++++++++++------- tools/include/uapi/linux/bpf.h | 11 +++++++++-- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 07cc2e404291..6b1f6058cccf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2239,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -4990,9 +4993,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index 1fec2e998e37..e7a9b1667dd6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5301,12 +5301,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, const struct neighbour *neigh, - const struct net_device *dev) + const struct net_device *dev, u32 mtu) { memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + if (mtu) + params->mtu_result = mtu; /* union with tot_len */ return 0; } @@ -5322,8 +5324,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct net_device *dev; struct fib_result res; struct flowi4 fl4; + u32 mtu = 0; int err; - u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -5390,8 +5392,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } nhc = res.nhc; @@ -5425,7 +5429,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5442,7 +5446,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif, err; - u32 mtu; + u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -5517,8 +5521,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } if (res.nh->fib_nh_lws) @@ -5538,7 +5544,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5614,6 +5620,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; + + params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 07cc2e404291..6b1f6058cccf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2239,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -4990,9 +4993,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ -- cgit v1.2.3 From 34b2021cc61642d61c3cf943d9e71925b827941b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Feb 2021 14:38:24 +0100 Subject: bpf: Add BPF-helper for MTU checking This BPF-helper bpf_check_mtu() works for both XDP and TC-BPF programs. The SKB object is complex and the skb->len value (accessible from BPF-prog) also include the length of any extra GRO/GSO segments, but without taking into account that these GRO/GSO segments get added transport (L4) and network (L3) headers before being transmitted. Thus, this BPF-helper is created such that the BPF-programmer don't need to handle these details in the BPF-prog. The API is designed to help the BPF-programmer, that want to do packet context size changes, which involves other helpers. These other helpers usually does a delta size adjustment. This helper also support a delta size (len_diff), which allow BPF-programmer to reuse arguments needed by these other helpers, and perform the MTU check prior to doing any actual size adjustment of the packet context. It is on purpose, that we allow the len adjustment to become a negative result, that will pass the MTU check. This might seem weird, but it's not this helpers responsibility to "catch" wrong len_diff adjustments. Other helpers will take care of these checks, if BPF-programmer chooses to do actual size adjustment. V14: - Improve man-page desc of len_diff. V13: - Enforce flag BPF_MTU_CHK_SEGS cannot use len_diff. V12: - Simplify segment check that calls skb_gso_validate_network_len. - Helpers should return long V9: - Use dev->hard_header_len (instead of ETH_HLEN) - Annotate with unlikely req from Daniel - Fix logic error using skb_gso_validate_network_len from Daniel V6: - Took John's advice and dropped BPF_MTU_CHK_RELAX - Returned MTU is kept at L3-level (like fib_lookup) V4: Lot of changes - ifindex 0 now use current netdev for MTU lookup - rename helper from bpf_mtu_check to bpf_check_mtu - fix bug for GSO pkt length (as skb->len is total len) - remove __bpf_len_adj_positive, simply allow negative len adj Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161287790461.790810.3429728639563297353.stgit@firesoul --- include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++ net/core/filter.c | 114 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b1f6058cccf..4c24daa43bac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3847,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4012,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5045,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index e7a9b1667dd6..439f43f00483 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5637,6 +5637,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int skb_len, dev_len; + int mtu; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + + if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + dev_len = mtu + dev->hard_header_len; + skb_len = skb->len + len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. + */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { @@ -7222,6 +7332,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7291,6 +7403,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b1f6058cccf..4c24daa43bac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3847,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4012,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5045,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ -- cgit v1.2.3 From 5f7d57280c1982d993d5f4ff0edac310f820f607 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Feb 2021 14:38:29 +0100 Subject: bpf: Drop MTU check when doing TC-BPF redirect to ingress The use-case for dropping the MTU check when TC-BPF does redirect to ingress, is described by Eyal Birger in email[0]. The summary is the ability to increase packet size (e.g. with IPv6 headers for NAT64) and ingress redirect packet and let normal netstack fragment packet as needed. [0] https://lore.kernel.org/netdev/CAHsH6Gug-hsLGHQ6N0wtixdOa85LDZ3HNRHVd0opR=19Qo4W4Q@mail.gmail.com/ V15: - missing static for function declaration V9: - Make net_device "up" (IFF_UP) check explicit in skb_do_redirect V4: - Keep net_device "up" (IFF_UP) check. - Adjustment to handle bpf_redirect_peer() helper Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161287790971.790810.11785274340154740591.stgit@firesoul --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++-- net/core/dev.c | 32 ++++++++++++++------------------ net/core/filter.c | 6 +++--- 3 files changed, 47 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef517254367d..b9bcbfde7849 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3931,14 +3931,42 @@ int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); +static __always_inline bool __is_skb_forwardable(const struct net_device *dev, + const struct sk_buff *skb, + const bool check_mtu) +{ + const u32 vlan_hdr_len = 4; /* VLAN_HLEN */ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + if (!check_mtu) + return true; + + len = dev->mtu + dev->hard_header_len + vlan_hdr_len; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + static __always_inline int ____dev_forward_skb(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { + unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; diff --git a/net/core/dev.c b/net/core/dev.c index 07a0347c33fb..8c820fe0eff3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2194,28 +2194,14 @@ static inline void net_timestamp_set(struct sk_buff *skb) bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; - - if (!(dev->flags & IFF_UP)) - return false; - - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; - - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; - - return false; + return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); -int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); @@ -2224,6 +2210,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return ret; } + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} EXPORT_SYMBOL_GPL(__dev_forward_skb); /** @@ -2250,6 +2241,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) diff --git a/net/core/filter.c b/net/core/filter.c index 439f43f00483..7059cf604d94 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2083,13 +2083,13 @@ static const struct bpf_func_proto bpf_csum_level_proto = { static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { - return dev_forward_skb(dev, skb); + return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; @@ -2480,7 +2480,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; dev = ops->ndo_get_peer_dev(dev); if (unlikely(!dev || - !is_skb_forwardable(dev, skb) || + !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; -- cgit v1.2.3 From 4d54cc32112d8d8b0667559c9309f1a6f764f70b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Feb 2021 15:59:59 -0800 Subject: mptcp: avoid lock_fast usage in accept path Once event support is added this may need to allocate memory while msk lock is held with softirqs disabled. Not using lock_fast also allows to do the allocation with GFP_KERNEL. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/genetlink.h | 1 + net/mptcp/protocol.c | 5 ++--- net/netlink/genetlink.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index e55ec1597ce7..7cb3fa8310ed 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -14,6 +14,7 @@ */ struct genl_multicast_group { char name[GENL_NAMSIZ]; + u8 flags; }; struct genl_ops; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 56240b87d464..fe6da1b77723 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3260,9 +3260,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; struct sock *newsk = newsock->sk; - bool slowpath; - slowpath = lock_sock_fast(newsk); + lock_sock(newsk); /* PM/worker can now acquire the first subflow socket * lock without racing with listener queue cleanup, @@ -3288,7 +3287,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } - unlock_sock_fast(newsk, slowpath); + release_sock(newsk); } if (inet_csk_listen_poll(ssock->sk)) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c992424e4d63..2d6fdf40df66 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1360,11 +1360,43 @@ static struct genl_family genl_ctrl __ro_after_init = { .netnsok = true, }; +static int genl_bind(struct net *net, int group) +{ + const struct genl_family *family; + unsigned int id; + int ret = 0; + + genl_lock_all(); + + idr_for_each_entry(&genl_fam_idr, family, id) { + const struct genl_multicast_group *grp; + int i; + + if (family->n_mcgrps == 0) + continue; + + i = group - family->mcgrp_offset; + if (i < 0 || i >= family->n_mcgrps) + continue; + + grp = &family->mcgrps[i]; + if ((grp->flags & GENL_UNS_ADMIN_PERM) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) + ret = -EPERM; + + break; + } + + genl_unlock_all(); + return ret; +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, + .bind = genl_bind, }; /* we'll bump the group number right afterwards */ -- cgit v1.2.3 From b911c97c7dc771633c68ea9b8f15070f8af3d323 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Feb 2021 16:00:01 -0800 Subject: mptcp: add netlink event support Allow userspace (mptcpd) to subscribe to mptcp genl multicast events. This implementation reuses the same event API as the mptcp kernel fork to ease integration of existing tools, e.g. mptcpd. Supported events include: 1. start and close of an mptcp connection 2. start and close of subflows (joins) 3. announce and withdrawals of addresses 4. subflow priority (backup/non-backup) change. Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 74 +++++++++++++ net/mptcp/pm.c | 20 +++- net/mptcp/pm_netlink.c | 261 ++++++++++++++++++++++++++++++++++++++++++++- net/mptcp/protocol.c | 10 +- net/mptcp/protocol.h | 6 ++ 5 files changed, 364 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 3674a451a18c..c91578aaab32 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -36,6 +36,7 @@ enum { /* netlink interface */ #define MPTCP_PM_NAME "mptcp_pm" #define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" +#define MPTCP_PM_EV_GRP_NAME "mptcp_pm_events" #define MPTCP_PM_VER 0x1 /* @@ -104,4 +105,77 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; }; +/* + * MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A new MPTCP connection has been created. It is the good time to allocate + * memory and send ADD_ADDR if needed. Depending on the traffic-patterns + * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + * + * MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A MPTCP connection is established (can start new subflows). + * + * MPTCP_EVENT_CLOSED: token + * A MPTCP connection has stopped. + * + * MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] + * A new address has been announced by the peer. + * + * MPTCP_EVENT_REMOVED: token, rem_id + * An address has been lost by the peer. + * + * MPTCP_EVENT_SUB_ESTABLISHED: token, family, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, + * if_idx [, error] + * A new subflow has been established. 'error' should not be set. + * + * MPTCP_EVENT_SUB_CLOSED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup, if_idx [, error] + * A subflow has been closed. An error (copy of sk_err) could be set if an + * error has been detected for this subflow. + * + * MPTCP_EVENT_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup, if_idx [, error] + * The priority of a subflow has changed. 'error' should not be set. + */ +enum mptcp_event_type { + MPTCP_EVENT_UNSPEC = 0, + MPTCP_EVENT_CREATED = 1, + MPTCP_EVENT_ESTABLISHED = 2, + MPTCP_EVENT_CLOSED = 3, + + MPTCP_EVENT_ANNOUNCED = 6, + MPTCP_EVENT_REMOVED = 7, + + MPTCP_EVENT_SUB_ESTABLISHED = 10, + MPTCP_EVENT_SUB_CLOSED = 11, + + MPTCP_EVENT_SUB_PRIORITY = 13, +}; + +enum mptcp_event_attr { + MPTCP_ATTR_UNSPEC = 0, + + MPTCP_ATTR_TOKEN, /* u32 */ + MPTCP_ATTR_FAMILY, /* u16 */ + MPTCP_ATTR_LOC_ID, /* u8 */ + MPTCP_ATTR_REM_ID, /* u8 */ + MPTCP_ATTR_SADDR4, /* be32 */ + MPTCP_ATTR_SADDR6, /* struct in6_addr */ + MPTCP_ATTR_DADDR4, /* be32 */ + MPTCP_ATTR_DADDR6, /* struct in6_addr */ + MPTCP_ATTR_SPORT, /* be16 */ + MPTCP_ATTR_DPORT, /* be16 */ + MPTCP_ATTR_BACKUP, /* u8 */ + MPTCP_ATTR_ERROR, /* u8 */ + MPTCP_ATTR_FLAGS, /* u16 */ + MPTCP_ATTR_TIMEOUT, /* u32 */ + MPTCP_ATTR_IF_IDX, /* s32 */ + + __MPTCP_ATTR_AFTER_LAST +}; + +#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) + #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 1dd0e9d7ed06..6fd4b2c1b076 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -75,6 +75,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); WRITE_ONCE(pm->server_side, server_side); + mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); } bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) @@ -122,13 +123,10 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct mptcp_pm_data *pm = &msk->pm; + bool announce = false; pr_debug("msk=%p", msk); - /* try to avoid acquiring the lock below */ - if (!READ_ONCE(pm->work_pending)) - return; - spin_lock_bh(&pm->lock); /* mptcp_pm_fully_established() can be invoked by multiple @@ -138,9 +136,15 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, if (READ_ONCE(pm->work_pending) && !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); - msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); + if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) + announce = true; + + msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); + + if (announce) + mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) @@ -179,6 +183,8 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, READ_ONCE(pm->accept_addr)); + mptcp_event_addr_announced(msk, addr); + spin_lock_bh(&pm->lock); if (!READ_ONCE(pm->accept_addr)) { @@ -205,6 +211,8 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) pr_debug("msk=%p remote_id=%d", msk, rm_id); + mptcp_event_addr_removed(msk, rm_id); + spin_lock_bh(&pm->lock); mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); pm->rm_id = rm_id; @@ -217,6 +225,8 @@ void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); subflow->backup = bkup; + + mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); } /* path manager helpers */ diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c3abff40fa4e..229fd1af2e29 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -860,10 +860,14 @@ void mptcp_pm_nl_data_init(struct mptcp_sock *msk) WRITE_ONCE(pm->accept_subflow, subflows); } -#define MPTCP_PM_CMD_GRP_OFFSET 0 +#define MPTCP_PM_CMD_GRP_OFFSET 0 +#define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, + [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, + .flags = GENL_UNS_ADMIN_PERM, + }, }; static const struct nla_policy @@ -1482,6 +1486,261 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) return 0; } +static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) +{ + genlmsg_multicast_netns(&mptcp_genl_family, net, + nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); +} + +static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) +{ + const struct inet_sock *issk = inet_sk(ssk); + const struct mptcp_subflow_context *sf; + + if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) + return -EMSGSIZE; + + switch (ssk->sk_family) { + case AF_INET: + if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) + return -EMSGSIZE; + if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) + return -EMSGSIZE; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *np = inet6_sk(ssk); + + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + return -EMSGSIZE; + if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) + return -EMSGSIZE; + break; + } +#endif + default: + WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) + return -EMSGSIZE; + if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) + return -EMSGSIZE; + + sf = mptcp_subflow_ctx(ssk); + if (WARN_ON_ONCE(!sf)) + return -EINVAL; + + if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id)) + return -EMSGSIZE; + + if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) + return -EMSGSIZE; + + return 0; +} + +static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + const struct sock *sk = (const struct sock *)msk; + const struct mptcp_subflow_context *sf; + u8 sk_err; + + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + return -EMSGSIZE; + + if (mptcp_event_add_subflow(skb, ssk)) + return -EMSGSIZE; + + sf = mptcp_subflow_ctx(ssk); + if (WARN_ON_ONCE(!sf)) + return -EINVAL; + + if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) + return -EMSGSIZE; + + if (ssk->sk_bound_dev_if && + nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) + return -EMSGSIZE; + + sk_err = ssk->sk_err; + if (sk_err && sk->sk_state == TCP_ESTABLISHED && + nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) + return -EMSGSIZE; + + return 0; +} + +static int mptcp_event_sub_established(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + return mptcp_event_put_token_and_ssk(skb, msk, ssk); +} + +static int mptcp_event_sub_closed(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) + return -EMSGSIZE; + + return 0; +} + +static int mptcp_event_created(struct sk_buff *skb, + const struct mptcp_sock *msk, + const struct sock *ssk) +{ + int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token); + + if (err) + return err; + + return mptcp_event_add_subflow(skb, ssk); +} + +void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) +{ + struct net *net = sock_net((const struct sock *)msk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); + if (!nlh) + goto nla_put_failure; + + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + goto nla_put_failure; + + if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) + goto nla_put_failure; + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); + return; + +nla_put_failure: + kfree_skb(skb); +} + +void mptcp_event_addr_announced(const struct mptcp_sock *msk, + const struct mptcp_addr_info *info) +{ + struct net *net = sock_net((const struct sock *)msk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, + MPTCP_EVENT_ANNOUNCED); + if (!nlh) + goto nla_put_failure; + + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + goto nla_put_failure; + + if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) + goto nla_put_failure; + + if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port)) + goto nla_put_failure; + + switch (info->family) { + case AF_INET: + if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) + goto nla_put_failure; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: + if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) + goto nla_put_failure; + break; +#endif + default: + WARN_ON_ONCE(1); + goto nla_put_failure; + } + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); + return; + +nla_put_failure: + kfree_skb(skb); +} + +void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, + const struct sock *ssk, gfp_t gfp) +{ + struct net *net = sock_net((const struct sock *)msk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); + if (!nlh) + goto nla_put_failure; + + switch (type) { + case MPTCP_EVENT_UNSPEC: + WARN_ON_ONCE(1); + break; + case MPTCP_EVENT_CREATED: + case MPTCP_EVENT_ESTABLISHED: + if (mptcp_event_created(skb, msk, ssk) < 0) + goto nla_put_failure; + break; + case MPTCP_EVENT_CLOSED: + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0) + goto nla_put_failure; + break; + case MPTCP_EVENT_ANNOUNCED: + case MPTCP_EVENT_REMOVED: + /* call mptcp_event_addr_announced()/removed instead */ + WARN_ON_ONCE(1); + break; + case MPTCP_EVENT_SUB_ESTABLISHED: + case MPTCP_EVENT_SUB_PRIORITY: + if (mptcp_event_sub_established(skb, msk, ssk) < 0) + goto nla_put_failure; + break; + case MPTCP_EVENT_SUB_CLOSED: + if (mptcp_event_sub_closed(skb, msk, ssk) < 0) + goto nla_put_failure; + break; + } + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, gfp); + return; + +nla_put_failure: + kfree_skb(skb); +} + static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_ADD_ADDR, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index fe6da1b77723..c2a8392254dc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2150,6 +2150,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { + if (sk->sk_state == TCP_ESTABLISHED) + mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); __mptcp_close_ssk(sk, ssk, subflow); } @@ -2586,6 +2588,10 @@ cleanup: release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); + + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + sock_put(sk); } @@ -3057,7 +3063,7 @@ bool mptcp_finish_join(struct sock *ssk) return false; if (!msk->pm.server_side) - return true; + goto out; if (!mptcp_pm_allow_new_subflow(msk)) return false; @@ -3084,6 +3090,8 @@ bool mptcp_finish_join(struct sock *ssk) if (parent_sock && !ssk->sk_socket) mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); +out: + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f620e2f98d19..d31edbae8da8 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -10,6 +10,7 @@ #include #include #include +#include #define MPTCP_SUPPORTED_VERSION 1 @@ -666,6 +667,11 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); +void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, + const struct sock *ssk, gfp_t gfp); +void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info); +void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); + static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); -- cgit v1.2.3 From 4c08c586ff29bda47e3db14da096331d84933f48 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 12 Feb 2021 17:15:51 +0200 Subject: net: switchdev: propagate extack to port attributes When a struct switchdev_attr is notified through switchdev, there is no way to report informational messages, unlike for struct switchdev_obj. Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Reviewed-by: Florian Fainelli Reviewed-by: Nikolay Aleksandrov Reviewed-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/prestera/prestera_switchdev.c | 3 ++- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 3 ++- drivers/net/ethernet/mscc/ocelot_net.c | 3 ++- drivers/net/ethernet/ti/am65-cpsw-switchdev.c | 3 ++- drivers/net/ethernet/ti/cpsw_switchdev.c | 3 ++- include/net/switchdev.h | 6 ++++-- net/dsa/slave.c | 3 ++- net/switchdev/switchdev.c | 11 ++++++++--- 8 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 8c2b03151736..2c1619715a4b 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -695,7 +695,8 @@ err_port_stp_set: } static int prestera_port_obj_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { struct prestera_port *port = netdev_priv(dev); int err = 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 20c4f3c2cf23..18e4f1cd5587 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -887,7 +887,8 @@ mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); int err; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 8f12fa45b1b5..f9da4aa39444 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1005,7 +1005,8 @@ static void ocelot_port_attr_mc_set(struct ocelot *ocelot, int port, bool mc) } static int ocelot_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot *ocelot = priv->port.ocelot; diff --git a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c index 1067e7772dbf..314825acf0a0 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c +++ b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c @@ -81,7 +81,8 @@ static int am65_cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, } static int am65_cpsw_port_attr_set(struct net_device *ndev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { struct am65_cpsw_port *port = am65_ndev_to_port(ndev); int ret; diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index 9967cf985728..13524cbaa8b6 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -83,7 +83,8 @@ static int cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, } static int cpsw_port_attr_set(struct net_device *ndev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { struct cpsw_priv *priv = netdev_priv(ndev); int ret; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 6dcfc4c51a6e..9279d4245bab 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -281,7 +281,8 @@ int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr)); + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack)); #else static inline void switchdev_deferred_process(void) @@ -372,7 +373,8 @@ switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr)) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack)) { return 0; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a95e78d59740..be29008477d3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -272,7 +272,8 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } static int dsa_slave_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int ret; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 94113ca29dcf..0b84f076591e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -488,14 +488,18 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr)) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack)) { + struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; + extack = switchdev_notifier_info_to_extack(&port_attr_info->info); + if (check_cb(dev)) { - err = set_cb(dev, port_attr_info->attr); + err = set_cb(dev, port_attr_info->attr, extack); if (err != -EOPNOTSUPP) port_attr_info->handled = true; return err; @@ -525,7 +529,8 @@ int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, - const struct switchdev_attr *attr)) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack)) { int err; -- cgit v1.2.3 From e18f4c18ab5b0dd47caaf8377c2e36d66f632a8c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 12 Feb 2021 17:15:55 +0200 Subject: net: switchdev: pass flags and mask to both {PRE_,}BRIDGE_FLAGS attributes This switchdev attribute offers a counterproductive API for a driver writer, because although br_switchdev_set_port_flag gets passed a "flags" and a "mask", those are passed piecemeal to the driver, so while the PRE_BRIDGE_FLAGS listener knows what changed because it has the "mask", the BRIDGE_FLAGS listener doesn't, because it only has the final value. But certain drivers can offload only certain combinations of settings, like for example they cannot change unicast flooding independently of multicast flooding - they must be both on or both off. The way the information is passed to switchdev makes drivers not expressive enough, and unable to reject this request ahead of time, in the PRE_BRIDGE_FLAGS notifier, so they are forced to reject it during the deferred BRIDGE_FLAGS attribute, where the rejection is currently ignored. This patch also changes drivers to make use of the "mask" field for edge detection when possible. Signed-off-by: Vladimir Oltean Reviewed-by: Grygorii Strashko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- .../ethernet/marvell/prestera/prestera_switchdev.c | 23 ++++++---- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 50 +++++++++++++--------- drivers/net/ethernet/rocker/rocker_main.c | 10 ++--- drivers/net/ethernet/ti/am65-cpsw-switchdev.c | 24 ++++++----- drivers/net/ethernet/ti/cpsw_switchdev.c | 24 ++++++----- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 34 +++++++++------ include/net/switchdev.h | 7 ++- net/bridge/br_switchdev.c | 6 +-- net/dsa/dsa_priv.h | 6 ++- net/dsa/port.c | 34 ++++++++------- 10 files changed, 129 insertions(+), 89 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 2c1619715a4b..49e052273f30 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -581,7 +581,7 @@ int prestera_bridge_port_event(struct net_device *dev, unsigned long event, static int prestera_port_attr_br_flags_set(struct prestera_port *port, struct net_device *dev, - unsigned long flags) + struct switchdev_brport_flags flags) { struct prestera_bridge_port *br_port; int err; @@ -590,15 +590,20 @@ static int prestera_port_attr_br_flags_set(struct prestera_port *port, if (!br_port) return 0; - err = prestera_hw_port_flood_set(port, flags & BR_FLOOD); - if (err) - return err; + if (flags.mask & BR_FLOOD) { + err = prestera_hw_port_flood_set(port, flags.val & BR_FLOOD); + if (err) + return err; + } - err = prestera_hw_port_learning_set(port, flags & BR_LEARNING); - if (err) - return err; + if (flags.mask & BR_LEARNING) { + err = prestera_hw_port_learning_set(port, + flags.val & BR_LEARNING); + if (err) + return err; + } - memcpy(&br_port->flags, &flags, sizeof(flags)); + memcpy(&br_port->flags, &flags.val, sizeof(flags.val)); return 0; } @@ -707,7 +712,7 @@ static int prestera_port_obj_attr_set(struct net_device *dev, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: - if (attr->u.brport_flags & + if (attr->u.brport_flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) err = -EINVAL; break; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 18e4f1cd5587..23b7e8d6386b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -653,11 +653,11 @@ err_port_bridge_vlan_learning_set: return err; } -static int mlxsw_sp_port_attr_br_pre_flags_set(struct mlxsw_sp_port - *mlxsw_sp_port, - unsigned long brport_flags) +static int +mlxsw_sp_port_attr_br_pre_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_brport_flags flags) { - if (brport_flags & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) + if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) return -EINVAL; return 0; @@ -665,7 +665,7 @@ static int mlxsw_sp_port_attr_br_pre_flags_set(struct mlxsw_sp_port static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *orig_dev, - unsigned long brport_flags) + struct switchdev_brport_flags flags) { struct mlxsw_sp_bridge_port *bridge_port; int err; @@ -675,29 +675,37 @@ static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, if (!bridge_port) return 0; - err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, - MLXSW_SP_FLOOD_TYPE_UC, - brport_flags & BR_FLOOD); - if (err) - return err; + if (flags.mask & BR_FLOOD) { + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, + bridge_port, + MLXSW_SP_FLOOD_TYPE_UC, + flags.val & BR_FLOOD); + if (err) + return err; + } - err = mlxsw_sp_bridge_port_learning_set(mlxsw_sp_port, bridge_port, - brport_flags & BR_LEARNING); - if (err) - return err; + if (flags.mask & BR_LEARNING) { + err = mlxsw_sp_bridge_port_learning_set(mlxsw_sp_port, + bridge_port, + flags.val & BR_LEARNING); + if (err) + return err; + } if (bridge_port->bridge_device->multicast_enabled) goto out; - err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, - MLXSW_SP_FLOOD_TYPE_MC, - brport_flags & - BR_MCAST_FLOOD); - if (err) - return err; + if (flags.mask & BR_MCAST_FLOOD) { + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, + bridge_port, + MLXSW_SP_FLOOD_TYPE_MC, + flags.val & BR_MCAST_FLOOD); + if (err) + return err; + } out: - memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags)); + memcpy(&bridge_port->flags, &flags.val, sizeof(flags.val)); return 0; } diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 740a715c49c6..3473d296b2e2 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -1576,7 +1576,7 @@ rocker_world_port_attr_bridge_flags_support_get(const struct rocker_port * static int rocker_world_port_attr_pre_bridge_flags_set(struct rocker_port *rocker_port, - unsigned long brport_flags) + struct switchdev_brport_flags flags) { struct rocker_world_ops *wops = rocker_port->rocker->wops; unsigned long brport_flags_s; @@ -1590,7 +1590,7 @@ rocker_world_port_attr_pre_bridge_flags_set(struct rocker_port *rocker_port, if (err) return err; - if (brport_flags & ~brport_flags_s) + if (flags.mask & ~brport_flags_s) return -EINVAL; return 0; @@ -1598,14 +1598,14 @@ rocker_world_port_attr_pre_bridge_flags_set(struct rocker_port *rocker_port, static int rocker_world_port_attr_bridge_flags_set(struct rocker_port *rocker_port, - unsigned long brport_flags) + struct switchdev_brport_flags flags) { struct rocker_world_ops *wops = rocker_port->rocker->wops; if (!wops->port_attr_bridge_flags_set) return -EOPNOTSUPP; - return wops->port_attr_bridge_flags_set(rocker_port, brport_flags); + return wops->port_attr_bridge_flags_set(rocker_port, flags.val); } static int @@ -2058,7 +2058,7 @@ static int rocker_port_attr_set(struct net_device *dev, break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: err = rocker_world_port_attr_pre_bridge_flags_set(rocker_port, - attr->u.brport_flags); + attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: err = rocker_world_port_attr_bridge_flags_set(rocker_port, diff --git a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c index 314825acf0a0..d93ffd8a08b0 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c +++ b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c @@ -55,26 +55,30 @@ static int am65_cpsw_port_stp_state_set(struct am65_cpsw_port *port, u8 state) static int am65_cpsw_port_attr_br_flags_set(struct am65_cpsw_port *port, struct net_device *orig_dev, - unsigned long brport_flags) + struct switchdev_brport_flags flags) { struct am65_cpsw_common *cpsw = port->common; - bool unreg_mcast_add = false; - if (brport_flags & BR_MCAST_FLOOD) - unreg_mcast_add = true; - netdev_dbg(port->ndev, "BR_MCAST_FLOOD: %d port %u\n", - unreg_mcast_add, port->port_id); + if (flags.mask & BR_MCAST_FLOOD) { + bool unreg_mcast_add = false; - cpsw_ale_set_unreg_mcast(cpsw->ale, BIT(port->port_id), - unreg_mcast_add); + if (flags.val & BR_MCAST_FLOOD) + unreg_mcast_add = true; + + netdev_dbg(port->ndev, "BR_MCAST_FLOOD: %d port %u\n", + unreg_mcast_add, port->port_id); + + cpsw_ale_set_unreg_mcast(cpsw->ale, BIT(port->port_id), + unreg_mcast_add); + } return 0; } static int am65_cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, - unsigned long flags) + struct switchdev_brport_flags flags) { - if (flags & ~(BR_LEARNING | BR_MCAST_FLOOD)) + if (flags.mask & ~(BR_LEARNING | BR_MCAST_FLOOD)) return -EINVAL; return 0; diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index 13524cbaa8b6..a72bb570756f 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -57,26 +57,30 @@ static int cpsw_port_stp_state_set(struct cpsw_priv *priv, u8 state) static int cpsw_port_attr_br_flags_set(struct cpsw_priv *priv, struct net_device *orig_dev, - unsigned long brport_flags) + struct switchdev_brport_flags flags) { struct cpsw_common *cpsw = priv->cpsw; - bool unreg_mcast_add = false; - if (brport_flags & BR_MCAST_FLOOD) - unreg_mcast_add = true; - dev_dbg(priv->dev, "BR_MCAST_FLOOD: %d port %u\n", - unreg_mcast_add, priv->emac_port); + if (flags.mask & BR_MCAST_FLOOD) { + bool unreg_mcast_add = false; - cpsw_ale_set_unreg_mcast(cpsw->ale, BIT(priv->emac_port), - unreg_mcast_add); + if (flags.val & BR_MCAST_FLOOD) + unreg_mcast_add = true; + + dev_dbg(priv->dev, "BR_MCAST_FLOOD: %d port %u\n", + unreg_mcast_add, priv->emac_port); + + cpsw_ale_set_unreg_mcast(cpsw->ale, BIT(priv->emac_port), + unreg_mcast_add); + } return 0; } static int cpsw_port_attr_br_flags_pre_set(struct net_device *netdev, - unsigned long flags) + struct switchdev_brport_flags flags) { - if (flags & ~(BR_LEARNING | BR_MCAST_FLOOD)) + if (flags.mask & ~(BR_LEARNING | BR_MCAST_FLOOD)) return -EINVAL; return 0; diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index ca3d07fe7f58..703055e063ff 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -908,31 +908,39 @@ static int dpaa2_switch_port_attr_stp_state_set(struct net_device *netdev, return dpaa2_switch_port_set_stp_state(port_priv, state); } -static int dpaa2_switch_port_attr_br_flags_pre_set(struct net_device *netdev, - unsigned long flags) +static int +dpaa2_switch_port_attr_br_flags_pre_set(struct net_device *netdev, + struct switchdev_brport_flags flags) { - if (flags & ~(BR_LEARNING | BR_FLOOD)) + if (flags.mask & ~(BR_LEARNING | BR_FLOOD)) return -EINVAL; return 0; } -static int dpaa2_switch_port_attr_br_flags_set(struct net_device *netdev, - unsigned long flags) +static int +dpaa2_switch_port_attr_br_flags_set(struct net_device *netdev, + struct switchdev_brport_flags flags) { struct ethsw_port_priv *port_priv = netdev_priv(netdev); int err = 0; - /* Learning is enabled per switch */ - err = dpaa2_switch_set_learning(port_priv->ethsw_data, - !!(flags & BR_LEARNING)); - if (err) - goto exit; + if (flags.mask & BR_LEARNING) { + /* Learning is enabled per switch */ + err = dpaa2_switch_set_learning(port_priv->ethsw_data, + !!(flags.val & BR_LEARNING)); + if (err) + return err; + } - err = dpaa2_switch_port_set_flood(port_priv, !!(flags & BR_FLOOD)); + if (flags.mask & BR_FLOOD) { + err = dpaa2_switch_port_set_flood(port_priv, + !!(flags.val & BR_FLOOD)); + if (err) + return err; + } -exit: - return err; + return 0; } static int dpaa2_switch_port_attr_set(struct net_device *netdev, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 9279d4245bab..25d9e4570934 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -32,6 +32,11 @@ enum switchdev_attr_id { #endif }; +struct switchdev_brport_flags { + unsigned long val; + unsigned long mask; +}; + struct switchdev_attr { struct net_device *orig_dev; enum switchdev_attr_id id; @@ -40,7 +45,7 @@ struct switchdev_attr { void (*complete)(struct net_device *dev, int err, void *priv); union { u8 stp_state; /* PORT_STP_STATE */ - unsigned long brport_flags; /* PORT_{PRE}_BRIDGE_FLAGS */ + struct switchdev_brport_flags brport_flags; /* PORT_BRIDGE_FLAGS */ bool mrouter; /* PORT_MROUTER */ clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index bb21dd35ae85..184cf4c8b06d 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -65,7 +65,6 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, { struct switchdev_attr attr = { .orig_dev = p->dev, - .id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, }; struct switchdev_notifier_port_attr_info info = { .attr = &attr, @@ -76,7 +75,9 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, if (!mask) return 0; - attr.u.brport_flags = mask; + attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS; + attr.u.brport_flags.val = flags; + attr.u.brport_flags.mask = mask; /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, @@ -94,7 +95,6 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; - attr.u.brport_flags = flags; err = switchdev_port_attr_set(p->dev, &attr); if (err) { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7060f128386b..bc835f3de2be 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -183,8 +183,10 @@ int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags); -int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags); +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags); +int dsa_port_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags); int dsa_port_mrouter(struct dsa_port *dp, bool mrouter); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); diff --git a/net/dsa/port.c b/net/dsa/port.c index 1f877bf21bb4..368064dfd93e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -125,21 +125,22 @@ void dsa_port_disable(struct dsa_port *dp) static void dsa_port_change_brport_flags(struct dsa_port *dp, bool bridge_offload) { - unsigned long mask, flags; - int flag, err; + struct switchdev_brport_flags flags; + int flag; - mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; if (bridge_offload) - flags = mask; + flags.val = flags.mask; else - flags = mask & ~BR_LEARNING; + flags.val = flags.mask & ~BR_LEARNING; - for_each_set_bit(flag, &mask, 32) { - err = dsa_port_pre_bridge_flags(dp, BIT(flag)); - if (err) - continue; + for_each_set_bit(flag, &flags.mask, 32) { + struct switchdev_brport_flags tmp; + + tmp.val = flags.val & BIT(flag); + tmp.mask = BIT(flag); - dsa_port_bridge_flags(dp, flags & BIT(flag)); + dsa_port_bridge_flags(dp, tmp); } } @@ -423,26 +424,29 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock) return 0; } -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, unsigned long flags) +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags) { struct dsa_switch *ds = dp->ds; if (!ds->ops->port_egress_floods || - (flags & ~(BR_FLOOD | BR_MCAST_FLOOD))) + (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD))) return -EINVAL; return 0; } -int dsa_port_bridge_flags(const struct dsa_port *dp, unsigned long flags) +int dsa_port_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags) { struct dsa_switch *ds = dp->ds; int port = dp->index; int err = 0; if (ds->ops->port_egress_floods) - err = ds->ops->port_egress_floods(ds, port, flags & BR_FLOOD, - flags & BR_MCAST_FLOOD); + err = ds->ops->port_egress_floods(ds, port, + flags.val & BR_FLOOD, + flags.val & BR_MCAST_FLOOD); return err; } -- cgit v1.2.3 From a8b659e7ff75a6e766bc5691df57ceb26018db9f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 12 Feb 2021 17:15:56 +0200 Subject: net: dsa: act as passthrough for bridge port flags There are multiple ways in which a PORT_BRIDGE_FLAGS attribute can be expressed by the bridge through switchdev, and not all of them can be emulated by DSA mid-layer API at the same time. One possible configuration is when the bridge offloads the port flags using a mask that has a single bit set - therefore only one feature should change. However, DSA currently groups together unicast and multicast flooding in the .port_egress_floods method, which limits our options when we try to add support for turning off broadcast flooding: do we extend .port_egress_floods with a third parameter which b53 and mv88e6xxx will ignore? But that means that the DSA layer, which currently implements the PRE_BRIDGE_FLAGS attribute all by itself, will see that .port_egress_floods is implemented, and will report that all 3 types of flooding are supported - not necessarily true. Another configuration is when the user specifies more than one flag at the same time, in the same netlink message. If we were to create one individual function per offloadable bridge port flag, we would limit the expressiveness of the switch driver of refusing certain combinations of flag values. For example, a switch may not have an explicit knob for flooding of unknown multicast, just for flooding in general. In that case, the only correct thing to do is to allow changes to BR_FLOOD and BR_MCAST_FLOOD in tandem, and never allow mismatched values. But having a separate .port_set_unicast_flood and .port_set_multicast_flood would not allow the driver to possibly reject that. Also, DSA doesn't consider it necessary to inform the driver that a SWITCHDEV_ATTR_ID_BRIDGE_MROUTER attribute was offloaded, because it just calls .port_egress_floods for the CPU port. When we'll add support for the plain SWITCHDEV_ATTR_ID_PORT_MROUTER, that will become a real problem because the flood settings will need to be held statefully in the DSA middle layer, otherwise changing the mrouter port attribute will impact the flooding attribute. And that's _assuming_ that the underlying hardware doesn't have anything else to do when a multicast router attaches to a port than flood unknown traffic to it. If it does, there will need to be a dedicated .port_set_mrouter anyway. So we need to let the DSA drivers see the exact form that the bridge passes this switchdev attribute in, otherwise we are standing in the way. Therefore we also need to use this form of language when communicating to the driver that it needs to configure its initial (before bridge join) and final (after bridge leave) port flags. The b53 and mv88e6xxx drivers are converted to the passthrough API and their implementation of .port_egress_floods is split into two: a function that configures unicast flooding and another for multicast. The mv88e6xxx implementation is quite hairy, and it turns out that the implementations of unknown unicast flooding are actually the same for 6185 and for 6352: behind the confusing names actually lie two individual bits: NO_UNKNOWN_MC -> FLOOD_UC = 0x4 = BIT(2) NO_UNKNOWN_UC -> FLOOD_MC = 0x8 = BIT(3) so there was no reason to entangle them in the first place. Whereas the 6185 writes to MV88E6185_PORT_CTL0_FORWARD_UNKNOWN of PORT_CTL0, which has the exact same bit index. I have left the implementations separate though, for the only reason that the names are different enough to confuse me, since I am not able to double-check with a user manual. The multicast flooding setting for 6185 is in a different register than for 6352 though. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 91 +++++++++++++++------- drivers/net/dsa/b53/b53_priv.h | 2 - drivers/net/dsa/mv88e6xxx/chip.c | 163 ++++++++++++++++++++++++++++++--------- drivers/net/dsa/mv88e6xxx/chip.h | 6 +- drivers/net/dsa/mv88e6xxx/port.c | 52 ++++++------- drivers/net/dsa/mv88e6xxx/port.h | 19 ++--- include/net/dsa.h | 10 ++- net/dsa/dsa_priv.h | 9 ++- net/dsa/port.c | 31 ++++---- net/dsa/slave.c | 7 +- 10 files changed, 263 insertions(+), 127 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 23fc7225c8d1..72c75c7bdb65 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -510,6 +510,39 @@ void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) } EXPORT_SYMBOL(b53_imp_vlan_setup); +static void b53_port_set_ucast_flood(struct b53_device *dev, int port, + bool unicast) +{ + u16 uc; + + b53_read16(dev, B53_CTRL_PAGE, B53_UC_FLOOD_MASK, &uc); + if (unicast) + uc |= BIT(port); + else + uc &= ~BIT(port); + b53_write16(dev, B53_CTRL_PAGE, B53_UC_FLOOD_MASK, uc); +} + +static void b53_port_set_mcast_flood(struct b53_device *dev, int port, + bool multicast) +{ + u16 mc; + + b53_read16(dev, B53_CTRL_PAGE, B53_MC_FLOOD_MASK, &mc); + if (multicast) + mc |= BIT(port); + else + mc &= ~BIT(port); + b53_write16(dev, B53_CTRL_PAGE, B53_MC_FLOOD_MASK, mc); + + b53_read16(dev, B53_CTRL_PAGE, B53_IPMC_FLOOD_MASK, &mc); + if (multicast) + mc |= BIT(port); + else + mc &= ~BIT(port); + b53_write16(dev, B53_CTRL_PAGE, B53_IPMC_FLOOD_MASK, mc); +} + int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; @@ -522,7 +555,8 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) cpu_port = dsa_to_port(ds, port)->cpu_dp->index; - b53_br_egress_floods(ds, port, true, true); + b53_port_set_ucast_flood(dev, port, true); + b53_port_set_mcast_flood(dev, port, true); if (dev->ops->irq_enable) ret = dev->ops->irq_enable(dev, port); @@ -655,7 +689,8 @@ static void b53_enable_cpu_port(struct b53_device *dev, int port) b53_brcm_hdr_setup(dev->ds, port); - b53_br_egress_floods(dev->ds, port, true, true); + b53_port_set_ucast_flood(dev, port, true); + b53_port_set_mcast_flood(dev, port, true); } static void b53_enable_mib(struct b53_device *dev) @@ -1916,37 +1951,37 @@ void b53_br_fast_age(struct dsa_switch *ds, int port) } EXPORT_SYMBOL(b53_br_fast_age); -int b53_br_egress_floods(struct dsa_switch *ds, int port, - bool unicast, bool multicast) +static int b53_br_flags_pre(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { - struct b53_device *dev = ds->priv; - u16 uc, mc; - - b53_read16(dev, B53_CTRL_PAGE, B53_UC_FLOOD_MASK, &uc); - if (unicast) - uc |= BIT(port); - else - uc &= ~BIT(port); - b53_write16(dev, B53_CTRL_PAGE, B53_UC_FLOOD_MASK, uc); + if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD)) + return -EINVAL; - b53_read16(dev, B53_CTRL_PAGE, B53_MC_FLOOD_MASK, &mc); - if (multicast) - mc |= BIT(port); - else - mc &= ~BIT(port); - b53_write16(dev, B53_CTRL_PAGE, B53_MC_FLOOD_MASK, mc); + return 0; +} - b53_read16(dev, B53_CTRL_PAGE, B53_IPMC_FLOOD_MASK, &mc); - if (multicast) - mc |= BIT(port); - else - mc &= ~BIT(port); - b53_write16(dev, B53_CTRL_PAGE, B53_IPMC_FLOOD_MASK, mc); +static int b53_br_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) +{ + if (flags.mask & BR_FLOOD) + b53_port_set_ucast_flood(ds->priv, port, + !!(flags.val & BR_FLOOD)); + if (flags.mask & BR_MCAST_FLOOD) + b53_port_set_mcast_flood(ds->priv, port, + !!(flags.val & BR_MCAST_FLOOD)); return 0; +} +static int b53_set_mrouter(struct dsa_switch *ds, int port, bool mrouter, + struct netlink_ext_ack *extack) +{ + b53_port_set_mcast_flood(ds->priv, port, mrouter); + + return 0; } -EXPORT_SYMBOL(b53_br_egress_floods); static bool b53_possible_cpu_port(struct dsa_switch *ds, int port) { @@ -2187,9 +2222,11 @@ static const struct dsa_switch_ops b53_switch_ops = { .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, + .port_pre_bridge_flags = b53_br_flags_pre, + .port_bridge_flags = b53_br_flags, + .port_set_mrouter = b53_set_mrouter, .port_stp_state_set = b53_br_set_stp_state, .port_fast_age = b53_br_fast_age, - .port_egress_floods = b53_br_egress_floods, .port_vlan_filtering = b53_vlan_filtering, .port_vlan_add = b53_vlan_add, .port_vlan_del = b53_vlan_del, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 0d2cc0453bef..ae72ef46b0b6 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -326,8 +326,6 @@ int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state); void b53_br_fast_age(struct dsa_switch *ds, int port); -int b53_br_egress_floods(struct dsa_switch *ds, int port, - bool unicast, bool multicast); int b53_setup_devlink_resources(struct dsa_switch *ds); void b53_port_event(struct dsa_switch *ds, int port); void b53_phylink_validate(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index ae0b490f00cd..0ef1fadfec68 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2434,12 +2434,20 @@ static int mv88e6xxx_setup_egress_floods(struct mv88e6xxx_chip *chip, int port) { struct dsa_switch *ds = chip->ds; bool flood; + int err; /* Upstream ports flood frames with unknown unicast or multicast DA */ flood = dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port); - if (chip->info->ops->port_set_egress_floods) - return chip->info->ops->port_set_egress_floods(chip, port, - flood, flood); + if (chip->info->ops->port_set_ucast_flood) { + err = chip->info->ops->port_set_ucast_flood(chip, port, flood); + if (err) + return err; + } + if (chip->info->ops->port_set_mcast_flood) { + err = chip->info->ops->port_set_mcast_flood(chip, port, flood); + if (err) + return err; + } return 0; } @@ -3239,7 +3247,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .port_pause_limit = mv88e6097_port_pause_limit, @@ -3278,7 +3287,8 @@ static const struct mv88e6xxx_ops mv88e6095_ops = { .port_sync_link = mv88e6185_port_sync_link, .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_set_frame_mode = mv88e6085_port_set_frame_mode, - .port_set_egress_floods = mv88e6185_port_set_egress_floods, + .port_set_ucast_flood = mv88e6185_port_set_forward_unknown, + .port_set_mcast_flood = mv88e6185_port_set_default_forward, .port_set_upstream_port = mv88e6095_port_set_upstream_port, .port_get_cmode = mv88e6185_port_get_cmode, .port_setup_message_port = mv88e6xxx_setup_message_port, @@ -3313,7 +3323,8 @@ static const struct mv88e6xxx_ops mv88e6097_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_egress_rate_limiting = mv88e6095_port_egress_rate_limiting, .port_pause_limit = mv88e6097_port_pause_limit, @@ -3357,7 +3368,8 @@ static const struct mv88e6xxx_ops mv88e6123_ops = { .port_sync_link = mv88e6xxx_port_sync_link, .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_set_frame_mode = mv88e6085_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit, .port_disable_pri_override = mv88e6xxx_port_disable_pri_override, .port_get_cmode = mv88e6185_port_get_cmode, @@ -3393,7 +3405,8 @@ static const struct mv88e6xxx_ops mv88e6131_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6185_port_set_egress_floods, + .port_set_ucast_flood = mv88e6185_port_set_forward_unknown, + .port_set_mcast_flood = mv88e6185_port_set_default_forward, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_upstream_port = mv88e6095_port_set_upstream_port, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, @@ -3437,7 +3450,8 @@ static const struct mv88e6xxx_ops mv88e6141_ops = { .port_max_speed_mode = mv88e6341_port_max_speed_mode, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -3487,7 +3501,8 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -3565,7 +3580,8 @@ static const struct mv88e6xxx_ops mv88e6171_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -3609,7 +3625,8 @@ static const struct mv88e6xxx_ops mv88e6172_ops = { .port_tag_remap = mv88e6095_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -3660,7 +3677,8 @@ static const struct mv88e6xxx_ops mv88e6175_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -3704,7 +3722,8 @@ static const struct mv88e6xxx_ops mv88e6176_ops = { .port_tag_remap = mv88e6095_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -3755,7 +3774,8 @@ static const struct mv88e6xxx_ops mv88e6185_ops = { .port_sync_link = mv88e6185_port_sync_link, .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_set_frame_mode = mv88e6085_port_set_frame_mode, - .port_set_egress_floods = mv88e6185_port_set_egress_floods, + .port_set_ucast_flood = mv88e6185_port_set_forward_unknown, + .port_set_mcast_flood = mv88e6185_port_set_default_forward, .port_egress_rate_limiting = mv88e6095_port_egress_rate_limiting, .port_set_upstream_port = mv88e6095_port_set_upstream_port, .port_set_pause = mv88e6185_port_set_pause, @@ -3800,7 +3820,8 @@ static const struct mv88e6xxx_ops mv88e6190_ops = { .port_tag_remap = mv88e6390_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_pause_limit = mv88e6390_port_pause_limit, @@ -3860,7 +3881,8 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = { .port_tag_remap = mv88e6390_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_pause_limit = mv88e6390_port_pause_limit, @@ -3919,7 +3941,8 @@ static const struct mv88e6xxx_ops mv88e6191_ops = { .port_max_speed_mode = mv88e6390_port_max_speed_mode, .port_tag_remap = mv88e6390_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_pause_limit = mv88e6390_port_pause_limit, .port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit, @@ -3979,7 +4002,8 @@ static const struct mv88e6xxx_ops mv88e6240_ops = { .port_tag_remap = mv88e6095_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4037,7 +4061,8 @@ static const struct mv88e6xxx_ops mv88e6250_ops = { .port_set_speed_duplex = mv88e6250_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, .port_pause_limit = mv88e6097_port_pause_limit, @@ -4077,7 +4102,8 @@ static const struct mv88e6xxx_ops mv88e6290_ops = { .port_tag_remap = mv88e6390_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_pause_limit = mv88e6390_port_pause_limit, .port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit, @@ -4136,7 +4162,8 @@ static const struct mv88e6xxx_ops mv88e6320_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4179,7 +4206,8 @@ static const struct mv88e6xxx_ops mv88e6321_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4222,7 +4250,8 @@ static const struct mv88e6xxx_ops mv88e6341_ops = { .port_max_speed_mode = mv88e6341_port_max_speed_mode, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4275,7 +4304,8 @@ static const struct mv88e6xxx_ops mv88e6350_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4316,7 +4346,8 @@ static const struct mv88e6xxx_ops mv88e6351_ops = { .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4362,7 +4393,8 @@ static const struct mv88e6xxx_ops mv88e6352_ops = { .port_tag_remap = mv88e6095_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4424,7 +4456,8 @@ static const struct mv88e6xxx_ops mv88e6390_ops = { .port_tag_remap = mv88e6390_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -4488,7 +4521,8 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = { .port_tag_remap = mv88e6390_port_tag_remap, .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, - .port_set_egress_floods = mv88e6352_port_set_egress_floods, + .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, + .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, .port_set_ether_type = mv88e6351_port_set_ether_type, .port_set_jumbo_size = mv88e6165_port_set_jumbo_size, .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting, @@ -5364,17 +5398,72 @@ static void mv88e6xxx_port_mirror_del(struct dsa_switch *ds, int port, mutex_unlock(&chip->reg_lock); } -static int mv88e6xxx_port_egress_floods(struct dsa_switch *ds, int port, - bool unicast, bool multicast) +static int mv88e6xxx_port_pre_bridge_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) +{ + struct mv88e6xxx_chip *chip = ds->priv; + const struct mv88e6xxx_ops *ops; + + if (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD)) + return -EINVAL; + + ops = chip->info->ops; + + if ((flags.mask & BR_FLOOD) && !ops->port_set_ucast_flood) + return -EINVAL; + + if ((flags.mask & BR_MCAST_FLOOD) && !ops->port_set_mcast_flood) + return -EINVAL; + + return 0; +} + +static int mv88e6xxx_port_bridge_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; int err = -EOPNOTSUPP; mv88e6xxx_reg_lock(chip); - if (chip->info->ops->port_set_egress_floods) - err = chip->info->ops->port_set_egress_floods(chip, port, - unicast, - multicast); + + if (flags.mask & BR_FLOOD) { + bool unicast = !!(flags.val & BR_FLOOD); + + err = chip->info->ops->port_set_ucast_flood(chip, port, + unicast); + if (err) + goto out; + } + + if (flags.mask & BR_MCAST_FLOOD) { + bool multicast = !!(flags.val & BR_MCAST_FLOOD); + + err = chip->info->ops->port_set_mcast_flood(chip, port, + multicast); + if (err) + goto out; + } + +out: + mv88e6xxx_reg_unlock(chip); + + return err; +} + +static int mv88e6xxx_port_set_mrouter(struct dsa_switch *ds, int port, + bool mrouter, + struct netlink_ext_ack *extack) +{ + struct mv88e6xxx_chip *chip = ds->priv; + int err; + + if (!chip->info->ops->port_set_mcast_flood) + return -EOPNOTSUPP; + + mv88e6xxx_reg_lock(chip); + err = chip->info->ops->port_set_mcast_flood(chip, port, mrouter); mv88e6xxx_reg_unlock(chip); return err; @@ -5678,7 +5767,9 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .set_ageing_time = mv88e6xxx_set_ageing_time, .port_bridge_join = mv88e6xxx_port_bridge_join, .port_bridge_leave = mv88e6xxx_port_bridge_leave, - .port_egress_floods = mv88e6xxx_port_egress_floods, + .port_pre_bridge_flags = mv88e6xxx_port_pre_bridge_flags, + .port_bridge_flags = mv88e6xxx_port_bridge_flags, + .port_set_mrouter = mv88e6xxx_port_set_mrouter, .port_stp_state_set = mv88e6xxx_port_stp_state_set, .port_fast_age = mv88e6xxx_port_fast_age, .port_vlan_filtering = mv88e6xxx_port_vlan_filtering, diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 788b3f585ef3..a57c8886f3ac 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -454,8 +454,10 @@ struct mv88e6xxx_ops { int (*port_set_frame_mode)(struct mv88e6xxx_chip *chip, int port, enum mv88e6xxx_frame_mode mode); - int (*port_set_egress_floods)(struct mv88e6xxx_chip *chip, int port, - bool unicast, bool multicast); + int (*port_set_ucast_flood)(struct mv88e6xxx_chip *chip, int port, + bool unicast); + int (*port_set_mcast_flood)(struct mv88e6xxx_chip *chip, int port, + bool multicast); int (*port_set_ether_type)(struct mv88e6xxx_chip *chip, int port, u16 etype); int (*port_set_jumbo_size)(struct mv88e6xxx_chip *chip, int port, diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 4b46e10a2dde..4561f289ab76 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -789,8 +789,8 @@ int mv88e6351_port_set_frame_mode(struct mv88e6xxx_chip *chip, int port, return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL0, reg); } -static int mv88e6185_port_set_forward_unknown(struct mv88e6xxx_chip *chip, - int port, bool unicast) +int mv88e6185_port_set_forward_unknown(struct mv88e6xxx_chip *chip, + int port, bool unicast) { int err; u16 reg; @@ -807,8 +807,8 @@ static int mv88e6185_port_set_forward_unknown(struct mv88e6xxx_chip *chip, return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL0, reg); } -int mv88e6352_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port, - bool unicast, bool multicast) +int mv88e6352_port_set_ucast_flood(struct mv88e6xxx_chip *chip, int port, + bool unicast) { int err; u16 reg; @@ -817,16 +817,28 @@ int mv88e6352_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port, if (err) return err; - reg &= ~MV88E6352_PORT_CTL0_EGRESS_FLOODS_MASK; + if (unicast) + reg |= MV88E6352_PORT_CTL0_EGRESS_FLOODS_UC; + else + reg &= ~MV88E6352_PORT_CTL0_EGRESS_FLOODS_UC; + + return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL0, reg); +} + +int mv88e6352_port_set_mcast_flood(struct mv88e6xxx_chip *chip, int port, + bool multicast) +{ + int err; + u16 reg; + + err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_CTL0, ®); + if (err) + return err; - if (unicast && multicast) - reg |= MV88E6352_PORT_CTL0_EGRESS_FLOODS_ALL_UNKNOWN_DA; - else if (unicast) - reg |= MV88E6352_PORT_CTL0_EGRESS_FLOODS_NO_UNKNOWN_MC_DA; - else if (multicast) - reg |= MV88E6352_PORT_CTL0_EGRESS_FLOODS_NO_UNKNOWN_UC_DA; + if (multicast) + reg |= MV88E6352_PORT_CTL0_EGRESS_FLOODS_MC; else - reg |= MV88E6352_PORT_CTL0_EGRESS_FLOODS_NO_UNKNOWN_DA; + reg &= ~MV88E6352_PORT_CTL0_EGRESS_FLOODS_MC; return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL0, reg); } @@ -1013,8 +1025,8 @@ static const char * const mv88e6xxx_port_8021q_mode_names[] = { [MV88E6XXX_PORT_CTL2_8021Q_MODE_SECURE] = "Secure", }; -static int mv88e6185_port_set_default_forward(struct mv88e6xxx_chip *chip, - int port, bool multicast) +int mv88e6185_port_set_default_forward(struct mv88e6xxx_chip *chip, + int port, bool multicast) { int err; u16 reg; @@ -1031,18 +1043,6 @@ static int mv88e6185_port_set_default_forward(struct mv88e6xxx_chip *chip, return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL2, reg); } -int mv88e6185_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port, - bool unicast, bool multicast) -{ - int err; - - err = mv88e6185_port_set_forward_unknown(chip, port, unicast); - if (err) - return err; - - return mv88e6185_port_set_default_forward(chip, port, multicast); -} - int mv88e6095_port_set_upstream_port(struct mv88e6xxx_chip *chip, int port, int upstream_port) { diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h index a729bba050df..e6d0eaa6aa1d 100644 --- a/drivers/net/dsa/mv88e6xxx/port.h +++ b/drivers/net/dsa/mv88e6xxx/port.h @@ -154,11 +154,8 @@ #define MV88E6185_PORT_CTL0_USE_IP 0x0020 #define MV88E6185_PORT_CTL0_USE_TAG 0x0010 #define MV88E6185_PORT_CTL0_FORWARD_UNKNOWN 0x0004 -#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_MASK 0x000c -#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_NO_UNKNOWN_DA 0x0000 -#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_NO_UNKNOWN_MC_DA 0x0004 -#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_NO_UNKNOWN_UC_DA 0x0008 -#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_ALL_UNKNOWN_DA 0x000c +#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_UC 0x0004 +#define MV88E6352_PORT_CTL0_EGRESS_FLOODS_MC 0x0008 #define MV88E6XXX_PORT_CTL0_STATE_MASK 0x0003 #define MV88E6XXX_PORT_CTL0_STATE_DISABLED 0x0000 #define MV88E6XXX_PORT_CTL0_STATE_BLOCKING 0x0001 @@ -343,10 +340,14 @@ int mv88e6085_port_set_frame_mode(struct mv88e6xxx_chip *chip, int port, enum mv88e6xxx_frame_mode mode); int mv88e6351_port_set_frame_mode(struct mv88e6xxx_chip *chip, int port, enum mv88e6xxx_frame_mode mode); -int mv88e6185_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port, - bool unicast, bool multicast); -int mv88e6352_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port, - bool unicast, bool multicast); +int mv88e6185_port_set_forward_unknown(struct mv88e6xxx_chip *chip, + int port, bool unicast); +int mv88e6185_port_set_default_forward(struct mv88e6xxx_chip *chip, + int port, bool multicast); +int mv88e6352_port_set_ucast_flood(struct mv88e6xxx_chip *chip, int port, + bool unicast); +int mv88e6352_port_set_mcast_flood(struct mv88e6xxx_chip *chip, int port, + bool multicast); int mv88e6352_port_set_policy(struct mv88e6xxx_chip *chip, int port, enum mv88e6xxx_policy_mapping mapping, enum mv88e6xxx_policy_action action); diff --git a/include/net/dsa.h b/include/net/dsa.h index d8de23ce7221..74457aaffec7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -626,8 +626,14 @@ struct dsa_switch_ops { void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); void (*port_fast_age)(struct dsa_switch *ds, int port); - int (*port_egress_floods)(struct dsa_switch *ds, int port, - bool unicast, bool multicast); + int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); + int (*port_bridge_flags)(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); + int (*port_set_mrouter)(struct dsa_switch *ds, int port, bool mrouter, + struct netlink_ext_ack *extack); /* * VLAN support diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index bc835f3de2be..f5949b39f6f7 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -184,10 +184,13 @@ int dsa_port_mdb_add(const struct dsa_port *dp, int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_pre_bridge_flags(const struct dsa_port *dp, - struct switchdev_brport_flags flags); + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); int dsa_port_bridge_flags(const struct dsa_port *dp, - struct switchdev_brport_flags flags); -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter); + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, + struct netlink_ext_ack *extack); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); int dsa_port_vlan_del(struct dsa_port *dp, diff --git a/net/dsa/port.c b/net/dsa/port.c index 368064dfd93e..80e6471a7a5c 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -140,7 +140,7 @@ static void dsa_port_change_brport_flags(struct dsa_port *dp, tmp.val = flags.val & BIT(flag); tmp.mask = BIT(flag); - dsa_port_bridge_flags(dp, tmp); + dsa_port_bridge_flags(dp, tmp, NULL); } } @@ -425,41 +425,38 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock) } int dsa_port_pre_bridge_flags(const struct dsa_port *dp, - struct switchdev_brport_flags flags) + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - if (!ds->ops->port_egress_floods || - (flags.mask & ~(BR_FLOOD | BR_MCAST_FLOOD))) + if (!ds->ops->port_pre_bridge_flags) return -EINVAL; - return 0; + return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack); } int dsa_port_bridge_flags(const struct dsa_port *dp, - struct switchdev_brport_flags flags) + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - int port = dp->index; - int err = 0; - if (ds->ops->port_egress_floods) - err = ds->ops->port_egress_floods(ds, port, - flags.val & BR_FLOOD, - flags.val & BR_MCAST_FLOOD); + if (!ds->ops->port_bridge_flags) + return -EINVAL; - return err; + return ds->ops->port_bridge_flags(ds, dp->index, flags, extack); } -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter) +int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - int port = dp->index; - if (!ds->ops->port_egress_floods) + if (!ds->ops->port_set_mrouter) return -EOPNOTSUPP; - return ds->ops->port_egress_floods(ds, port, true, mrouter); + return ds->ops->port_set_mrouter(ds, dp->index, mrouter, extack); } int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index be29008477d3..8c9a41a7209a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -292,13 +292,14 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: - ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags); + ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, + extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: - ret = dsa_port_bridge_flags(dp, attr->u.brport_flags); + ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: - ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter); + ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, extack); break; default: ret = -EOPNOTSUPP; -- cgit v1.2.3 From b360d94f1b8647bc164e7519ec900471836be14a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 12 Feb 2021 17:15:58 +0200 Subject: net: mscc: ocelot: use separate flooding PGID for broadcast In preparation of offloading the bridge port flags which have independent settings for unknown multicast and for broadcast, we should also start reserving one destination Port Group ID for the flooding of broadcast packets, to allow configuring it individually. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 2 ++ drivers/net/ethernet/mscc/ocelot.c | 13 ++++++++----- include/soc/mscc/ocelot.h | 15 ++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index ae11d3f030ac..00b053d8294f 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -299,6 +299,7 @@ static int felix_setup_tag_8021q(struct dsa_switch *ds, int cpu) cpu_flood = ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)); ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_UC); ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_MC); + ocelot_rmw_rix(ocelot, 0, cpu_flood, ANA_PGID_PGID, PGID_BC); felix->dsa_8021q_ctx = kzalloc(sizeof(*felix->dsa_8021q_ctx), GFP_KERNEL); @@ -412,6 +413,7 @@ static int felix_setup_tag_npi(struct dsa_switch *ds, int cpu) cpu_flood = ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)); ocelot_rmw_rix(ocelot, cpu_flood, cpu_flood, ANA_PGID_PGID, PGID_UC); ocelot_rmw_rix(ocelot, cpu_flood, cpu_flood, ANA_PGID_PGID, PGID_MC); + ocelot_rmw_rix(ocelot, cpu_flood, cpu_flood, ANA_PGID_PGID, PGID_BC); return 0; } diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 1654a6e22a7d..1a31598e2ae6 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1716,7 +1716,7 @@ int ocelot_init(struct ocelot *ocelot) /* Setup flooding PGIDs */ for (i = 0; i < ocelot->num_flooding_pgids; i++) ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) | - ANA_FLOODING_FLD_BROADCAST(PGID_MC) | + ANA_FLOODING_FLD_BROADCAST(PGID_BC) | ANA_FLOODING_FLD_UNICAST(PGID_UC), ANA_FLOODING, i); ocelot_write(ocelot, ANA_FLOODING_IPMC_FLD_MC6_DATA(PGID_MCIPV6) | @@ -1737,15 +1737,18 @@ int ocelot_init(struct ocelot *ocelot) ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_SRC + port); } - /* Allow broadcast MAC frames. */ for_each_nonreserved_multicast_dest_pgid(ocelot, i) { u32 val = ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports - 1, 0)); ocelot_write_rix(ocelot, val, ANA_PGID_PGID, i); } - ocelot_write_rix(ocelot, - ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports, 0)), - ANA_PGID_PGID, PGID_MC); + /* Allow broadcast and unknown L2 multicast to the CPU. */ + ocelot_rmw_rix(ocelot, ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)), + ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)), + ANA_PGID_PGID, PGID_MC); + ocelot_rmw_rix(ocelot, ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)), + ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)), + ANA_PGID_PGID, PGID_BC); ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_MCIPV4); ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_MCIPV6); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index bfce3df61bfd..9acbef1416f1 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -54,16 +54,17 @@ * PGID_CPU: used for whitelisting certain MAC addresses, such as the addresses * of the switch port net devices, towards the CPU port module. * PGID_UC: the flooding destinations for unknown unicast traffic. - * PGID_MC: the flooding destinations for broadcast and non-IP multicast - * traffic. + * PGID_MC: the flooding destinations for non-IP multicast traffic. * PGID_MCIPV4: the flooding destinations for IPv4 multicast traffic. * PGID_MCIPV6: the flooding destinations for IPv6 multicast traffic. + * PGID_BC: the flooding destinations for broadcast traffic. */ -#define PGID_CPU 59 -#define PGID_UC 60 -#define PGID_MC 61 -#define PGID_MCIPV4 62 -#define PGID_MCIPV6 63 +#define PGID_CPU 58 +#define PGID_UC 59 +#define PGID_MC 60 +#define PGID_MCIPV4 61 +#define PGID_MCIPV6 62 +#define PGID_BC 63 #define for_each_unicast_dest_pgid(ocelot, pgid) \ for ((pgid) = 0; \ -- cgit v1.2.3 From 421741ea5672cf16fa551bcde23e327075ed419e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 12 Feb 2021 17:15:59 +0200 Subject: net: mscc: ocelot: offload bridge port flags to device We should not be unconditionally enabling address learning, since doing that is actively detrimential when a port is standalone and not offloading a bridge. Namely, if a port in the switch is standalone and others are offloading the bridge, then we could enter a situation where we learn an address towards the standalone port, but the bridged ports could not forward the packet there, because the CPU is the only path between the standalone and the bridged ports. The solution of course is to not enable address learning unless the bridge asks for it. We need to set up the initial port flags for no learning and flooding everything, and also when the port joins and leaves the bridge. The flood configuration was already configured ok for standalone mode in ocelot_init, we just need to disable learning in ocelot_init_port. Signed-off-by: Vladimir Oltean Reviewed-by: Alexandre Belloni Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 22 +++++++++ drivers/net/ethernet/mscc/ocelot.c | 87 +++++++++++++++++++++++++++++++++- drivers/net/ethernet/mscc/ocelot_net.c | 49 +++++++++++++++++-- include/soc/mscc/ocelot.h | 5 ++ 4 files changed, 158 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 00b053d8294f..d3180b0f2307 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -556,6 +556,26 @@ static void felix_bridge_stp_state_set(struct dsa_switch *ds, int port, return ocelot_bridge_stp_state_set(ocelot, port, state); } +static int felix_pre_bridge_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags val, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = ds->priv; + + return ocelot_port_pre_bridge_flags(ocelot, port, val); +} + +static int felix_bridge_flags(struct dsa_switch *ds, int port, + struct switchdev_brport_flags val, + struct netlink_ext_ack *extack) +{ + struct ocelot *ocelot = ds->priv; + + ocelot_port_bridge_flags(ocelot, port, val); + + return 0; +} + static int felix_bridge_join(struct dsa_switch *ds, int port, struct net_device *br) { @@ -1376,6 +1396,8 @@ const struct dsa_switch_ops felix_switch_ops = { .port_fdb_del = felix_fdb_del, .port_mdb_add = felix_mdb_add, .port_mdb_del = felix_mdb_del, + .port_pre_bridge_flags = felix_pre_bridge_flags, + .port_bridge_flags = felix_bridge_flags, .port_bridge_join = felix_bridge_join, .port_bridge_leave = felix_bridge_leave, .port_lag_join = felix_lag_join, diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 1a31598e2ae6..d1a9cdbf7a3e 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1038,6 +1038,7 @@ EXPORT_SYMBOL(ocelot_apply_bridge_fwd_mask); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; u32 port_cfg; if (!(BIT(port) & ocelot->bridge_mask)) @@ -1050,7 +1051,8 @@ void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state) ocelot->bridge_fwd_mask |= BIT(port); fallthrough; case BR_STATE_LEARNING: - port_cfg |= ANA_PORT_PORT_CFG_LEARN_ENA; + if (ocelot_port->learn_ena) + port_cfg |= ANA_PORT_PORT_CFG_LEARN_ENA; break; default: @@ -1534,6 +1536,86 @@ int ocelot_get_max_mtu(struct ocelot *ocelot, int port) } EXPORT_SYMBOL(ocelot_get_max_mtu); +static void ocelot_port_set_learning(struct ocelot *ocelot, int port, + bool enabled) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + u32 val = 0; + + if (enabled) + val = ANA_PORT_PORT_CFG_LEARN_ENA; + + ocelot_rmw_gix(ocelot, val, ANA_PORT_PORT_CFG_LEARN_ENA, + ANA_PORT_PORT_CFG, port); + + ocelot_port->learn_ena = enabled; +} + +static void ocelot_port_set_ucast_flood(struct ocelot *ocelot, int port, + bool enabled) +{ + u32 val = 0; + + if (enabled) + val = BIT(port); + + ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_UC); +} + +static void ocelot_port_set_mcast_flood(struct ocelot *ocelot, int port, + bool enabled) +{ + u32 val = 0; + + if (enabled) + val = BIT(port); + + ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MC); +} + +static void ocelot_port_set_bcast_flood(struct ocelot *ocelot, int port, + bool enabled) +{ + u32 val = 0; + + if (enabled) + val = BIT(port); + + ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_BC); +} + +int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port, + struct switchdev_brport_flags flags) +{ + if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | + BR_BCAST_FLOOD)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(ocelot_port_pre_bridge_flags); + +void ocelot_port_bridge_flags(struct ocelot *ocelot, int port, + struct switchdev_brport_flags flags) +{ + if (flags.mask & BR_LEARNING) + ocelot_port_set_learning(ocelot, port, + !!(flags.val & BR_LEARNING)); + + if (flags.mask & BR_FLOOD) + ocelot_port_set_ucast_flood(ocelot, port, + !!(flags.val & BR_FLOOD)); + + if (flags.mask & BR_MCAST_FLOOD) + ocelot_port_set_mcast_flood(ocelot, port, + !!(flags.val & BR_MCAST_FLOOD)); + + if (flags.mask & BR_BCAST_FLOOD) + ocelot_port_set_bcast_flood(ocelot, port, + !!(flags.val & BR_BCAST_FLOOD)); +} +EXPORT_SYMBOL(ocelot_port_bridge_flags); + void ocelot_init_port(struct ocelot *ocelot, int port) { struct ocelot_port *ocelot_port = ocelot->ports[port]; @@ -1583,6 +1665,9 @@ void ocelot_init_port(struct ocelot *ocelot, int port) REW_PORT_VLAN_CFG_PORT_TPID_M, REW_PORT_VLAN_CFG, port); + /* Disable source address learning for standalone mode */ + ocelot_port_set_learning(ocelot, port, false); + /* Enable vcap lookups */ ocelot_vcap_enable(ocelot, port); } diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index f9da4aa39444..b5ffe6724eb7 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1026,6 +1026,13 @@ static int ocelot_port_attr_set(struct net_device *dev, case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED: ocelot_port_attr_mc_set(ocelot, port, !attr->u.mc_disabled); break; + case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: + err = ocelot_port_pre_bridge_flags(ocelot, port, + attr->u.brport_flags); + break; + case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: + ocelot_port_bridge_flags(ocelot, port, attr->u.brport_flags); + break; default: err = -EOPNOTSUPP; break; @@ -1111,6 +1118,40 @@ static int ocelot_port_obj_del(struct net_device *dev, return ret; } +static int ocelot_netdevice_bridge_join(struct ocelot *ocelot, int port, + struct net_device *bridge) +{ + struct switchdev_brport_flags flags; + int err; + + flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + flags.val = flags.mask; + + err = ocelot_port_bridge_join(ocelot, port, bridge); + if (err) + return err; + + ocelot_port_bridge_flags(ocelot, port, flags); + + return 0; +} + +static int ocelot_netdevice_bridge_leave(struct ocelot *ocelot, int port, + struct net_device *bridge) +{ + struct switchdev_brport_flags flags; + int err; + + flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + flags.val = flags.mask & ~BR_LEARNING; + + err = ocelot_port_bridge_leave(ocelot, port, bridge); + + ocelot_port_bridge_flags(ocelot, port, flags); + + return err; +} + static int ocelot_netdevice_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { @@ -1122,11 +1163,11 @@ static int ocelot_netdevice_changeupper(struct net_device *dev, if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { - err = ocelot_port_bridge_join(ocelot, port, - info->upper_dev); + err = ocelot_netdevice_bridge_join(ocelot, port, + info->upper_dev); } else { - err = ocelot_port_bridge_leave(ocelot, port, - info->upper_dev); + err = ocelot_netdevice_bridge_leave(ocelot, port, + info->upper_dev); } } if (netif_is_lag_master(info->upper_dev)) { diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 9acbef1416f1..40792b37bb9f 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -612,6 +612,7 @@ struct ocelot_port { u8 *xmit_template; bool is_dsa_8021q_cpu; + bool learn_ena; struct net_device *bond; bool lag_tx_active; @@ -766,6 +767,10 @@ void ocelot_adjust_link(struct ocelot *ocelot, int port, int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state); void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot); +int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port, + struct switchdev_brport_flags val); +void ocelot_port_bridge_flags(struct ocelot *ocelot, int port, + struct switchdev_brport_flags val); int ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge); int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, -- cgit v1.2.3 From f969dc5a885736842c3511ecdea240fbb02d25d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Feb 2021 15:22:13 -0800 Subject: tcp: fix SO_RCVLOWAT related hangs under mem pressure While commit 24adbc1676af ("tcp: fix SO_RCVLOWAT hangs with fat skbs") fixed an issue vs too small sk_rcvbuf for given sk_rcvlowat constraint, it missed to address issue caused by memory pressure. 1) If we are under memory pressure and socket receive queue is empty. First incoming packet is allowed to be queued, after commit 76dfa6082032 ("tcp: allow one skb to be received per socket under memory pressure") But we do not send EPOLLIN yet, in case tcp_data_ready() sees sk_rcvlowat is bigger than skb length. 2) Then, when next packet comes, it is dropped, and we directly call sk->sk_data_ready(). 3) If application is using poll(), tcp_poll() will then use tcp_stream_is_readable() and decide the socket receive queue is not yet filled, so nothing will happen. Even when sender retransmits packets, phases 2) & 3) repeat and flow is effectively frozen, until memory pressure is off. Fix is to consider tcp_under_memory_pressure() to take care of global memory pressure or memcg pressure. Fixes: 24adbc1676af ("tcp: fix SO_RCVLOWAT hangs with fat skbs") Signed-off-by: Eric Dumazet Reported-by: Arjun Roy Suggested-by: Wei Wang Reviewed-by: Wei Wang Signed-off-by: David S. Miller --- include/net/tcp.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 25bbada379c4..244208f6f6c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1431,8 +1431,13 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied); */ static inline bool tcp_rmem_pressure(const struct sock *sk) { - int rcvbuf = READ_ONCE(sk->sk_rcvbuf); - int threshold = rcvbuf - (rcvbuf >> 3); + int rcvbuf, threshold; + + if (tcp_under_memory_pressure(sk)) + return true; + + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } -- cgit v1.2.3 From 05dc72aba364d374a27de567fac58c199ff5ee97 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Feb 2021 15:22:14 -0800 Subject: tcp: factorize logic into tcp_epollin_ready() Both tcp_data_ready() and tcp_stream_is_readable() share the same logic. Add tcp_epollin_ready() helper to avoid duplication. Signed-off-by: Eric Dumazet Cc: Arjun Roy Cc: Wei Wang Signed-off-by: David S. Miller --- include/net/tcp.h | 12 ++++++++++++ net/ipv4/tcp.c | 18 +++++------------- net/ipv4/tcp_input.c | 11 ++--------- 3 files changed, 19 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 244208f6f6c2..484eb2362645 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1442,6 +1442,18 @@ static inline bool tcp_rmem_pressure(const struct sock *sk) return atomic_read(&sk->sk_rmem_alloc) > threshold; } +static inline bool tcp_epollin_ready(const struct sock *sk, int target) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); + + if (avail <= 0) + return false; + + return (avail >= target) || tcp_rmem_pressure(sk) || + (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); +} + extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9896ca10bb34..7a6b58ae408d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -481,19 +481,11 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } -static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, - int target, struct sock *sk) +static bool tcp_stream_is_readable(struct sock *sk, int target) { - int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); - - if (avail > 0) { - if (avail >= target) - return true; - if (tcp_rmem_pressure(sk)) - return true; - if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) - return true; - } + if (tcp_epollin_ready(sk, target)) + return true; + if (sk->sk_prot->stream_memory_read) return sk->sk_prot->stream_memory_read(sk); return false; @@ -568,7 +560,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tcp_stream_is_readable(tp, target, sk)) + if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a8f8f9815953..e32a7056cb76 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4924,15 +4924,8 @@ err: void tcp_data_ready(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - int avail = tp->rcv_nxt - tp->copied_seq; - - if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && - !sock_flag(sk, SOCK_DONE) && - tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) - return; - - sk->sk_data_ready(sk); + if (tcp_epollin_ready(sk, sk->sk_rcvlowat)) + sk->sk_data_ready(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From e5069b9c23b3857db986c58801bebe450cff3392 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Feb 2021 00:56:41 +0400 Subject: bpf: Support pointers in global func args Add an ability to pass a pointer to a type with known size in arguments of a global function. Such pointers may be used to overcome the limit on the maximum number of arguments, avoid expensive and tricky workarounds and to have multiple output arguments. A referenced type may contain pointers but indirect access through them isn't supported. The implementation consists of two parts. If a global function has an argument that is a pointer to a type with known size then: 1) In btf_check_func_arg_match(): check that the corresponding register points to NULL or to a valid memory region that is large enough to contain the expected argument's type. 2) In btf_prepare_func_args(): set the corresponding register type to PTR_TO_MEM_OR_NULL and its size to the size of the expected type. Only global functions are supported because allowance of pointers for static functions might break validation. Consider the following scenario. A static function has a pointer argument. A caller passes pointer to its stack memory. Because the callee can change referenced memory verifier cannot longer assume any particular slot type of the caller's stack memory hence the slot type is changed to SLOT_MISC. If there is an operation that relies on slot type other than SLOT_MISC then verifier won't be able to infer safety of the operation. When verifier sees a static function that has a pointer argument different from PTR_TO_CTX then it skips arguments check and continues with "inline" validation with more information available. The operation that relies on the particular slot type now succeeds. Because global functions were not allowed to have pointer arguments different from PTR_TO_CTX it's not possible to break existing and valid code. Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210212205642.620788-4-me@ubique.spb.ru --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/btf.c | 55 ++++++++++++++++++++++++++++++++++++-------- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 532c97836d0d..971b33aca13d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -471,6 +471,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd5d2c563693..2efeb5f4b343 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5297,9 +5297,10 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; - u32 i, nargs, btf_id; + const struct btf_type *t, *ref_t; + u32 i, nargs, btf_id, type_size; const char *tname; + bool is_global; if (!prog->aux->func_info) return -EINVAL; @@ -5333,6 +5334,8 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); goto out; } + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -5349,10 +5352,6 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, goto out; } if (btf_type_is_ptr(t)) { - if (reg->type == SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer\n", i + 1); - goto out; - } /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5367,6 +5366,25 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, goto out; continue; } + + if (!is_global) + goto out; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, &type_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + goto out; + } + + if (check_mem_reg(env, reg, i + 1, type_size)) + goto out; + + continue; } bpf_log(log, "Unrecognized arg#%d type %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5397,7 +5415,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; + const struct btf_type *t, *ref_t; u32 i, nargs, btf_id; const char *tname; @@ -5470,9 +5488,26 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, reg->type = SCALAR_VALUE; continue; } - if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg->type = PTR_TO_CTX; + if (btf_type_is_ptr(t)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg->type = PTR_TO_CTX; + continue; + } + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, ®->mem_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + return -EINVAL; + } + + reg->type = PTR_TO_MEM_OR_NULL; + reg->id = ++env->id_gen; + continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9f75bd7f8d3..11a242932a2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4272,6 +4272,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -11960,6 +11983,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ -- cgit v1.2.3 From fec6e49b63989657bc4076dad99fa51d5ece34da Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Feb 2021 14:12:02 +0000 Subject: skbuff: remove __kfree_skb_flush() This function isn't much needed as NAPI skb queue gets bulk-freed anyway when there's no more room, and even may reduce the efficiency of bulk operations. It will be even less needed after reusing skb cache on allocation path, so remove it and this way lighten network softirqs a bit. Suggested-by: Eric Dumazet Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - net/core/dev.c | 7 +------ net/core/skbuff.c | 12 ------------ 3 files changed, 1 insertion(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0a4e91a2f873..0e0707296098 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2919,7 +2919,6 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, } void napi_consume_skb(struct sk_buff *skb, int budget); -void __kfree_skb_flush(void); void __kfree_skb_defer(struct sk_buff *skb); /** diff --git a/net/core/dev.c b/net/core/dev.c index ce6291bc2e16..631807c196ad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4944,8 +4944,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) else __kfree_skb_defer(skb); } - - __kfree_skb_flush(); } if (sd->output_queue) { @@ -7012,7 +7010,6 @@ static int napi_threaded_poll(void *data) __napi_poll(napi, &repoll); netpoll_poll_unlock(have); - __kfree_skb_flush(); local_bh_enable(); if (!repoll) @@ -7042,7 +7039,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - goto out; + return; break; } @@ -7069,8 +7066,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); -out: - __kfree_skb_flush(); } struct netdev_adjacent { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1c6f6ef70339..4be2bb969535 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -838,18 +838,6 @@ void __consume_stateless_skb(struct sk_buff *skb) kfree_skbmem(skb); } -void __kfree_skb_flush(void) -{ - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - - /* flush skb_cache if containing objects */ - if (nc->skb_count) { - kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, - nc->skb_cache); - nc->skb_count = 0; - } -} - static inline void _kfree_skb_defer(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); -- cgit v1.2.3 From f450d539c05a14c103dd174718f81bb2fe65cb4b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Feb 2021 14:12:25 +0000 Subject: skbuff: introduce {,__}napi_build_skb() which reuses NAPI cache heads Instead of just bulk-flushing skbuff_heads queued up through napi_consume_skb() or __kfree_skb_defer(), try to reuse them on allocation path. If the cache is empty on allocation, bulk-allocate the first 16 elements, which is more efficient than per-skb allocation. If the cache is full on freeing, bulk-wipe the second half of the cache (32 elements). This also includes custom KASAN poisoning/unpoisoning to be double sure there are no use-after-free cases. To not change current behaviour, introduce a new function, napi_build_skb(), to optionally use a new approach later in drivers. Note on selected bulk size, 16: - this equals to XDP_BULK_QUEUE_SIZE, DEV_MAP_BULK_SIZE and especially VETH_XDP_BATCH, which is also used to bulk-allocate skbuff_heads and was tested on powerful setups; - this also showed the best performance in the actual test series (from the array of {8, 16, 32}). Suggested-by: Edward Cree # Divide on two halves Suggested-by: Eric Dumazet # KASAN poisoning Cc: Dmitry Vyukov # Help with KASAN Cc: Paolo Abeni # Reduced batch size Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 94 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 83 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0e0707296098..906122eac82a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1087,6 +1087,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); +struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); + /** * alloc_skb - allocate a network buffer * @size: size to allocate diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 860a9d4f752f..9e1a8ded4acc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -120,6 +120,8 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) } #define NAPI_SKB_CACHE_SIZE 64 +#define NAPI_SKB_CACHE_BULK 16 +#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) struct napi_alloc_cache { struct page_frag_cache page; @@ -164,6 +166,25 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) } EXPORT_SYMBOL(__netdev_alloc_frag_align); +static struct sk_buff *napi_skb_cache_get(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb; + + if (unlikely(!nc->skb_count)) + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, + GFP_ATOMIC, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); + if (unlikely(!nc->skb_count)) + return NULL; + + skb = nc->skb_cache[--nc->skb_count]; + kasan_unpoison_object_data(skbuff_head_cache, skb); + + return skb; +} + /* Caller must provide SKB that is memset cleared */ static void __build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) @@ -265,6 +286,53 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, } EXPORT_SYMBOL(build_skb_around); +/** + * __napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __build_skb() that uses NAPI percpu caches to obtain + * skbuff_head instead of inplace allocation. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; + + skb = napi_skb_cache_get(); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} + +/** + * napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __napi_build_skb() that takes care of skb->head_frag + * and skb->pfmemalloc when the data is a page or page fragment. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __napi_build_skb(data, frag_size); + + if (likely(skb) && frag_size) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } + + return skb; +} +EXPORT_SYMBOL(napi_build_skb); + /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and @@ -838,31 +906,31 @@ void __consume_stateless_skb(struct sk_buff *skb) kfree_skbmem(skb); } -static inline void _kfree_skb_defer(struct sk_buff *skb) +static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 i; /* drop skb->head and call any destructors for packet */ skb_release_all(skb); - /* record skb to CPU local list */ + kasan_poison_object_data(skbuff_head_cache, skb); nc->skb_cache[nc->skb_count++] = skb; -#ifdef CONFIG_SLUB - /* SLUB writes into objects when freeing */ - prefetchw(skb); -#endif - - /* flush skb_cache if it is filled */ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { - kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, - nc->skb_cache); - nc->skb_count = 0; + for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) + kasan_unpoison_object_data(skbuff_head_cache, + nc->skb_cache[i]); + + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, + nc->skb_cache + NAPI_SKB_CACHE_HALF); + nc->skb_count = NAPI_SKB_CACHE_HALF; } } + void __kfree_skb_defer(struct sk_buff *skb) { - _kfree_skb_defer(skb); + napi_skb_cache_put(skb); } void napi_consume_skb(struct sk_buff *skb, int budget) @@ -887,7 +955,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - _kfree_skb_defer(skb); + napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); -- cgit v1.2.3 From 9243adfc311a20371c3f4d8eaf0af4b135e6fac3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Feb 2021 14:13:09 +0000 Subject: skbuff: queue NAPI_MERGED_FREE skbs into NAPI cache instead of freeing napi_frags_finish() and napi_skb_finish() can only be called inside NAPI Rx context, so we can feed NAPI cache with skbuff_heads that got NAPI_MERGED_FREE verdict instead of immediate freeing. Replace __kfree_skb() with __kfree_skb_defer() in napi_skb_finish() and move napi_skb_free_stolen_head() to skbuff.c, so it can drop skbs to NAPI cache. As many drivers call napi_alloc_skb()/napi_get_frags() on their receive path, this becomes especially useful. Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/dev.c | 9 +-------- net/core/skbuff.c | 12 +++++++++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 906122eac82a..6d0a33d1c0db 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2921,6 +2921,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, } void napi_consume_skb(struct sk_buff *skb, int budget); +void napi_skb_free_stolen_head(struct sk_buff *skb); void __kfree_skb_defer(struct sk_buff *skb); /** diff --git a/net/core/dev.c b/net/core/dev.c index 631807c196ad..ea9b46318d23 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6095,13 +6095,6 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static void napi_skb_free_stolen_head(struct sk_buff *skb) -{ - skb_dst_drop(skb); - skb_ext_put(skb); - kmem_cache_free(skbuff_head_cache, skb); -} - static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) @@ -6115,7 +6108,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else - __kfree_skb(skb); + __kfree_skb_defer(skb); break; case GRO_HELD: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 875e1a453f7e..545a472273a5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -916,9 +916,6 @@ static void napi_skb_cache_put(struct sk_buff *skb) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; - /* drop skb->head and call any destructors for packet */ - skb_release_all(skb); - kasan_poison_object_data(skbuff_head_cache, skb); nc->skb_cache[nc->skb_count++] = skb; @@ -935,6 +932,14 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __kfree_skb_defer(struct sk_buff *skb) { + skb_release_all(skb); + napi_skb_cache_put(skb); +} + +void napi_skb_free_stolen_head(struct sk_buff *skb) +{ + skb_dst_drop(skb); + skb_ext_put(skb); napi_skb_cache_put(skb); } @@ -960,6 +965,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } + skb_release_all(skb); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); -- cgit v1.2.3 From 137ffbc4bb86a12d7979e6f839d4babc8aef7669 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 14 Feb 2021 00:37:54 +0200 Subject: net: mscc: ocelot: refactor ocelot_port_inject_frame out of ocelot_port_xmit The felix DSA driver will inject some frames through register MMIO, same as ocelot switchdev currently does. So we need to be able to reuse the common code. Also create some shim definitions, since the DSA tagger can be compiled without support for the switch driver. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 80 +++++++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_net.c | 81 +++------------------------------- include/soc/mscc/ocelot.h | 22 +++++++++ 3 files changed, 109 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index d1a9cdbf7a3e..7106d9ee534a 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -628,6 +628,86 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) } EXPORT_SYMBOL(ocelot_get_txtstamp); +/* Generate the IFH for frame injection + * + * The IFH is a 128bit-value + * bit 127: bypass the analyzer processing + * bit 56-67: destination mask + * bit 28-29: pop_cnt: 3 disables all rewriting of the frame + * bit 20-27: cpu extraction queue mask + * bit 16: tag type 0: C-tag, 1: S-tag + * bit 0-11: VID + */ +static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info) +{ + ifh[0] = IFH_INJ_BYPASS | ((0x1ff & info->rew_op) << 21); + ifh[1] = (0xf00 & info->port) >> 8; + ifh[2] = (0xff & info->port) << 24; + ifh[3] = (info->tag_type << 16) | info->vid; + + return 0; +} + +bool ocelot_can_inject(struct ocelot *ocelot, int grp) +{ + u32 val = ocelot_read(ocelot, QS_INJ_STATUS); + + if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp)))) + return false; + if (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp))) + return false; + + return true; +} +EXPORT_SYMBOL(ocelot_can_inject); + +void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, + u32 rew_op, struct sk_buff *skb) +{ + struct frame_info info = {}; + u32 ifh[OCELOT_TAG_LEN / 4]; + unsigned int i, count, last; + + ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | + QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); + + info.port = BIT(port); + info.tag_type = IFH_TAG_TYPE_C; + info.vid = skb_vlan_tag_get(skb); + info.rew_op = rew_op; + + ocelot_gen_ifh(ifh, &info); + + for (i = 0; i < OCELOT_TAG_LEN / 4; i++) + ocelot_write_rix(ocelot, (__force u32)cpu_to_be32(ifh[i]), + QS_INJ_WR, grp); + + count = DIV_ROUND_UP(skb->len, 4); + last = skb->len % 4; + for (i = 0; i < count; i++) + ocelot_write_rix(ocelot, ((u32 *)skb->data)[i], QS_INJ_WR, grp); + + /* Add padding */ + while (i < (OCELOT_BUFFER_CELL_SZ / 4)) { + ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); + i++; + } + + /* Indicate EOF and valid bytes in last word */ + ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | + QS_INJ_CTRL_VLD_BYTES(skb->len < OCELOT_BUFFER_CELL_SZ ? 0 : last) | + QS_INJ_CTRL_EOF, + QS_INJ_CTRL, grp); + + /* Add dummy CRC */ + ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); + skb_tx_timestamp(skb); + + skb->dev->stats.tx_packets++; + skb->dev->stats.tx_bytes += skb->len; +} +EXPORT_SYMBOL(ocelot_port_inject_frame); + int ocelot_fdb_add(struct ocelot *ocelot, int port, const unsigned char *addr, u16 vid) { diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 1ab453298a18..6518262532f0 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -488,53 +488,20 @@ static int ocelot_port_stop(struct net_device *dev) return 0; } -/* Generate the IFH for frame injection - * - * The IFH is a 128bit-value - * bit 127: bypass the analyzer processing - * bit 56-67: destination mask - * bit 28-29: pop_cnt: 3 disables all rewriting of the frame - * bit 20-27: cpu extraction queue mask - * bit 16: tag type 0: C-tag, 1: S-tag - * bit 0-11: VID - */ -static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info) -{ - ifh[0] = IFH_INJ_BYPASS | ((0x1ff & info->rew_op) << 21); - ifh[1] = (0xf00 & info->port) >> 8; - ifh[2] = (0xff & info->port) << 24; - ifh[3] = (info->tag_type << 16) | info->vid; - - return 0; -} - -static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) { struct ocelot_port_private *priv = netdev_priv(dev); - struct skb_shared_info *shinfo = skb_shinfo(skb); struct ocelot_port *ocelot_port = &priv->port; struct ocelot *ocelot = ocelot_port->ocelot; - u32 val, ifh[OCELOT_TAG_LEN / 4]; - struct frame_info info = {}; - u8 grp = 0; /* Send everything on CPU group 0 */ - unsigned int i, count, last; int port = priv->chip_port; + u32 rew_op = 0; - val = ocelot_read(ocelot, QS_INJ_STATUS); - if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp))) || - (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp)))) + if (!ocelot_can_inject(ocelot, 0)) return NETDEV_TX_BUSY; - ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | - QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); - - info.port = BIT(port); - info.tag_type = IFH_TAG_TYPE_C; - info.vid = skb_vlan_tag_get(skb); - /* Check if timestamping is needed */ - if (ocelot->ptp && (shinfo->tx_flags & SKBTX_HW_TSTAMP)) { - info.rew_op = ocelot_port->ptp_cmd; + if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + rew_op = ocelot_port->ptp_cmd; if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { struct sk_buff *clone; @@ -547,45 +514,11 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) ocelot_port_add_txtstamp_skb(ocelot, port, clone); - info.rew_op |= clone->cb[0] << 3; + rew_op |= clone->cb[0] << 3; } } - if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP) { - info.rew_op = ocelot_port->ptp_cmd; - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - info.rew_op |= skb->cb[0] << 3; - } - - ocelot_gen_ifh(ifh, &info); - - for (i = 0; i < OCELOT_TAG_LEN / 4; i++) - ocelot_write_rix(ocelot, (__force u32)cpu_to_be32(ifh[i]), - QS_INJ_WR, grp); - - count = DIV_ROUND_UP(skb->len, 4); - last = skb->len % 4; - for (i = 0; i < count; i++) - ocelot_write_rix(ocelot, ((u32 *)skb->data)[i], QS_INJ_WR, grp); - - /* Add padding */ - while (i < (OCELOT_BUFFER_CELL_SZ / 4)) { - ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); - i++; - } - - /* Indicate EOF and valid bytes in last word */ - ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | - QS_INJ_CTRL_VLD_BYTES(skb->len < OCELOT_BUFFER_CELL_SZ ? 0 : last) | - QS_INJ_CTRL_EOF, - QS_INJ_CTRL, grp); - - /* Add dummy CRC */ - ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp); - skb_tx_timestamp(skb); - - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); kfree_skb(skb); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 40792b37bb9f..a22aa944e921 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -742,6 +742,28 @@ u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, u32 val, u32 reg, u32 offset); +/* Packet I/O */ +#if IS_ENABLED(CONFIG_MSCC_OCELOT_SWITCH_LIB) + +bool ocelot_can_inject(struct ocelot *ocelot, int grp); +void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, + u32 rew_op, struct sk_buff *skb); + +#else + +static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp) +{ + return false; +} + +static inline void ocelot_port_inject_frame(struct ocelot *ocelot, int port, + int grp, u32 rew_op, + struct sk_buff *skb) +{ +} + +#endif + /* Hardware initialization */ int ocelot_regfields_init(struct ocelot *ocelot, const struct reg_field *const regfields); -- cgit v1.2.3 From 40d3f295b5feda409784e569550057b5fbc2a295 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 14 Feb 2021 00:37:56 +0200 Subject: net: mscc: ocelot: use common tag parsing code with DSA The Injection Frame Header and Extraction Frame Header that the switch prepends to frames over the NPI port is also prepended to frames delivered over the CPU port module's queues. Let's unify the handling of the frame headers by making the ocelot driver call some helpers exported by the DSA tagger. Among other things, this allows us to get rid of the strange cpu_to_be32 when transmitting the Injection Frame Header on ocelot, since the packing API uses network byte order natively (when "quirks" is 0). The comments above ocelot_gen_ifh talk about setting pop_cnt to 3, and the cpu extraction queue mask to something, but the code doesn't do it, so we don't do it either. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 6 +- drivers/net/dsa/ocelot/felix_vsc9959.c | 2 +- drivers/net/dsa/ocelot/seville_vsc9953.c | 2 +- drivers/net/ethernet/mscc/ocelot.c | 38 ++---- drivers/net/ethernet/mscc/ocelot.h | 9 -- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 54 +++----- include/linux/dsa/ocelot.h | 208 +++++++++++++++++++++++++++++ include/soc/mscc/ocelot.h | 7 - net/dsa/tag_ocelot.c | 147 ++------------------ 9 files changed, 246 insertions(+), 227 deletions(-) create mode 100644 include/linux/dsa/ocelot.h (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index d3180b0f2307..60e0d354adea 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -1161,9 +1161,9 @@ static int felix_hwtstamp_set(struct dsa_switch *ds, int port, static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type) { + u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; struct skb_shared_hwtstamps *shhwtstamps; struct ocelot *ocelot = ds->priv; - u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; u32 tstamp_lo, tstamp_hi; struct timespec64 ts; u64 tstamp, val; @@ -1171,7 +1171,7 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tstamp = ktime_set(ts.tv_sec, ts.tv_nsec); - packing(extraction, &val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); + ocelot_xfh_get_rew_val(extraction, &val); tstamp_lo = (u32)val; tstamp_hi = tstamp >> 32; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index e944868cc120..cacc6f9c0113 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 512f677a6c1c..d7348ea4831e 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include "felix.h" diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 7106d9ee534a..699b0c1c1780 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -4,6 +4,7 @@ * * Copyright (c) 2017 Microsemi Corporation */ +#include #include #include #include "ocelot.h" @@ -628,26 +629,6 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) } EXPORT_SYMBOL(ocelot_get_txtstamp); -/* Generate the IFH for frame injection - * - * The IFH is a 128bit-value - * bit 127: bypass the analyzer processing - * bit 56-67: destination mask - * bit 28-29: pop_cnt: 3 disables all rewriting of the frame - * bit 20-27: cpu extraction queue mask - * bit 16: tag type 0: C-tag, 1: S-tag - * bit 0-11: VID - */ -static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info) -{ - ifh[0] = IFH_INJ_BYPASS | ((0x1ff & info->rew_op) << 21); - ifh[1] = (0xf00 & info->port) >> 8; - ifh[2] = (0xff & info->port) << 24; - ifh[3] = (info->tag_type << 16) | info->vid; - - return 0; -} - bool ocelot_can_inject(struct ocelot *ocelot, int grp) { u32 val = ocelot_read(ocelot, QS_INJ_STATUS); @@ -664,23 +645,20 @@ EXPORT_SYMBOL(ocelot_can_inject); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb) { - struct frame_info info = {}; - u32 ifh[OCELOT_TAG_LEN / 4]; + u32 ifh[OCELOT_TAG_LEN / 4] = {0}; unsigned int i, count, last; ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) | QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp); - info.port = BIT(port); - info.tag_type = IFH_TAG_TYPE_C; - info.vid = skb_vlan_tag_get(skb); - info.rew_op = rew_op; - - ocelot_gen_ifh(ifh, &info); + ocelot_ifh_set_bypass(ifh, 1); + ocelot_ifh_set_dest(ifh, BIT(port)); + ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C); + ocelot_ifh_set_vid(ifh, skb_vlan_tag_get(skb)); + ocelot_ifh_set_rew_op(ifh, rew_op); for (i = 0; i < OCELOT_TAG_LEN / 4; i++) - ocelot_write_rix(ocelot, (__force u32)cpu_to_be32(ifh[i]), - QS_INJ_WR, grp); + ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp); count = DIV_ROUND_UP(skb->len, 4); last = skb->len % 4; diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index c485795c606b..db6b1a4c3926 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -32,15 +32,6 @@ #define OCELOT_PTP_QUEUE_SZ 128 -struct frame_info { - u32 len; - u16 port; - u16 vid; - u8 tag_type; - u16 rew_op; - u32 timestamp; /* rew_val */ -}; - struct ocelot_port_tc { bool block_shared; unsigned long offload_cnt; diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index b075dc13354a..fe0f8d6a32ce 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -4,6 +4,7 @@ * * Copyright (c) 2017 Microsemi Corporation */ +#include #include #include #include @@ -18,8 +19,6 @@ #include #include "ocelot.h" -#define IFH_EXTRACT_BITFIELD64(x, o, w) (((x) >> (o)) & GENMASK_ULL((w) - 1, 0)) - static const u32 ocelot_ana_regmap[] = { REG(ANA_ADVLEARN, 0x009000), REG(ANA_VLANMASK, 0x009004), @@ -532,29 +531,6 @@ static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops) return 0; } -static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info) -{ - u8 llen, wlen; - u64 ifh[2]; - - ifh[0] = be64_to_cpu(((__force __be64 *)_ifh)[0]); - ifh[1] = be64_to_cpu(((__force __be64 *)_ifh)[1]); - - wlen = IFH_EXTRACT_BITFIELD64(ifh[0], 7, 8); - llen = IFH_EXTRACT_BITFIELD64(ifh[0], 15, 6); - - info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80; - - info->timestamp = IFH_EXTRACT_BITFIELD64(ifh[0], 21, 32); - - info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4); - - info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16, 1); - info->vid = IFH_EXTRACT_BITFIELD64(ifh[1], 0, 12); - - return 0; -} - static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh, u32 *rval) { @@ -609,20 +585,20 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) struct ocelot_port_private *priv; struct ocelot_port *ocelot_port; u64 tod_in_ns, full_ts_in_ns; - struct frame_info info = {}; + u64 src_port, len, timestamp; struct net_device *dev; - u32 ifh[4], val, *buf; + u32 xfh[4], val, *buf; struct timespec64 ts; - int sz, len, buf_len; struct sk_buff *skb; + int sz, buf_len; for (i = 0; i < OCELOT_TAG_LEN / 4; i++) { - err = ocelot_rx_frame_word(ocelot, grp, true, &ifh[i]); + err = ocelot_rx_frame_word(ocelot, grp, true, &xfh[i]); if (err != 4) goto out; } - /* At this point the IFH was read correctly, so it is safe to + /* At this point the XFH was read correctly, so it is safe to * presume that there is no error. The err needs to be reset * otherwise a frame could come in CPU queue between the while * condition and the check for error later on. And in that case @@ -630,21 +606,23 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) */ err = 0; - ocelot_parse_ifh(ifh, &info); + ocelot_xfh_get_src_port(xfh, &src_port); + ocelot_xfh_get_len(xfh, &len); + ocelot_xfh_get_rew_val(xfh, ×tamp); - ocelot_port = ocelot->ports[info.port]; + ocelot_port = ocelot->ports[src_port]; priv = container_of(ocelot_port, struct ocelot_port_private, port); dev = priv->dev; - skb = netdev_alloc_skb(dev, info.len); + skb = netdev_alloc_skb(dev, len); if (unlikely(!skb)) { netdev_err(dev, "Unable to allocate sk_buff\n"); err = -ENOMEM; goto out; } - buf_len = info.len - ETH_FCS_LEN; + buf_len = len - ETH_FCS_LEN; buf = (u32 *)skb_put(skb, buf_len); len = 0; @@ -677,12 +655,12 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec); - if ((tod_in_ns & 0xffffffff) < info.timestamp) + if ((tod_in_ns & 0xffffffff) < timestamp) full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) | - info.timestamp; + timestamp; else full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) | - info.timestamp; + timestamp; shhwtstamps = skb_hwtstamps(skb); memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); @@ -692,7 +670,7 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) /* Everything we see on an interface that is in the HW bridge * has already been forwarded. */ - if (ocelot->bridge_mask & BIT(info.port)) + if (ocelot->bridge_mask & BIT(src_port)) skb->offload_fwd_mark = 1; skb->protocol = eth_type_trans(skb, dev); diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h new file mode 100644 index 000000000000..763dbbfaff7a --- /dev/null +++ b/include/linux/dsa/ocelot.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright 2019-2021 NXP Semiconductors + */ + +#ifndef _NET_DSA_TAG_OCELOT_H +#define _NET_DSA_TAG_OCELOT_H + +#include + +#define OCELOT_TAG_LEN 16 +#define OCELOT_SHORT_PREFIX_LEN 4 +#define OCELOT_LONG_PREFIX_LEN 16 +#define OCELOT_TOTAL_TAG_LEN (OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN) + +/* The CPU injection header and the CPU extraction header can have 3 types of + * prefixes: long, short and no prefix. The format of the header itself is the + * same in all 3 cases. + * + * Extraction with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | ff:ff:ff:ff:ff:ff | fe:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Extraction with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | extraction | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Extraction with no prefix: + * + * +------------+-------+ + * | extraction | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * + * Injection with long prefix: + * + * +-------------------+-------------------+------+------+------------+-------+ + * | any dmac | any smac | 8880 | 000a | injection | frame | + * | | | | | header | | + * +-------------------+-------------------+------+------+------------+-------+ + * 48 bits 48 bits 16 bits 16 bits 128 bits + * + * Injection with short prefix: + * + * +------+------+------------+-------+ + * | 8880 | 000a | injection | frame | + * | | | header | | + * +------+------+------------+-------+ + * 16 bits 16 bits 128 bits + * + * Injection with no prefix: + * + * +------------+-------+ + * | injection | frame | + * | header | | + * +------------+-------+ + * 128 bits + * + * The injection header looks like this (network byte order, bit 127 + * is part of lowest address byte in memory, bit 0 is part of highest + * address byte): + * + * +------+------+------+------+------+------+------+------+ + * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | RSV | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | RSV | DEST | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | DEST | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| + * +------+------+------+------+------+------+------+------+ + * 39: 32 | TFRM_TIMER | RSV | + * +------+------+------+------+------+------+------+------+ + * 31: 24 | RSV | DP | POP_CNT | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + * + * And the extraction header looks like this: + * + * +------+------+------+------+------+------+------+------+ + * 127:120 | RSV | REW_OP | + * +------+------+------+------+------+------+------+------+ + * 119:112 | REW_OP | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 111:104 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 103: 96 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 95: 88 | REW_VAL | + * +------+------+------+------+------+------+------+------+ + * 87: 80 | REW_VAL | LLEN | + * +------+------+------+------+------+------+------+------+ + * 79: 72 | LLEN | WLEN | + * +------+------+------+------+------+------+------+------+ + * 71: 64 | WLEN | RSV | + * +------+------+------+------+------+------+------+------+ + * 63: 56 | RSV | + * +------+------+------+------+------+------+------+------+ + * 55: 48 | RSV | + * +------+------+------+------+------+------+------+------+ + * 47: 40 | RSV | SRC_PORT | ACL_ID | + * +------+------+------+------+------+------+------+------+ + * 39: 32 | ACL_ID | RSV | SFLOW_ID | + * +------+------+------+------+------+------+------+------+ + * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | + * +------+------+------+------+------+------+------+------+ + * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| + * +------+------+------+------+------+------+------+------+ + * 15: 8 | PCP | DEI | VID | + * +------+------+------+------+------+------+------+------+ + * 7: 0 | VID | + * +------+------+------+------+------+------+------+------+ + */ + +static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) +{ + packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_len(void *extraction, u64 *len) +{ + u64 llen, wlen; + + packing(extraction, &llen, 84, 79, OCELOT_TAG_LEN, UNPACK, 0); + packing(extraction, &wlen, 78, 71, OCELOT_TAG_LEN, UNPACK, 0); + + *len = 60 * wlen + llen - 80; +} + +static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port) +{ + packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_qos_class(void *extraction, u64 *qos_class) +{ + packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_tag_type(void *extraction, u64 *tag_type) +{ + packing(extraction, tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_xfh_get_vlan_tci(void *extraction, u64 *vlan_tci) +{ + packing(extraction, vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0); +} + +static inline void ocelot_ifh_set_bypass(void *injection, u64 bypass) +{ + packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_rew_op(void *injection, u64 rew_op) +{ + packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_dest(void *injection, u64 dest) +{ + packing(injection, &dest, 67, 56, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_qos_class(void *injection, u64 qos_class) +{ + packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type) +{ + packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_vid(void *injection, u64 vid) +{ + packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0); +} + +#endif diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index a22aa944e921..11cae2e968b5 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -87,9 +87,6 @@ /* Source PGIDs, one per physical port */ #define PGID_SRC 80 -#define IFH_INJ_BYPASS BIT(31) -#define IFH_INJ_POP_CNT_DISABLE (3 << 28) - #define IFH_TAG_TYPE_C 0 #define IFH_TAG_TYPE_S 1 @@ -100,10 +97,6 @@ #define IFH_REW_OP_ORIGIN_PTP 0x5 #define OCELOT_NUM_TC 8 -#define OCELOT_TAG_LEN 16 -#define OCELOT_SHORT_PREFIX_LEN 4 -#define OCELOT_LONG_PREFIX_LEN 16 -#define OCELOT_TOTAL_TAG_LEN (OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN) #define OCELOT_SPEED_2500 0 #define OCELOT_SPEED_1000 1 diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 225b145fd131..8ce0b26f3520 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -1,138 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright 2019 NXP Semiconductors */ +#include #include -#include #include "dsa_priv.h" -/* The CPU injection header and the CPU extraction header can have 3 types of - * prefixes: long, short and no prefix. The format of the header itself is the - * same in all 3 cases. - * - * Extraction with long prefix: - * - * +-------------------+-------------------+------+------+------------+-------+ - * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame | - * | | | | | header | | - * +-------------------+-------------------+------+------+------------+-------+ - * 48 bits 48 bits 16 bits 16 bits 128 bits - * - * Extraction with short prefix: - * - * +------+------+------------+-------+ - * | 8880 | 000a | extraction | frame | - * | | | header | | - * +------+------+------------+-------+ - * 16 bits 16 bits 128 bits - * - * Extraction with no prefix: - * - * +------------+-------+ - * | extraction | frame | - * | header | | - * +------------+-------+ - * 128 bits - * - * - * Injection with long prefix: - * - * +-------------------+-------------------+------+------+------------+-------+ - * | any dmac | any smac | 8880 | 000a | injection | frame | - * | | | | | header | | - * +-------------------+-------------------+------+------+------------+-------+ - * 48 bits 48 bits 16 bits 16 bits 128 bits - * - * Injection with short prefix: - * - * +------+------+------------+-------+ - * | 8880 | 000a | injection | frame | - * | | | header | | - * +------+------+------------+-------+ - * 16 bits 16 bits 128 bits - * - * Injection with no prefix: - * - * +------------+-------+ - * | injection | frame | - * | header | | - * +------------+-------+ - * 128 bits - * - * The injection header looks like this (network byte order, bit 127 - * is part of lowest address byte in memory, bit 0 is part of highest - * address byte): - * - * +------+------+------+------+------+------+------+------+ - * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP| - * +------+------+------+------+------+------+------+------+ - * 119:112 | REW_OP | - * +------+------+------+------+------+------+------+------+ - * 111:104 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 103: 96 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 95: 88 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 87: 80 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 79: 72 | RSV | - * +------+------+------+------+------+------+------+------+ - * 71: 64 | RSV | DEST | - * +------+------+------+------+------+------+------+------+ - * 63: 56 | DEST | - * +------+------+------+------+------+------+------+------+ - * 55: 48 | RSV | - * +------+------+------+------+------+------+------+------+ - * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER| - * +------+------+------+------+------+------+------+------+ - * 39: 32 | TFRM_TIMER | RSV | - * +------+------+------+------+------+------+------+------+ - * 31: 24 | RSV | DP | POP_CNT | CPUQ | - * +------+------+------+------+------+------+------+------+ - * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| - * +------+------+------+------+------+------+------+------+ - * 15: 8 | PCP | DEI | VID | - * +------+------+------+------+------+------+------+------+ - * 7: 0 | VID | - * +------+------+------+------+------+------+------+------+ - * - * And the extraction header looks like this: - * - * +------+------+------+------+------+------+------+------+ - * 127:120 | RSV | REW_OP | - * +------+------+------+------+------+------+------+------+ - * 119:112 | REW_OP | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 111:104 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 103: 96 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 95: 88 | REW_VAL | - * +------+------+------+------+------+------+------+------+ - * 87: 80 | REW_VAL | LLEN | - * +------+------+------+------+------+------+------+------+ - * 79: 72 | LLEN | WLEN | - * +------+------+------+------+------+------+------+------+ - * 71: 64 | WLEN | RSV | - * +------+------+------+------+------+------+------+------+ - * 63: 56 | RSV | - * +------+------+------+------+------+------+------+------+ - * 55: 48 | RSV | - * +------+------+------+------+------+------+------+------+ - * 47: 40 | RSV | SRC_PORT | ACL_ID | - * +------+------+------+------+------+------+------+------+ - * 39: 32 | ACL_ID | RSV | SFLOW_ID | - * +------+------+------+------+------+------+------+------+ - * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ | - * +------+------+------+------+------+------+------+------+ - * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE| - * +------+------+------+------+------+------+------+------+ - * 15: 8 | PCP | DEI | VID | - * +------+------+------+------+------+------+------+------+ - * 7: 0 | VID | - * +------+------+------+------+------+------+------+------+ - */ - static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -142,7 +14,6 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port; u8 *prefix, *injection; - u64 qos_class, rew_op; ocelot_port = ocelot->ports[dp->index]; @@ -155,19 +26,19 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, /* Fix up the fields which are not statically determined * in the template */ - qos_class = skb->priority; - packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); + ocelot_ifh_set_qos_class(injection, skb->priority); /* TX timestamping was requested */ if (clone) { - rew_op = ocelot_port->ptp_cmd; + u64 rew_op = ocelot_port->ptp_cmd; + /* Retrieve timestamp ID populated inside skb->cb[0] of the * clone by ocelot_port_add_txtstamp_skb */ if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) rew_op |= clone->cb[0] << 3; - packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); + ocelot_ifh_set_rew_op(injection, rew_op); } return skb; @@ -208,10 +79,10 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, /* Remove from inet csum the extraction header */ skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN); - packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0); - packing(extraction, &vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0); + ocelot_xfh_get_src_port(extraction, &src_port); + ocelot_xfh_get_qos_class(extraction, &qos_class); + ocelot_xfh_get_tag_type(extraction, &tag_type); + ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) -- cgit v1.2.3 From 7c4bb540e9173c914c2091fdd9b6aee3c2a3e1e5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 14 Feb 2021 00:37:58 +0200 Subject: net: dsa: tag_ocelot: create separate tagger for Seville The ocelot tagger is a hot mess currently, it relies on memory initialized by the attached driver for basic frame transmission. This is against all that DSA tagging protocols stand for, which is that the transmission and reception of a DSA-tagged frame, the data path, should be independent from the switch control path, because the tag protocol is in principle hot-pluggable and reusable across switches (even if in practice it wasn't until very recently). But if another driver like dsa_loop wants to make use of tag_ocelot, it couldn't. This was done to have common code between Felix and Ocelot, which have one bit difference in the frame header format. Quoting from commit 67c2404922c2 ("net: dsa: felix: create a template for the DSA tags on xmit"): Other alternatives have been analyzed, such as: - Create a separate tag_seville.c: too much code duplication for just 1 bit field difference. - Create a separate DSA_TAG_PROTO_SEVILLE under tag_ocelot.c, just like tag_brcm.c, which would have a separate .xmit function. Again, too much code duplication for just 1 bit field difference. - Allocate the template from the init function of the tag_ocelot.c module, instead of from the driver: couldn't figure out a method of accessing the correct port template corresponding to the correct tagger in the .xmit function. The really interesting part is that Seville should have had its own tagging protocol defined - it is not compatible on the wire with Ocelot, even for that single bit. In principle, a packet generated by DSA_TAG_PROTO_OCELOT when booted on NXP LS1028A would look in a certain way, but when booted on NXP T1040 it would look differently. The reverse is also true: a packet generated by a Seville switch would be interpreted incorrectly by Wireshark if it was told it was generated by an Ocelot switch. Actually things are a bit more nuanced. If we concentrate only on the DSA tag, what I said above is true, but Ocelot/Seville also support an optional DSA tag prefix, which can be short or long, and it is possible to distinguish the two taggers based on an integer constant put in that prefix. Nonetheless, creating a separate tagger is still justified, since the tag prefix is optional, and without it, there is again no way to distinguish. Claiming backwards binary compatibility is a bit more tough, since I've already changed the format of tag_ocelot once, in commit 5124197ce58b ("net: dsa: tag_ocelot: use a short prefix on both ingress and egress"). Therefore I am not very concerned with treating this as a bugfix and backporting it to stable kernels (which would be another mess due to the fact that there would be lots of conflicts with the other DSA_TAG_PROTO* definitions). It's just simpler to say that the string values of the taggers have ABI value starting with kernel 5.12, which will be when the changing of tag protocol via /sys/class/net//dsa/tagging goes live. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 18 ++------- drivers/net/dsa/ocelot/felix.h | 1 - drivers/net/dsa/ocelot/felix_vsc9959.c | 26 ------------ drivers/net/dsa/ocelot/seville_vsc9953.c | 28 +------------ include/linux/dsa/ocelot.h | 10 +++++ include/net/dsa.h | 2 + net/dsa/tag_ocelot.c | 68 +++++++++++++++++++++++++------- 7 files changed, 70 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 60e0d354adea..7e1f6350139b 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -431,6 +431,7 @@ static int felix_set_tag_protocol(struct dsa_switch *ds, int cpu, int err; switch (proto) { + case DSA_TAG_PROTO_SEVILLE: case DSA_TAG_PROTO_OCELOT: err = felix_setup_tag_npi(ds, cpu); break; @@ -448,6 +449,7 @@ static void felix_del_tag_protocol(struct dsa_switch *ds, int cpu, enum dsa_tag_protocol proto) { switch (proto) { + case DSA_TAG_PROTO_SEVILLE: case DSA_TAG_PROTO_OCELOT: felix_teardown_tag_npi(ds, cpu); break; @@ -471,7 +473,8 @@ static int felix_change_tag_protocol(struct dsa_switch *ds, int cpu, enum dsa_tag_protocol old_proto = felix->tag_proto; int err; - if (proto != DSA_TAG_PROTO_OCELOT && + if (proto != DSA_TAG_PROTO_SEVILLE && + proto != DSA_TAG_PROTO_OCELOT && proto != DSA_TAG_PROTO_OCELOT_8021Q) return -EPROTONOSUPPORT; @@ -1003,7 +1006,6 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) for (port = 0; port < num_phys_ports; port++) { struct ocelot_port *ocelot_port; struct regmap *target; - u8 *template; ocelot_port = devm_kzalloc(ocelot->dev, sizeof(struct ocelot_port), @@ -1029,22 +1031,10 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) return PTR_ERR(target); } - template = devm_kzalloc(ocelot->dev, OCELOT_TOTAL_TAG_LEN, - GFP_KERNEL); - if (!template) { - dev_err(ocelot->dev, - "Failed to allocate memory for DSA tag\n"); - kfree(port_phy_modes); - return -ENOMEM; - } - ocelot_port->phy_mode = port_phy_modes[port]; ocelot_port->ocelot = ocelot; ocelot_port->target = target; - ocelot_port->xmit_template = template; ocelot->ports[port] = ocelot_port; - - felix->info->xmit_template_populate(ocelot, port); } kfree(port_phy_modes); diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 9d4459f2fffb..b2ea425c5803 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -34,7 +34,6 @@ struct felix_info { enum tc_setup_type type, void *type_data); void (*port_sched_speed_set)(struct ocelot *ocelot, int port, u32 speed); - void (*xmit_template_populate)(struct ocelot *ocelot, int port); }; extern const struct dsa_switch_ops felix_switch_ops; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index cacc6f9c0113..32b885fcaf90 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1339,31 +1339,6 @@ static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port, } } -static void vsc9959_xmit_template_populate(struct ocelot *ocelot, int port) -{ - struct ocelot_port *ocelot_port = ocelot->ports[port]; - u8 *template = ocelot_port->xmit_template; - u64 bypass, dest, src; - __be32 *prefix; - u8 *injection; - - /* Set the source port as the CPU port module and not the - * NPI port - */ - src = ocelot->num_phys_ports; - dest = BIT(port); - bypass = true; - - injection = template + OCELOT_SHORT_PREFIX_LEN; - prefix = (__be32 *)template; - - packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); - - *prefix = cpu_to_be32(0x8880000a); -} - static const struct felix_info felix_info_vsc9959 = { .target_io_res = vsc9959_target_io_res, .port_io_res = vsc9959_port_io_res, @@ -1386,7 +1361,6 @@ static const struct felix_info felix_info_vsc9959 = { .prevalidate_phy_mode = vsc9959_prevalidate_phy_mode, .port_setup_tc = vsc9959_port_setup_tc, .port_sched_speed_set = vsc9959_sched_speed_set, - .xmit_template_populate = vsc9959_xmit_template_populate, }; static irqreturn_t felix_irq_handler(int irq, void *data) diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index d7348ea4831e..84f93a874d50 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1165,31 +1165,6 @@ static void vsc9953_mdio_bus_free(struct ocelot *ocelot) mdiobus_unregister(felix->imdio); } -static void vsc9953_xmit_template_populate(struct ocelot *ocelot, int port) -{ - struct ocelot_port *ocelot_port = ocelot->ports[port]; - u8 *template = ocelot_port->xmit_template; - u64 bypass, dest, src; - __be32 *prefix; - u8 *injection; - - /* Set the source port as the CPU port module and not the - * NPI port - */ - src = ocelot->num_phys_ports; - dest = BIT(port); - bypass = true; - - injection = template + OCELOT_SHORT_PREFIX_LEN; - prefix = (__be32 *)template; - - packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &dest, 67, 57, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); - - *prefix = cpu_to_be32(0x88800005); -} - static const struct felix_info seville_info_vsc9953 = { .target_io_res = vsc9953_target_io_res, .port_io_res = vsc9953_port_io_res, @@ -1206,7 +1181,6 @@ static const struct felix_info seville_info_vsc9953 = { .mdio_bus_free = vsc9953_mdio_bus_free, .phylink_validate = vsc9953_phylink_validate, .prevalidate_phy_mode = vsc9953_prevalidate_phy_mode, - .xmit_template_populate = vsc9953_xmit_template_populate, }; static int seville_probe(struct platform_device *pdev) @@ -1246,7 +1220,7 @@ static int seville_probe(struct platform_device *pdev) ds->ops = &felix_switch_ops; ds->priv = ocelot; felix->ds = ds; - felix->tag_proto = DSA_TAG_PROTO_OCELOT; + felix->tag_proto = DSA_TAG_PROTO_SEVILLE; err = dsa_register_switch(ds); if (err) { diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 763dbbfaff7a..c6bc45ae5e03 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -195,6 +195,16 @@ static inline void ocelot_ifh_set_qos_class(void *injection, u64 qos_class) packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); } +static inline void seville_ifh_set_dest(void *injection, u64 dest) +{ + packing(injection, &dest, 67, 57, OCELOT_TAG_LEN, PACK, 0); +} + +static inline void ocelot_ifh_set_src(void *injection, u64 src) +{ + packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); +} + static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type) { packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0); diff --git a/include/net/dsa.h b/include/net/dsa.h index 74457aaffec7..b095ef114fe8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -48,6 +48,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_HELLCREEK_VALUE 18 #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 +#define DSA_TAG_PROTO_SEVILLE_VALUE 21 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -71,6 +72,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, + DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, }; struct packet_type; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index fe00519229d7..a7dd61c8e005 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -24,33 +24,52 @@ static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection, ocelot_ifh_set_rew_op(injection, rew_op); } -static struct sk_buff *ocelot_xmit(struct sk_buff *skb, - struct net_device *netdev) +static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, + __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); struct sk_buff *clone = DSA_SKB_CB(skb)->clone; struct dsa_switch *ds = dp->ds; - struct ocelot *ocelot = ds->priv; - struct ocelot_port *ocelot_port; - u8 *prefix, *injection; - - ocelot_port = ocelot->ports[dp->index]; + void *injection; + __be32 *prefix; injection = skb_push(skb, OCELOT_TAG_LEN); - prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); - memcpy(prefix, ocelot_port->xmit_template, OCELOT_TOTAL_TAG_LEN); - - /* Fix up the fields which are not statically determined - * in the template - */ + *prefix = ifh_prefix; + memset(injection, 0, OCELOT_TAG_LEN); + ocelot_ifh_set_bypass(injection, 1); + ocelot_ifh_set_src(injection, ds->num_ports); ocelot_ifh_set_qos_class(injection, skb->priority); /* TX timestamping was requested */ if (clone) ocelot_xmit_ptp(dp, injection, clone); + *ifh = injection; +} + +static struct sk_buff *ocelot_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + void *injection; + + ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection); + ocelot_ifh_set_dest(injection, BIT(dp->index)); + + return skb; +} + +static struct sk_buff *seville_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + void *injection; + + ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection); + seville_ifh_set_dest(injection, BIT(dp->index)); + return skb; } @@ -147,7 +166,26 @@ static const struct dsa_device_ops ocelot_netdev_ops = { .promisc_on_master = true, }; -MODULE_LICENSE("GPL v2"); +DSA_TAG_DRIVER(ocelot_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); -module_dsa_tag_driver(ocelot_netdev_ops); +static const struct dsa_device_ops seville_netdev_ops = { + .name = "seville", + .proto = DSA_TAG_PROTO_SEVILLE, + .xmit = seville_xmit, + .rcv = ocelot_rcv, + .overhead = OCELOT_TOTAL_TAG_LEN, + .promisc_on_master = true, +}; + +DSA_TAG_DRIVER(seville_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE); + +static struct dsa_tag_driver *ocelot_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(ocelot_netdev_ops), + &DSA_TAG_DRIVER_NAME(seville_netdev_ops), +}; + +module_dsa_tag_drivers(ocelot_tag_driver_array); + +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 924ee317f72459a49ac4130272c7d33063e60339 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 14 Feb 2021 00:37:59 +0200 Subject: net: mscc: ocelot: refactor ocelot_xtr_irq_handler into ocelot_xtr_poll Since the felix DSA driver will need to poll the CPU port module for extracted frames as well, let's create some common functions that read an Extraction Frame Header, and then an skb, from a CPU extraction group. We abuse the struct ocelot_ops :: port_to_netdev function a little bit, in order to retrieve the DSA port net_device or the ocelot switchdev net_device based on the source port information from the Extraction Frame Header, but it's all in the benefit of code simplification - netdev_alloc_skb needs it. Originally, the port_to_netdev method was intended for parsing act->dev from tc flower offload code. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 151 +++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_vsc7514.c | 140 +------------------------- include/soc/mscc/ocelot.h | 7 ++ 3 files changed, 163 insertions(+), 135 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 699b0c1c1780..981811ffcdae 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -629,6 +629,157 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) } EXPORT_SYMBOL(ocelot_get_txtstamp); +static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh, + u32 *rval) +{ + u32 bytes_valid, val; + + val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + if (val == XTR_NOT_READY) { + if (ifh) + return -EIO; + + do { + val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + } while (val == XTR_NOT_READY); + } + + switch (val) { + case XTR_ABORT: + return -EIO; + case XTR_EOF_0: + case XTR_EOF_1: + case XTR_EOF_2: + case XTR_EOF_3: + case XTR_PRUNED: + bytes_valid = XTR_VALID_BYTES(val); + val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + if (val == XTR_ESCAPE) + *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + else + *rval = val; + + return bytes_valid; + case XTR_ESCAPE: + *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); + + return 4; + default: + *rval = val; + + return 4; + } +} + +static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh) +{ + int i, err = 0; + + for (i = 0; i < OCELOT_TAG_LEN / 4; i++) { + err = ocelot_rx_frame_word(ocelot, grp, true, &xfh[i]); + if (err != 4) + return (err < 0) ? err : -EIO; + } + + return 0; +} + +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) +{ + struct skb_shared_hwtstamps *shhwtstamps; + u64 tod_in_ns, full_ts_in_ns; + u64 timestamp, src_port, len; + u32 xfh[OCELOT_TAG_LEN / 4]; + struct net_device *dev; + struct timespec64 ts; + struct sk_buff *skb; + int sz, buf_len; + u32 val, *buf; + int err; + + err = ocelot_xtr_poll_xfh(ocelot, grp, xfh); + if (err) + return err; + + ocelot_xfh_get_src_port(xfh, &src_port); + ocelot_xfh_get_len(xfh, &len); + ocelot_xfh_get_rew_val(xfh, ×tamp); + + if (WARN_ON(src_port >= ocelot->num_phys_ports)) + return -EINVAL; + + dev = ocelot->ops->port_to_netdev(ocelot, src_port); + if (!dev) + return -EINVAL; + + skb = netdev_alloc_skb(dev, len); + if (unlikely(!skb)) { + netdev_err(dev, "Unable to allocate sk_buff\n"); + return -ENOMEM; + } + + buf_len = len - ETH_FCS_LEN; + buf = (u32 *)skb_put(skb, buf_len); + + len = 0; + do { + sz = ocelot_rx_frame_word(ocelot, grp, false, &val); + if (sz < 0) { + err = sz; + goto out_free_skb; + } + *buf++ = val; + len += sz; + } while (len < buf_len); + + /* Read the FCS */ + sz = ocelot_rx_frame_word(ocelot, grp, false, &val); + if (sz < 0) { + err = sz; + goto out_free_skb; + } + + /* Update the statistics if part of the FCS was read before */ + len -= ETH_FCS_LEN - sz; + + if (unlikely(dev->features & NETIF_F_RXFCS)) { + buf = (u32 *)skb_put(skb, ETH_FCS_LEN); + *buf = val; + } + + if (ocelot->ptp) { + ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); + + tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec); + if ((tod_in_ns & 0xffffffff) < timestamp) + full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) | + timestamp; + else + full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) | + timestamp; + + shhwtstamps = skb_hwtstamps(skb); + memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); + shhwtstamps->hwtstamp = full_ts_in_ns; + } + + /* Everything we see on an interface that is in the HW bridge + * has already been forwarded. + */ + if (ocelot->bridge_mask & BIT(src_port)) + skb->offload_fwd_mark = 1; + + skb->protocol = eth_type_trans(skb, dev); + *nskb = skb; + + return 0; + +out_free_skb: + kfree_skb(skb); + return err; +} +EXPORT_SYMBOL(ocelot_xtr_poll_frame); + bool ocelot_can_inject(struct ocelot *ocelot, int grp) { u32 val = ocelot_read(ocelot, QS_INJ_STATUS); diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index fe0f8d6a32ce..47ee06832a51 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -531,153 +531,23 @@ static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops) return 0; } -static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh, - u32 *rval) -{ - u32 val; - u32 bytes_valid; - - val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - if (val == XTR_NOT_READY) { - if (ifh) - return -EIO; - - do { - val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - } while (val == XTR_NOT_READY); - } - - switch (val) { - case XTR_ABORT: - return -EIO; - case XTR_EOF_0: - case XTR_EOF_1: - case XTR_EOF_2: - case XTR_EOF_3: - case XTR_PRUNED: - bytes_valid = XTR_VALID_BYTES(val); - val = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - if (val == XTR_ESCAPE) - *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - else - *rval = val; - - return bytes_valid; - case XTR_ESCAPE: - *rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp); - - return 4; - default: - *rval = val; - - return 4; - } -} - static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) { struct ocelot *ocelot = arg; - int i = 0, grp = 0; - int err = 0; + int grp = 0, err; while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) { - struct skb_shared_hwtstamps *shhwtstamps; - struct ocelot_port_private *priv; - struct ocelot_port *ocelot_port; - u64 tod_in_ns, full_ts_in_ns; - u64 src_port, len, timestamp; - struct net_device *dev; - u32 xfh[4], val, *buf; - struct timespec64 ts; struct sk_buff *skb; - int sz, buf_len; - - for (i = 0; i < OCELOT_TAG_LEN / 4; i++) { - err = ocelot_rx_frame_word(ocelot, grp, true, &xfh[i]); - if (err != 4) - goto out; - } - - /* At this point the XFH was read correctly, so it is safe to - * presume that there is no error. The err needs to be reset - * otherwise a frame could come in CPU queue between the while - * condition and the check for error later on. And in that case - * the new frame is just removed and not processed. - */ - err = 0; - - ocelot_xfh_get_src_port(xfh, &src_port); - ocelot_xfh_get_len(xfh, &len); - ocelot_xfh_get_rew_val(xfh, ×tamp); - - ocelot_port = ocelot->ports[src_port]; - priv = container_of(ocelot_port, struct ocelot_port_private, - port); - dev = priv->dev; - skb = netdev_alloc_skb(dev, len); - - if (unlikely(!skb)) { - netdev_err(dev, "Unable to allocate sk_buff\n"); - err = -ENOMEM; - goto out; - } - buf_len = len - ETH_FCS_LEN; - buf = (u32 *)skb_put(skb, buf_len); - - len = 0; - do { - sz = ocelot_rx_frame_word(ocelot, grp, false, &val); - if (sz < 0) { - err = sz; - goto out; - } - *buf++ = val; - len += sz; - } while (len < buf_len); - - /* Read the FCS */ - sz = ocelot_rx_frame_word(ocelot, grp, false, &val); - if (sz < 0) { - err = sz; + err = ocelot_xtr_poll_frame(ocelot, grp, &skb); + if (err) goto out; - } - - /* Update the statistics if part of the FCS was read before */ - len -= ETH_FCS_LEN - sz; - - if (unlikely(dev->features & NETIF_F_RXFCS)) { - buf = (u32 *)skb_put(skb, ETH_FCS_LEN); - *buf = val; - } - - if (ocelot->ptp) { - ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); - - tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec); - if ((tod_in_ns & 0xffffffff) < timestamp) - full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) | - timestamp; - else - full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) | - timestamp; - - shhwtstamps = skb_hwtstamps(skb); - memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); - shhwtstamps->hwtstamp = full_ts_in_ns; - } - /* Everything we see on an interface that is in the HW bridge - * has already been forwarded. - */ - if (ocelot->bridge_mask & BIT(src_port)) - skb->offload_fwd_mark = 1; + skb->dev->stats.rx_bytes += skb->len; + skb->dev->stats.rx_packets++; - skb->protocol = eth_type_trans(skb, dev); if (!skb_defer_rx_timestamp(skb)) netif_rx(skb); - dev->stats.rx_bytes += len; - dev->stats.rx_packets++; } out: diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 11cae2e968b5..ded9ae1149bc 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -741,6 +741,7 @@ void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, bool ocelot_can_inject(struct ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb); +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb); #else @@ -755,6 +756,12 @@ static inline void ocelot_port_inject_frame(struct ocelot *ocelot, int port, { } +static inline int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, + struct sk_buff **skb) +{ + return -EIO; +} + #endif /* Hardware initialization */ -- cgit v1.2.3 From 0a6f17c6ae2116809a7b7eb6dd3eab59ef5460ef Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 14 Feb 2021 00:38:01 +0200 Subject: net: dsa: tag_ocelot_8021q: add support for PTP timestamping For TX timestamping, we use the felix_txtstamp method which is common with the regular (non-8021q) ocelot tagger. This method says that skb deferral is needed, prepares a timestamp request ID, and puts a clone of the skb in a queue waiting for the timestamp IRQ. felix_txtstamp is called by dsa_skb_tx_timestamp() just before the tagger's xmit method. In the tagger xmit, we divert the packets classified by dsa_skb_tx_timestamp() as PTP towards the MMIO-based injection registers, and we declare them as dead towards dsa_slave_xmit. If not PTP, we proceed with normal tag_8021q stuff. Then the timestamp IRQ fires, the clone queued up from felix_txtstamp is matched to the TX timestamp retrieved from the switch's FIFO based on the timestamp request ID, and the clone is delivered to the stack. On RX, thanks to the VCAP IS2 rule that redirects the frames with an EtherType for 1588 towards two destinations: - the CPU port module (for MMIO based extraction) and - if the "no XTR IRQ" workaround is in place, the dsa_8021q CPU port the relevant data path processing starts in the ptp_classify_raw BPF classifier installed by DSA in the RX data path (post tagger, which is completely unaware that it saw a PTP packet). This time we can't reuse the same implementation of .port_rxtstamp that also works with the default ocelot tagger. That is because felix_rxtstamp is given an skb with a freshly stripped DSA header, and it says "I don't need deferral for its RX timestamp, it's right in it, let me show you"; and it just points to the header right behind skb->data, from where it unpacks the timestamp and annotates the skb with it. The same thing cannot happen with tag_ocelot_8021q, because for one thing, the skb did not have an extraction frame header in the first place, but a VLAN tag with no timestamp information. So the code paths in felix_rxtstamp for the regular and 8021q tagger are completely independent. With tag_8021q, the timestamp must come from the packet's duplicate delivered to the CPU port module, but there is potentially complex logic to be handled [ and prone to reordering ] if we were to just start reading packets from the CPU port module, and try to match them to the one we received over Ethernet and which needs an RX timestamp. So we do something simple: we tell DSA "give me some time to think" (we request skb deferral by returning false from .port_rxtstamp) and we just drop the frame we got over Ethernet with no attempt to match it to anything - we just treat it as a notification that there's data to be processed from the CPU port module's queues. Then we proceed to read the packets from those, one by one, which we deliver up the stack, timestamped, using netif_rx - the same function that any driver would use anyway if it needed RX timestamp deferral. So the assumption is that we'll come across the PTP packet that triggered the CPU extraction notification eventually, but we don't know when exactly. Thanks to the VCAP IS2 trap/redirect rule and the exclusion of the CPU port module from the flooding replicators, only PTP frames should be present in the CPU port module's RX queues anyway. There is just one conflict between the VCAP IS2 trapping rule and the semantics of the BPF classifier. Namely, ptp_classify_raw() deems general messages as non-timestampable, but still, those are trapped to the CPU port module since they have an EtherType of ETH_P_1588. So, if the "no XTR IRQ" workaround is in place, we need to run another BPF classifier on the frames extracted over MMIO, to avoid duplicates being sent to the stack (once over Ethernet, once over MMIO). It doesn't look like it's possible to install VCAP IS2 rules based on keys extracted from the 1588 frame headers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 69 ++++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot.c | 7 +++ drivers/net/ethernet/mscc/ocelot_vsc7514.c | 3 +- include/soc/mscc/ocelot.h | 5 +++ net/dsa/tag_ocelot_8021q.c | 33 ++++++++++++++ 5 files changed, 115 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index b7d51e4ec792..336d93d03a9a 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -345,6 +346,15 @@ static int felix_setup_mmio_filtering(struct felix *felix) return ret; } + /* The ownership of the CPU port module's queues might have just been + * transferred to the tag_8021q tagger from the NPI-based tagger. + * So there might still be all sorts of crap in the queues. On the + * other hand, the MMIO-based matching of PTP frames is very brittle, + * so we need to be careful that there are no extra frames to be + * dequeued over MMIO, since we would never know to discard them. + */ + ocelot_drain_cpu_queue(ocelot, 0); + return 0; } @@ -1273,6 +1283,55 @@ static int felix_hwtstamp_set(struct dsa_switch *ds, int port, return ocelot_hwstamp_set(ocelot, port, ifr); } +static bool felix_check_xtr_pkt(struct ocelot *ocelot, unsigned int ptp_type) +{ + struct felix *felix = ocelot_to_felix(ocelot); + int err, grp = 0; + + if (felix->tag_proto != DSA_TAG_PROTO_OCELOT_8021Q) + return false; + + if (!felix->info->quirk_no_xtr_irq) + return false; + + if (ptp_type == PTP_CLASS_NONE) + return false; + + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) { + struct sk_buff *skb; + unsigned int type; + + err = ocelot_xtr_poll_frame(ocelot, grp, &skb); + if (err) + goto out; + + /* We trap to the CPU port module all PTP frames, but + * felix_rxtstamp() only gets called for event frames. + * So we need to avoid sending duplicate general + * message frames by running a second BPF classifier + * here and dropping those. + */ + __skb_push(skb, ETH_HLEN); + + type = ptp_classify_raw(skb); + + __skb_pull(skb, ETH_HLEN); + + if (type == PTP_CLASS_NONE) { + kfree_skb(skb); + continue; + } + + netif_rx(skb); + } + +out: + if (err < 0) + ocelot_drain_cpu_queue(ocelot, 0); + + return true; +} + static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type) { @@ -1283,6 +1342,16 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct timespec64 ts; u64 tstamp, val; + /* If the "no XTR IRQ" workaround is in use, tell DSA to defer this skb + * for RX timestamping. Then free it, and poll for its copy through + * MMIO in the CPU port module, and inject that into the stack from + * ocelot_xtr_poll(). + */ + if (felix_check_xtr_pkt(ocelot, type)) { + kfree_skb(skb); + return true; + } + ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tstamp = ktime_set(ts.tv_sec, ts.tv_nsec); diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 981811ffcdae..8d97c731e953 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -837,6 +837,13 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, } EXPORT_SYMBOL(ocelot_port_inject_frame); +void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) +{ + while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) + ocelot_read_rix(ocelot, QS_XTR_RD, grp); +} +EXPORT_SYMBOL(ocelot_drain_cpu_queue); + int ocelot_fdb_add(struct ocelot *ocelot, int port, const unsigned char *addr, u16 vid) { diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 47ee06832a51..4bd7e9d9ec61 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -552,8 +552,7 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) out: if (err < 0) - while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) - ocelot_read_rix(ocelot, QS_XTR_RD, grp); + ocelot_drain_cpu_queue(ocelot, 0); return IRQ_HANDLED; } diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index ded9ae1149bc..1f2d90976564 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -742,6 +742,7 @@ bool ocelot_can_inject(struct ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb); int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb); +void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp); #else @@ -762,6 +763,10 @@ static inline int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, return -EIO; } +static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) +{ +} + #endif /* Hardware initialization */ diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 190255d06bef..5f3e8e124a82 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -9,8 +9,36 @@ * that on egress */ #include +#include +#include #include "dsa_priv.h" +static struct sk_buff *ocelot_xmit_ptp(struct dsa_port *dp, + struct sk_buff *skb, + struct sk_buff *clone) +{ + struct ocelot *ocelot = dp->ds->priv; + struct ocelot_port *ocelot_port; + int port = dp->index; + u32 rew_op; + + if (!ocelot_can_inject(ocelot, 0)) + return NULL; + + ocelot_port = ocelot->ports[port]; + rew_op = ocelot_port->ptp_cmd; + + /* Retrieve timestamp ID populated inside skb->cb[0] of the + * clone by ocelot_port_add_txtstamp_skb + */ + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + rew_op |= clone->cb[0] << 3; + + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + + return NULL; +} + static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -18,6 +46,11 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + + /* TX timestamping was requested, so inject through MMIO */ + if (clone) + return ocelot_xmit_ptp(dp, skb, clone); return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); -- cgit v1.2.3 From dcbdf1350e3312c199dbc6a76f41cf8f67e8c09c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 13 Feb 2021 22:43:17 +0200 Subject: net: bridge: propagate extack through switchdev_port_attr_set The benefit is the ability to propagate errors from switchdev drivers for the SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING and SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL attributes. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/switchdev.h | 3 ++- net/bridge/br_mrp_switchdev.c | 4 ++-- net/bridge/br_multicast.c | 6 +++--- net/bridge/br_netlink.c | 2 +- net/bridge/br_private.h | 3 ++- net/bridge/br_stp.c | 4 ++-- net/bridge/br_switchdev.c | 6 ++++-- net/bridge/br_vlan.c | 13 +++++++------ net/switchdev/switchdev.c | 19 ++++++++++++------- 9 files changed, 35 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 25d9e4570934..195f62672cc4 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -247,7 +247,8 @@ switchdev_notifier_info_to_extack(const struct switchdev_notifier_info *info) void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr); + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack); int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack); diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 75a7e8d0a268..3c9a4abcf4ee 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -178,7 +178,7 @@ int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state) }; int err; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload MRP state on port %u(%s)\n", (unsigned int)p->port_no, p->dev->name); @@ -196,7 +196,7 @@ int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, }; int err; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index bf10ef5bbcd9..9d265447d654 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1381,7 +1381,7 @@ static void br_mc_router_state_change(struct net_bridge *p, .u.mrouter = is_mc_router, }; - switchdev_port_attr_set(p->dev, &attr); + switchdev_port_attr_set(p->dev, &attr, NULL); } static void br_multicast_local_router_expired(struct timer_list *t) @@ -1602,7 +1602,7 @@ static void br_mc_disabled_update(struct net_device *dev, bool value) .u.mc_disabled = !value, }; - switchdev_port_attr_set(dev, &attr); + switchdev_port_attr_set(dev, &attr, NULL); } int br_multicast_add_port(struct net_bridge_port *port) @@ -2645,7 +2645,7 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, .u.mrouter = is_mc_router, }; - switchdev_port_attr_set(p->dev, &attr); + switchdev_port_attr_set(p->dev, &attr, NULL); } /* diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3f5dc6fcc980..f2b1343f8332 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1221,7 +1221,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_VLAN_PROTOCOL]) { __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); - err = __br_vlan_set_proto(br, vlan_proto); + err = __br_vlan_set_proto(br, vlan_proto, extack); if (err) return err; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a8d483325476..da71e71fcddc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1087,7 +1087,8 @@ struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); -int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, + struct netlink_ext_ack *extack); int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index a3a5745660dd..21c6781906aa 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -43,7 +43,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) return; p->state = state; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); @@ -591,7 +591,7 @@ int __set_ageing_time(struct net_device *dev, unsigned long t) }; int err; - err = switchdev_port_attr_set(dev, &attr); + err = switchdev_port_attr_set(dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 184cf4c8b06d..b89503832fcc 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -96,9 +96,11 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; - err = switchdev_port_attr_set(p->dev, &attr); + err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { - NL_SET_ERR_MSG_MOD(extack, "error setting offload flag on port"); + if (extack && !extack->_msg) + NL_SET_ERR_MSG_MOD(extack, + "error setting offload flag on port"); return err; } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index c4a51095850a..8829f621b8ec 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -820,7 +820,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; - err = switchdev_port_attr_set(br->dev, &attr); + err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; @@ -850,7 +850,8 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) } EXPORT_SYMBOL_GPL(br_vlan_get_proto); -int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, + struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, @@ -867,7 +868,7 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) if (br->vlan_proto == proto) return 0; - err = switchdev_port_attr_set(br->dev, &attr); + err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; @@ -897,7 +898,7 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) err_filt: attr.u.vlan_protocol = ntohs(oldproto); - switchdev_port_attr_set(br->dev, &attr); + switchdev_port_attr_set(br->dev, &attr, NULL); list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) vlan_vid_del(p->dev, proto, vlan->vid); @@ -917,7 +918,7 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val, if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; - return __br_vlan_set_proto(br, htons(val)); + return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) @@ -1165,7 +1166,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) if (!vg) goto out; - ret = switchdev_port_attr_set(p->dev, &attr); + ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 0b84f076591e..89a36db47ab4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -100,7 +100,8 @@ static int switchdev_deferred_enqueue(struct net_device *dev, static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { int err; int rc; @@ -111,7 +112,7 @@ static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, }; rc = call_switchdev_blocking_notifiers(nt, dev, - &attr_info.info, NULL); + &attr_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!attr_info.handled); @@ -125,9 +126,11 @@ static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, } static int switchdev_port_attr_set_now(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { - return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr); + return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, + extack); } static void switchdev_port_attr_set_deferred(struct net_device *dev, @@ -136,7 +139,7 @@ static void switchdev_port_attr_set_deferred(struct net_device *dev, const struct switchdev_attr *attr = data; int err; - err = switchdev_port_attr_set_now(dev, attr); + err = switchdev_port_attr_set_now(dev, attr, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); @@ -156,17 +159,19 @@ static int switchdev_port_attr_set_defer(struct net_device *dev, * * @dev: port device * @attr: attribute to set + * @extack: netlink extended ack, for error message propagation * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { if (attr->flags & SWITCHDEV_F_DEFER) return switchdev_port_attr_set_defer(dev, attr); ASSERT_RTNL(); - return switchdev_port_attr_set_now(dev, attr); + return switchdev_port_attr_set_now(dev, attr, extack); } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); -- cgit v1.2.3 From 31046a5fd92c57d99e8861f3dc56a2584787b473 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 13 Feb 2021 22:43:18 +0200 Subject: net: dsa: propagate extack to .port_vlan_add Allow drivers to communicate their restrictions to user space directly, instead of printing to the kernel log. Where the conversion would have been lossy and things like VLAN ID could no longer be conveyed (due to the lack of support for printf format specifier in netlink extack), I chose to keep the messages in full form to the kernel log only, and leave it up to individual driver maintainers to move more messages to extack. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 3 ++- drivers/net/dsa/b53/b53_priv.h | 3 ++- drivers/net/dsa/bcm_sf2_cfp.c | 2 +- drivers/net/dsa/dsa_loop.c | 3 ++- drivers/net/dsa/hirschmann/hellcreek.c | 12 ++++++++---- drivers/net/dsa/lantiq_gswip.c | 12 ++++++++---- drivers/net/dsa/microchip/ksz8795.c | 3 ++- drivers/net/dsa/microchip/ksz9477.c | 7 ++++--- drivers/net/dsa/mt7530.c | 3 ++- drivers/net/dsa/mv88e6xxx/chip.c | 3 ++- drivers/net/dsa/ocelot/felix.c | 3 ++- drivers/net/dsa/qca8k.c | 3 ++- drivers/net/dsa/realtek-smi-core.h | 3 ++- drivers/net/dsa/rtl8366.c | 11 ++++++++--- drivers/net/dsa/sja1105/sja1105_main.c | 6 ++++-- include/net/dsa.h | 3 ++- net/dsa/dsa_priv.h | 4 +++- net/dsa/port.c | 4 +++- net/dsa/slave.c | 25 ++++++++++++++++++------- net/dsa/switch.c | 3 ++- 20 files changed, 79 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 72c75c7bdb65..98cc051e513e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1444,7 +1444,8 @@ static int b53_vlan_prepare(struct dsa_switch *ds, int port, } int b53_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct b53_device *dev = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index ae72ef46b0b6..fc5d6fddb3fe 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -348,7 +348,8 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, bool tx_pause, bool rx_pause); int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); int b53_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int b53_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); int b53_fdb_add(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 178218cf73a3..a7e2fcf2df2c 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -891,7 +891,7 @@ static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port, else vlan.flags = 0; - ret = ds->ops->port_vlan_add(ds, port_num, &vlan); + ret = ds->ops->port_vlan_add(ds, port_num, &vlan, NULL); if (ret) return ret; } diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 8c283f59158b..e55b63a7e907 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -199,7 +199,8 @@ static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port, } static int dsa_loop_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index f984ca75a71f..5816ef922e55 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -341,7 +341,8 @@ static u16 hellcreek_private_vid(int port) } static int hellcreek_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; int i; @@ -358,8 +359,10 @@ static int hellcreek_vlan_prepare(struct dsa_switch *ds, int port, if (!dsa_is_user_port(ds, i)) continue; - if (vlan->vid == restricted_vid) + if (vlan->vid == restricted_vid) { + NL_SET_ERR_MSG_MOD(extack, "VID restricted by driver"); return -EBUSY; + } } return 0; @@ -445,14 +448,15 @@ static void hellcreek_unapply_vlan(struct hellcreek *hellcreek, int port, } static int hellcreek_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct hellcreek *hellcreek = ds->priv; int err; - err = hellcreek_vlan_prepare(ds, port, vlan); + err = hellcreek_vlan_prepare(ds, port, vlan, extack); if (err) return err; diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 9fec97773a15..174ca3a484a0 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1128,7 +1128,8 @@ static void gswip_port_bridge_leave(struct dsa_switch *ds, int port, } static int gswip_port_vlan_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; @@ -1163,15 +1164,18 @@ static int gswip_port_vlan_prepare(struct dsa_switch *ds, int port, } } - if (idx == -1) + if (idx == -1) { + NL_SET_ERR_MSG_MOD(extack, "No slot in VLAN table"); return -ENOSPC; + } } return 0; } static int gswip_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct gswip_priv *priv = ds->priv; struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; @@ -1179,7 +1183,7 @@ static int gswip_port_vlan_add(struct dsa_switch *ds, int port, bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; int err; - err = gswip_port_vlan_prepare(ds, port, vlan); + err = gswip_port_vlan_prepare(ds, port, vlan, extack); if (err) return err; diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index c87d445b30fd..1e27a3e58141 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -793,7 +793,8 @@ static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port, } static int ksz8795_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 00e38c8e0d01..772e34d5b6b8 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -511,7 +511,8 @@ static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port, } static int ksz9477_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; u32 vlan_table[3]; @@ -520,7 +521,7 @@ static int ksz9477_port_vlan_add(struct dsa_switch *ds, int port, err = ksz9477_get_vlan_table(dev, vlan->vid, vlan_table); if (err) { - dev_dbg(dev->dev, "Failed to get vlan table\n"); + NL_SET_ERR_MSG_MOD(extack, "Failed to get vlan table"); return err; } @@ -535,7 +536,7 @@ static int ksz9477_port_vlan_add(struct dsa_switch *ds, int port, err = ksz9477_set_vlan_table(dev, vlan->vid, vlan_table); if (err) { - dev_dbg(dev->dev, "Failed to set vlan table\n"); + NL_SET_ERR_MSG_MOD(extack, "Failed to set vlan table"); return err; } diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index eb13ba79dd01..c089cd48e65d 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1483,7 +1483,8 @@ mt7530_hw_vlan_update(struct mt7530_priv *priv, u16 vid, static int mt7530_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 0ef1fadfec68..d46f0c096c97 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1982,7 +1982,8 @@ static int mv88e6xxx_port_vlan_join(struct mv88e6xxx_chip *chip, int port, } static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 336d93d03a9a..3951189e5a9a 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -789,7 +789,8 @@ static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) } static int felix_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct ocelot *ocelot = ds->priv; u16 flags = vlan->flags; diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 6127823d6c2e..73978e7e85cd 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1313,7 +1313,8 @@ qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) static int qca8k_port_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; diff --git a/drivers/net/dsa/realtek-smi-core.h b/drivers/net/dsa/realtek-smi-core.h index 26376b052594..93a3e05a6f71 100644 --- a/drivers/net/dsa/realtek-smi-core.h +++ b/drivers/net/dsa/realtek-smi-core.h @@ -133,7 +133,8 @@ int rtl8366_init_vlan(struct realtek_smi *smi); int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); int rtl8366_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int rtl8366_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void rtl8366_get_strings(struct dsa_switch *ds, int port, u32 stringset, diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index 3b24f2e13200..76303a77aa82 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -375,7 +375,8 @@ int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) EXPORT_SYMBOL_GPL(rtl8366_vlan_filtering); int rtl8366_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { bool untagged = !!(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED); bool pvid = !!(vlan->flags & BRIDGE_VLAN_INFO_PVID); @@ -384,16 +385,20 @@ int rtl8366_vlan_add(struct dsa_switch *ds, int port, u32 untag = 0; int ret; - if (!smi->ops->is_vlan_valid(smi, vlan->vid)) + if (!smi->ops->is_vlan_valid(smi, vlan->vid)) { + NL_SET_ERR_MSG_MOD(extack, "VLAN ID not valid"); return -EINVAL; + } /* Enable VLAN in the hardware * FIXME: what's with this 4k business? * Just rtl8366_enable_vlan() seems inconclusive. */ ret = rtl8366_enable_vlan4k(smi, true); - if (ret) + if (ret) { + NL_SET_ERR_MSG_MOD(extack, "Failed to enable VLAN 4K"); return ret; + } dev_info(smi->dev, "add VLAN %d on port %d, %s, %s\n", vlan->vid, port, untagged ? "untagged" : "tagged", diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 1dad94540cc9..92c7cea3ce6e 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2795,7 +2795,8 @@ static int sja1105_vlan_del_one(struct dsa_switch *ds, int port, u16 vid, } static int sja1105_vlan_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct sja1105_private *priv = ds->priv; bool vlan_table_changed = false; @@ -2807,7 +2808,8 @@ static int sja1105_vlan_add(struct dsa_switch *ds, int port, */ if (priv->vlan_state != SJA1105_VLAN_FILTERING_FULL && vid_is_dsa_8021q(vlan->vid)) { - dev_err(ds->dev, "Range 1024-3071 reserved for dsa_8021q operation\n"); + NL_SET_ERR_MSG_MOD(extack, + "Range 1024-3071 reserved for dsa_8021q operation"); return -EBUSY; } diff --git a/include/net/dsa.h b/include/net/dsa.h index b095ef114fe8..01bd154798c5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -643,7 +643,8 @@ struct dsa_switch_ops { int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering); int (*port_vlan_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); /* diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f5949b39f6f7..17a9f82db937 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -75,6 +75,7 @@ struct dsa_notifier_vlan_info { const struct switchdev_obj_port_vlan *vlan; int sw_index; int port; + struct netlink_ext_ack *extack; }; /* DSA_NOTIFIER_MTU */ @@ -192,7 +193,8 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, struct netlink_ext_ack *extack); int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan); + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); int dsa_port_link_register_of(struct dsa_port *dp); diff --git a/net/dsa/port.c b/net/dsa/port.c index 80e6471a7a5c..03ecefe1064a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -535,12 +535,14 @@ int dsa_port_mdb_del(const struct dsa_port *dp, } int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan) + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) { struct dsa_notifier_vlan_info info = { .sw_index = dp->ds->index, .port = dp->index, .vlan = vlan, + .extack = extack, }; return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8c9a41a7209a..9ec487b63e13 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -357,11 +357,14 @@ static int dsa_slave_vlan_add(struct net_device *dev, rcu_read_lock(); err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan); rcu_read_unlock(); - if (err) + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Port already has a VLAN upper with this VID"); return err; + } } - err = dsa_port_vlan_add(dp, &vlan); + err = dsa_port_vlan_add(dp, &vlan, extack); if (err) return err; @@ -371,7 +374,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; - err = dsa_port_vlan_add(dp->cpu_dp, &vlan); + err = dsa_port_vlan_add(dp->cpu_dp, &vlan, extack); if (err) return err; @@ -1287,17 +1290,25 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; + struct netlink_ext_ack extack = {0}; int ret; /* User port... */ - ret = dsa_port_vlan_add(dp, &vlan); - if (ret) + ret = dsa_port_vlan_add(dp, &vlan, &extack); + if (ret) { + if (extack._msg) + netdev_err(dev, "%s\n", extack._msg); return ret; + } /* And CPU port... */ - ret = dsa_port_vlan_add(dp->cpu_dp, &vlan); - if (ret) + ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &extack); + if (ret) { + if (extack._msg) + netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, + extack._msg); return ret; + } return vlan_vid_add(master, proto, vid); } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 1906179e59f7..c82d201181a5 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -291,7 +291,8 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, for (port = 0; port < ds->num_ports; port++) { if (dsa_switch_vlan_match(ds, port, info)) { - err = ds->ops->port_vlan_add(ds, port, info->vlan); + err = ds->ops->port_vlan_add(ds, port, info->vlan, + info->extack); if (err) return err; } -- cgit v1.2.3 From 89153ed6ebc14879b04686f0e3f3066b1b6bef05 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 13 Feb 2021 22:43:19 +0200 Subject: net: dsa: propagate extack to .port_vlan_filtering Some drivers can't dynamically change the VLAN filtering option, or impose some restrictions, it would be nice to propagate this info through netlink instead of printing it to a kernel log that might never be read. Also netlink extack includes the module that emitted the message, which means that it's easier to figure out which ones are driver-generated errors as opposed to command misuse. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 3 ++- drivers/net/dsa/b53/b53_priv.h | 3 ++- drivers/net/dsa/dsa_loop.c | 3 ++- drivers/net/dsa/hirschmann/hellcreek.c | 3 ++- drivers/net/dsa/lantiq_gswip.c | 10 +++++++--- drivers/net/dsa/microchip/ksz8795.c | 3 ++- drivers/net/dsa/microchip/ksz9477.c | 3 ++- drivers/net/dsa/mt7530.c | 4 ++-- drivers/net/dsa/mv88e6xxx/chip.c | 3 ++- drivers/net/dsa/ocelot/felix.c | 3 ++- drivers/net/dsa/qca8k.c | 3 ++- drivers/net/dsa/realtek-smi-core.h | 4 ++-- drivers/net/dsa/rtl8366.c | 3 ++- drivers/net/dsa/sja1105/sja1105.h | 3 ++- drivers/net/dsa/sja1105/sja1105_devlink.c | 2 +- drivers/net/dsa/sja1105/sja1105_main.c | 9 +++++---- include/net/dsa.h | 3 ++- net/dsa/dsa_priv.h | 3 ++- net/dsa/port.c | 18 +++++++++++------- net/dsa/slave.c | 3 ++- net/dsa/switch.c | 6 +++++- 21 files changed, 61 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 98cc051e513e..ae86ded1e2a1 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1409,7 +1409,8 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_phylink_mac_link_up); -int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) +int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct b53_device *dev = ds->priv; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index fc5d6fddb3fe..faf983fbca82 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -346,7 +346,8 @@ void b53_phylink_mac_link_up(struct dsa_switch *ds, int port, struct phy_device *phydev, int speed, int duplex, bool tx_pause, bool rx_pause); -int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering); +int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack); int b53_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index e55b63a7e907..bfdf3324aac3 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -190,7 +190,8 @@ static void dsa_loop_port_stp_state_set(struct dsa_switch *ds, int port, } static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { dev_dbg(ds->dev, "%s: port: %d, vlan_filtering: %d\n", __func__, port, vlan_filtering); diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 5816ef922e55..463137c39db2 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -875,7 +875,8 @@ static int hellcreek_fdb_dump(struct dsa_switch *ds, int port, } static int hellcreek_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 174ca3a484a0..52e865a3912c 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -727,14 +727,18 @@ static int gswip_pce_load_microcode(struct gswip_priv *priv) } static int gswip_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct net_device *bridge = dsa_to_port(ds, port)->bridge_dev; struct gswip_priv *priv = ds->priv; /* Do not allow changing the VLAN filtering options while in bridge */ - if (bridge && !!(priv->port_vlan_filter & BIT(port)) != vlan_filtering) + if (bridge && !!(priv->port_vlan_filter & BIT(port)) != vlan_filtering) { + NL_SET_ERR_MSG_MOD(extack, + "Dynamic toggling of vlan_filtering not supported"); return -EIO; + } if (vlan_filtering) { /* Use port based VLAN tag */ @@ -773,7 +777,7 @@ static int gswip_setup(struct dsa_switch *ds) /* disable port fetch/store dma on all ports */ for (i = 0; i < priv->hw_info->max_ports; i++) { gswip_port_disable(ds, i); - gswip_port_vlan_filtering(ds, i, false); + gswip_port_vlan_filtering(ds, i, false, NULL); } /* enable Switch */ diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 1e27a3e58141..b4b7de63ca79 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -783,7 +783,8 @@ static void ksz8795_flush_dyn_mac_table(struct ksz_device *dev, int port) } static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port, - bool flag) + bool flag, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 772e34d5b6b8..55e5d479acce 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -493,7 +493,8 @@ static void ksz9477_flush_dyn_mac_table(struct ksz_device *dev, int port) } static int ksz9477_port_vlan_filtering(struct dsa_switch *ds, int port, - bool flag) + bool flag, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c089cd48e65d..c17de2bcf2fe 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1376,8 +1376,8 @@ mt7530_vlan_cmd(struct mt7530_priv *priv, enum mt7530_vlan_cmd cmd, u16 vid) } static int -mt7530_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) +mt7530_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { if (vlan_filtering) { /* The port is being kept as VLAN-unaware port when bridge is diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index d46f0c096c97..903d619e08ed 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1600,7 +1600,8 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, } static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; u16 mode = vlan_filtering ? MV88E6XXX_PORT_CTL2_8021Q_MODE_SECURE : diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 3951189e5a9a..800f27d65c6c 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -781,7 +781,8 @@ static int felix_vlan_prepare(struct dsa_switch *ds, int port, flags & BRIDGE_VLAN_INFO_UNTAGGED); } -static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) +static int felix_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, + struct netlink_ext_ack *extack) { struct ocelot *ocelot = ds->priv; diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 73978e7e85cd..cdaf9f85a2cb 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1294,7 +1294,8 @@ qca8k_port_fdb_dump(struct dsa_switch *ds, int port, } static int -qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) +qca8k_port_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct qca8k_priv *priv = ds->priv; diff --git a/drivers/net/dsa/realtek-smi-core.h b/drivers/net/dsa/realtek-smi-core.h index 93a3e05a6f71..fcf465f7f922 100644 --- a/drivers/net/dsa/realtek-smi-core.h +++ b/drivers/net/dsa/realtek-smi-core.h @@ -130,8 +130,8 @@ int rtl8366_enable_vlan4k(struct realtek_smi *smi, bool enable); int rtl8366_enable_vlan(struct realtek_smi *smi, bool enable); int rtl8366_reset_vlan(struct realtek_smi *smi); int rtl8366_init_vlan(struct realtek_smi *smi); -int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, - bool vlan_filtering); +int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack); int rtl8366_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index 76303a77aa82..75897a369096 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -340,7 +340,8 @@ int rtl8366_init_vlan(struct realtek_smi *smi) } EXPORT_SYMBOL_GPL(rtl8366_init_vlan); -int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) +int rtl8366_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct realtek_smi *smi = ds->priv; struct rtl8366_vlan_4k vlan4k; diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 15a0893d0ff1..90f0f6f3124e 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -247,7 +247,8 @@ enum sja1105_reset_reason { int sja1105_static_config_reload(struct sja1105_private *priv, enum sja1105_reset_reason reason); -int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled); +int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, + struct netlink_ext_ack *extack); void sja1105_frame_memory_partitioning(struct sja1105_private *priv); /* From sja1105_devlink.c */ diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c index b4bf1b10e66c..b6a4a16b8c7e 100644 --- a/drivers/net/dsa/sja1105/sja1105_devlink.c +++ b/drivers/net/dsa/sja1105/sja1105_devlink.c @@ -143,7 +143,7 @@ static int sja1105_best_effort_vlan_filtering_set(struct sja1105_private *priv, dp = dsa_to_port(ds, port); vlan_filtering = dsa_port_is_vlan_filtering(dp); - rc = sja1105_vlan_filtering(ds, port, vlan_filtering); + rc = sja1105_vlan_filtering(ds, port, vlan_filtering, NULL); if (rc) break; } diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 92c7cea3ce6e..0818a67a7b2d 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2639,7 +2639,8 @@ out: * which can only be partially reconfigured at runtime (and not the TPID). * So a switch reset is required. */ -int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) +int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled, + struct netlink_ext_ack *extack) { struct sja1105_l2_lookup_params_entry *l2_lookup_params; struct sja1105_general_params_entry *general_params; @@ -2653,8 +2654,8 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) list_for_each_entry(rule, &priv->flow_block.rules, list) { if (rule->type == SJA1105_RULE_VL) { - dev_err(ds->dev, - "Cannot change VLAN filtering with active VL rules\n"); + NL_SET_ERR_MSG_MOD(extack, + "Cannot change VLAN filtering with active VL rules"); return -EBUSY; } } @@ -2736,7 +2737,7 @@ int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) rc = sja1105_static_config_reload(priv, SJA1105_VLAN_FILTERING); if (rc) - dev_err(ds->dev, "Failed to change VLAN Ethertype\n"); + NL_SET_ERR_MSG_MOD(extack, "Failed to change VLAN Ethertype"); /* Switch port identification based on 802.1Q is only passable * if we are not under a vlan_filtering bridge. So make sure diff --git a/include/net/dsa.h b/include/net/dsa.h index 01bd154798c5..68f8159564a3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -641,7 +641,8 @@ struct dsa_switch_ops { * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, - bool vlan_filtering); + bool vlan_filtering, + struct netlink_ext_ack *extack); int (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 17a9f82db937..e9d1e76c42ba 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -170,7 +170,8 @@ int dsa_port_lag_change(struct dsa_port *dp, int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering); +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct netlink_ext_ack *extack); bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, diff --git a/net/dsa/port.c b/net/dsa/port.c index 03ecefe1064a..14a1d0d77657 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -294,7 +294,8 @@ void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) /* Must be called under rcu_read_lock() */ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, - bool vlan_filtering) + bool vlan_filtering, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; int err, i; @@ -324,8 +325,8 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { - dev_err(ds->dev, "Must remove upper %s first\n", - upper_dev->name); + NL_SET_ERR_MSG_MOD(extack, + "Must first remove VLAN uppers having VIDs also present in bridge"); return false; } } @@ -351,14 +352,16 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, if (other_bridge == dp->bridge_dev) continue; if (br_vlan_enabled(other_bridge) != vlan_filtering) { - dev_err(ds->dev, "VLAN filtering is a global setting\n"); + NL_SET_ERR_MSG_MOD(extack, + "VLAN filtering is a global setting"); return false; } } return true; } -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; bool apply; @@ -372,7 +375,7 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) * dsa_slave_switchdev_event(). */ rcu_read_lock(); - apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering); + apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack); rcu_read_unlock(); if (!apply) return -EINVAL; @@ -380,7 +383,8 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering) if (dsa_port_is_vlan_filtering(dp) == vlan_filtering) return 0; - err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering); + err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering, + extack); if (err) return err; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9ec487b63e13..5ecb43a1b6e0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -286,7 +286,8 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = dsa_port_set_state(dp, attr->u.stp_state); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: - ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering); + ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, + extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: ret = dsa_port_ageing_time(dp, attr->u.ageing_time); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index c82d201181a5..db2a9b221988 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -106,6 +106,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, { bool unset_vlan_filtering = br_vlan_enabled(info->br); struct dsa_switch_tree *dst = ds->dst; + struct netlink_ext_ack extack = {0}; int err, i; if (dst->index == info->tree_index && ds->index == info->sw_index && @@ -137,7 +138,10 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, } if (unset_vlan_filtering) { err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false); + false, &extack); + if (extack._msg) + dev_err(ds->dev, "port %d: %s\n", info->port, + extack._msg); if (err && err != EOPNOTSUPP) return err; } -- cgit v1.2.3 From 6001a930ce0378b62210d4f83583fc88a903d89d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Feb 2021 12:28:07 +0100 Subject: netfilter: nftables: introduce table ownership A userspace daemon like firewalld might need to monitor for netlink updates to detect its ruleset removal by the (global) flush ruleset command to ensure ruleset persistency. This adds extra complexity from userspace and, for some little time, the firewall policy is not in place. This patch adds the NFT_TABLE_F_OWNER flag which allows a userspace program to own the table that creates in exclusivity. Tables that are owned... - can only be updated and removed by the owner, non-owners hit EPERM if they try to update it or remove it. - are destroyed when the owner closes the netlink socket or the process is gone (implicit netlink socket closure). - are skipped by the global flush ruleset command. - are listed in the global ruleset. The userspace process that sets on the NFT_TABLE_F_OWNER flag need to leave open the netlink socket. A new NFTA_TABLE_OWNER netlink attribute specifies the netlink port ID to identify the owner from userspace. This patch also updates error reporting when an unknown table flag is specified to change it from EINVAL to EOPNOTSUPP given that EINVAL is usually reserved to report for malformed netlink messages to userspace. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ++ include/uapi/linux/netfilter/nf_tables.h | 5 + net/netfilter/nf_tables_api.c | 163 ++++++++++++++++++++++--------- 3 files changed, 128 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 80bc2e8282ae..fdec57d862b7 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1106,11 +1106,17 @@ struct nft_table { u16 family:6, flags:8, genmask:2; + u32 nlpid; char *name; u16 udlen; u8 *udata; }; +static inline bool nft_table_has_owner(const struct nft_table *table) +{ + return table->flags & NFT_TABLE_F_OWNER; +} + static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b1633e7ba529..79bab7a36b30 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -164,7 +164,10 @@ enum nft_hook_attributes { */ enum nft_table_flags { NFT_TABLE_F_DORMANT = 0x1, + NFT_TABLE_F_OWNER = 0x2, }; +#define NFT_TABLE_F_MASK (NFT_TABLE_F_DORMANT | \ + NFT_TABLE_F_OWNER) /** * enum nft_table_attributes - nf_tables table netlink attributes @@ -173,6 +176,7 @@ enum nft_table_flags { * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) * @NFTA_TABLE_USERDATA: user data (NLA_BINARY) + * @NFTA_TABLE_OWNER: owner of this table through netlink portID (NLA_U32) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, @@ -182,6 +186,7 @@ enum nft_table_attributes { NFTA_TABLE_HANDLE, NFTA_TABLE_PAD, NFTA_TABLE_USERDATA, + NFTA_TABLE_OWNER, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dffb4f8ef17f..c1eb5cdb3033 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -508,7 +508,7 @@ static int nft_delflowtable(struct nft_ctx *ctx, static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, - u8 family, u8 genmask) + u8 family, u8 genmask, u32 nlpid) { struct nft_table *table; @@ -519,8 +519,13 @@ static struct nft_table *nft_table_lookup(const struct net *net, lockdep_is_held(&net->nft.commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && - nft_active_genmask(table, genmask)) + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + table->nlpid != nlpid) + return ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -679,6 +684,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) goto nla_put_failure; + if (nft_table_has_owner(table) && + nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid))) + goto nla_put_failure; if (table->udata) { if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) @@ -821,7 +829,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); @@ -902,8 +910,8 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return 0; flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + if (flags & ~NFT_TABLE_F_MASK) + return -EOPNOTSUPP; if (flags == ctx->table->flags) return 0; @@ -1003,7 +1011,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, lockdep_assert_held(&net->nft.commit_mutex); attr = nla[NFTA_TABLE_NAME]; - table = nft_table_lookup(net, attr, family, genmask); + table = nft_table_lookup(net, attr, family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -1021,8 +1030,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (nla[NFTA_TABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + if (flags & ~NFT_TABLE_F_MASK) + return -EOPNOTSUPP; } err = -ENOMEM; @@ -1053,6 +1062,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, table->family = family; table->flags = flags; table->handle = ++table_handle; + if (table->flags & NFT_TABLE_F_OWNER) + table->nlpid = NETLINK_CB(skb).portid; nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); @@ -1160,6 +1171,9 @@ static int nft_flush(struct nft_ctx *ctx, int family) if (!nft_is_active_next(ctx->net, table)) continue; + if (nft_table_has_owner(table) && table->nlpid != ctx->portid) + continue; + if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; @@ -1196,7 +1210,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, table = nft_table_lookup_byhandle(net, attr, genmask); } else { attr = nla[NFTA_TABLE_NAME]; - table = nft_table_lookup(net, attr, family, genmask); + table = nft_table_lookup(net, attr, family, genmask, + NETLINK_CB(skb).portid); } if (IS_ERR(table)) { @@ -1579,7 +1594,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -2299,7 +2314,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, lockdep_assert_held(&net->nft.commit_mutex); - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -2395,7 +2411,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, u32 use; int err; - table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); @@ -3041,7 +3058,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3179,7 +3196,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, lockdep_assert_held(&net->nft.commit_mutex); - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3403,7 +3421,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); @@ -3584,7 +3603,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct nlmsghdr *nlh, const struct nlattr * const nla[], struct netlink_ext_ack *extack, - u8 genmask) + u8 genmask, u32 nlpid) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int family = nfmsg->nfgen_family; @@ -3592,7 +3611,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, if (nla[NFTA_SET_TABLE] != NULL) { table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); + genmask, nlpid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); @@ -4007,7 +4026,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, /* Verify existence before starting dump */ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, 0); if (err < 0) return err; @@ -4236,7 +4255,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) desc.expr = true; - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); @@ -4413,7 +4433,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return -EINVAL; err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -4608,14 +4628,14 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct nlmsghdr *nlh, const struct nlattr * const nla[], struct netlink_ext_ack *extack, - u8 genmask) + u8 genmask, u32 nlpid) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int family = nfmsg->nfgen_family; struct nft_table *table; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask); + genmask, nlpid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); @@ -5032,7 +5052,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, int rem, err = 0; err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5613,7 +5633,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return -EINVAL; err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5821,7 +5841,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, int rem, err = 0; err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, - genmask); + genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -6124,7 +6144,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6394,7 +6415,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6468,7 +6489,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, + NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); @@ -6885,7 +6907,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); @@ -7069,7 +7091,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); @@ -7277,7 +7299,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, - genmask); + genmask, 0); if (IS_ERR(table)) return PTR_ERR(table); @@ -9051,14 +9073,55 @@ static void __nft_release_table(struct net *net, struct nft_table *table) nf_tables_table_destroy(&ctx); } -static void __nft_release_tables(struct net *net) +static void __nft_release_tables(struct net *net, u32 nlpid) { struct nft_table *table, *nt; - list_for_each_entry_safe(table, nt, &net->nft.tables, list) + list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + if (nft_table_has_owner(table) && + nlpid != table->nlpid) + continue; + __nft_release_table(net, table); + } +} + +static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct netlink_notify *n = ptr; + struct nft_table *table, *nt; + struct net *net = n->net; + bool release = false; + + if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) + return NOTIFY_DONE; + + mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(table, &net->nft.tables, list) { + if (nft_table_has_owner(table) && + n->portid == table->nlpid) { + __nft_release_hook(net, table); + release = true; + } + } + if (release) { + synchronize_rcu(); + list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + if (nft_table_has_owner(table) && + n->portid == table->nlpid) + __nft_release_table(net, table); + } + } + mutex_unlock(&net->nft.commit_mutex); + + return NOTIFY_DONE; } +static struct notifier_block nft_nl_notifier = { + .notifier_call = nft_rcv_nl_event, +}; + static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); @@ -9082,7 +9145,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); - __nft_release_tables(net); + __nft_release_tables(net, 0); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list)); @@ -9106,43 +9169,50 @@ static int __init nf_tables_module_init(void) err = nft_chain_filter_init(); if (err < 0) - goto err1; + goto err_chain_filter; err = nf_tables_core_module_init(); if (err < 0) - goto err2; + goto err_core_module; err = register_netdevice_notifier(&nf_tables_flowtable_notifier); if (err < 0) - goto err3; + goto err_netdev_notifier; err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params); if (err < 0) - goto err4; + goto err_rht_objname; err = nft_offload_init(); if (err < 0) - goto err5; + goto err_offload; + + err = netlink_register_notifier(&nft_nl_notifier); + if (err < 0) + goto err_netlink_notifier; /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err6; + goto err_nfnl_subsys; nft_chain_route_init(); return err; -err6: + +err_nfnl_subsys: + netlink_unregister_notifier(&nft_nl_notifier); +err_netlink_notifier: nft_offload_exit(); -err5: +err_offload: rhltable_destroy(&nft_objname_ht); -err4: +err_rht_objname: unregister_netdevice_notifier(&nf_tables_flowtable_notifier); -err3: +err_netdev_notifier: nf_tables_core_module_exit(); -err2: +err_core_module: nft_chain_filter_fini(); -err1: +err_chain_filter: unregister_pernet_subsys(&nf_tables_net_ops); return err; } @@ -9150,6 +9220,7 @@ err1: static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); + netlink_unregister_notifier(&nft_nl_notifier); nft_offload_exit(); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); -- cgit v1.2.3 From d0a0bbe7b0a181c58bd22d6942146cfa3ab9e49a Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 14 Feb 2021 18:43:08 -0500 Subject: atm: idt77252: fix build broken on amd64 idt77252 is broken and wont load on amd64 systems modprobe idt77252 shows the following idt77252_init: skb->cb is too small (48 < 56) Add packed attribute to struct idt77252_skb_prv and struct atm_skb_data so that the total size can be <= sizeof(skb->cb) Also convert runtime size check to buildtime size check in idt77252_init() Signed-off-by: Tong Zhang Signed-off-by: David S. Miller --- drivers/atm/idt77252.c | 11 +---------- drivers/atm/idt77252.h | 2 +- include/linux/atmdev.h | 2 +- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 5f0472c18bcb..0c13cac903de 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -3743,16 +3743,7 @@ static int __init idt77252_init(void) struct sk_buff *skb; printk("%s: at %p\n", __func__, idt77252_init); - - if (sizeof(skb->cb) < sizeof(struct atm_skb_data) + - sizeof(struct idt77252_skb_prv)) { - printk(KERN_ERR "%s: skb->cb is too small (%lu < %lu)\n", - __func__, (unsigned long) sizeof(skb->cb), - (unsigned long) sizeof(struct atm_skb_data) + - sizeof(struct idt77252_skb_prv)); - return -EIO; - } - + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct idt77252_skb_prv) + sizeof(struct atm_skb_data)); return pci_register_driver(&idt77252_driver); } diff --git a/drivers/atm/idt77252.h b/drivers/atm/idt77252.h index 9339197d701c..b059d31364dd 100644 --- a/drivers/atm/idt77252.h +++ b/drivers/atm/idt77252.h @@ -789,7 +789,7 @@ struct idt77252_skb_prv { struct scqe tbd; /* Transmit Buffer Descriptor */ dma_addr_t paddr; /* DMA handle */ u32 pool; /* sb_pool handle */ -}; +} __packed; #define IDT77252_PRV_TBD(skb) \ (((struct idt77252_skb_prv *)(ATM_SKB(skb)+1))->tbd) diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index d7493016cd46..60cd25c0461b 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -207,7 +207,7 @@ struct atm_skb_data { struct atm_vcc *vcc; /* ATM VCC */ unsigned long atm_options; /* ATM layer options */ unsigned int acct_truesize; /* truesize accounted to vcc */ -}; +} __packed; #define VCC_HTABLE_SIZE 32 -- cgit v1.2.3 From 419dfaed7ccc9533b3f4d88eb6f4997b41f8a4fc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 15 Feb 2021 23:09:11 +0200 Subject: net: bridge: fix switchdev_port_attr_set stub when CONFIG_SWITCHDEV=n The switchdev_port_attr_set function prototype was updated only for the case where CONFIG_SWITCHDEV=y|m, leaving a prototype mismatch with the stub definition for the disabled case. This results in a build error, so update that function too. Fixes: dcbdf1350e33 ("net: bridge: propagate extack through switchdev_port_attr_set") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/switchdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 195f62672cc4..9a5426b61ca5 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -296,7 +296,8 @@ static inline void switchdev_deferred_process(void) } static inline int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 0caf3ada24e4623d4b2c938a5b6d2d09e4ccee18 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Feb 2021 16:52:02 -0800 Subject: mptcp: add local addr info in mptcp_info Add mptcpi_local_addr_used and mptcpi_local_addr_max in struct mptcp_info. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 2 ++ net/mptcp/mptcp_diag.c | 2 ++ net/mptcp/pm_netlink.c | 3 ++- net/mptcp/protocol.h | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index c91578aaab32..e1172c1ffdfd 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -103,6 +103,8 @@ struct mptcp_info { __u64 mptcpi_write_seq; __u64 mptcpi_snd_una; __u64 mptcpi_rcv_nxt; + __u8 mptcpi_local_addr_used; + __u8 mptcpi_local_addr_max; }; /* diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 00ed742f48a4..f16d9b5ee978 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -128,11 +128,13 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); val = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_signal_max = val; val = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_add_addr_accepted_max = val; + info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 229fd1af2e29..8e8e35fa4002 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -228,13 +228,14 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); -static unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet; pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->local_addr_max); } +EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); static void check_work_pending(struct mptcp_sock *msk) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d31edbae8da8..1b6ec1773678 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -725,6 +725,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { -- cgit v1.2.3 From 17d3a83afbbff34209d6c3636718fc1abe305ef8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 12 Feb 2021 19:46:31 -0800 Subject: net: phy: broadcom: Remove unused flags We have a number of unused flags defined today and since we are scarce on space and may need to introduce new flags in the future remove and shift every existing flag down into a contiguous assignment. PHY_BCM_FLAGS_MODE_1000BX was only used internally for the BCM54616S PHY, so we allocate a driver private structure instead to store that flag instead of canibalizing one from phydev->dev_flags for that purpose. Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 19 ++++++++++++++++--- include/linux/brcmphy.h | 21 ++++++++------------- 2 files changed, 24 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 4142f69c1530..3ce266ab521b 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -381,10 +381,21 @@ static int bcm5481_config_aneg(struct phy_device *phydev) return ret; } +struct bcm54616s_phy_priv { + bool mode_1000bx_en; +}; + static int bcm54616s_probe(struct phy_device *phydev) { + struct bcm54616s_phy_priv *priv; int val, intf_sel; + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE); if (val < 0) return val; @@ -407,7 +418,7 @@ static int bcm54616s_probe(struct phy_device *phydev) * 1000BASE-X configuration. */ if (!(val & BCM54616S_100FX_MODE)) - phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX; + priv->mode_1000bx_en = true; phydev->port = PORT_FIBRE; } @@ -417,10 +428,11 @@ static int bcm54616s_probe(struct phy_device *phydev) static int bcm54616s_config_aneg(struct phy_device *phydev) { + struct bcm54616s_phy_priv *priv = phydev->priv; int ret; /* Aneg firstly. */ - if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) + if (priv->mode_1000bx_en) ret = genphy_c37_config_aneg(phydev); else ret = genphy_config_aneg(phydev); @@ -433,9 +445,10 @@ static int bcm54616s_config_aneg(struct phy_device *phydev) static int bcm54616s_read_status(struct phy_device *phydev) { + struct bcm54616s_phy_priv *priv = phydev->priv; int err; - if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) + if (priv->mode_1000bx_en) err = genphy_c37_read_status(phydev); else err = genphy_read_status(phydev); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index de9430d55c90..844dcfe789a2 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -61,19 +61,14 @@ #define PHY_BCM_OUI_5 0x03625e00 #define PHY_BCM_OUI_6 0xae025000 -#define PHY_BCM_FLAGS_MODE_COPPER 0x00000001 -#define PHY_BCM_FLAGS_MODE_1000BX 0x00000002 -#define PHY_BCM_FLAGS_INTF_SGMII 0x00000010 -#define PHY_BCM_FLAGS_INTF_XAUI 0x00000020 -#define PHY_BRCM_WIRESPEED_ENABLE 0x00000100 -#define PHY_BRCM_AUTO_PWRDWN_ENABLE 0x00000200 -#define PHY_BRCM_RX_REFCLK_UNUSED 0x00000400 -#define PHY_BRCM_STD_IBND_DISABLE 0x00000800 -#define PHY_BRCM_EXT_IBND_RX_ENABLE 0x00001000 -#define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00002000 -#define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 -#define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 -#define PHY_BRCM_EN_MASTER_MODE 0x00010000 +#define PHY_BRCM_AUTO_PWRDWN_ENABLE 0x00000001 +#define PHY_BRCM_RX_REFCLK_UNUSED 0x00000002 +#define PHY_BRCM_STD_IBND_DISABLE 0x00000004 +#define PHY_BRCM_EXT_IBND_RX_ENABLE 0x00000008 +#define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00000010 +#define PHY_BRCM_CLEAR_RGMII_MODE 0x00000020 +#define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00000040 +#define PHY_BRCM_EN_MASTER_MODE 0x00000080 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) -- cgit v1.2.3 From 5d4358ede8ebe2e4ae03a633082f3ce21ec2df3e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 12 Feb 2021 19:46:32 -0800 Subject: net: phy: broadcom: Allow BCM54210E to configure APD BCM54210E/BCM50212E has been verified to work correctly with the auto-power down configuration done by bcm54xx_adjust_rxrefclk(), add it to the list of PHYs working. While we are at it, provide an appropriate name for the bit we are changing which disables the RXC and TXC during auto-power down when there is no energy on the cable. Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 8 +++++--- include/linux/brcmphy.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 3ce266ab521b..91fbd26c809e 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -193,6 +193,7 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) if (BRCM_PHY_MODEL(phydev) != PHY_ID_BCM57780 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610M && + BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54210E && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54810 && BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54811) return; @@ -227,9 +228,10 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) val |= BCM54XX_SHD_SCR3_DLLAPD_DIS; if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) { - if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 || - BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54811) - val |= BCM54810_SHD_SCR3_TRDDAPD; + if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E || + BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 || + BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E) + val |= BCM54XX_SHD_SCR3_RXCTXC_DIS; else val |= BCM54XX_SHD_SCR3_TRDDAPD; } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 844dcfe789a2..16597d3fa011 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -193,6 +193,7 @@ #define BCM54XX_SHD_SCR3_DEF_CLK125 0x0001 #define BCM54XX_SHD_SCR3_DLLAPD_DIS 0x0002 #define BCM54XX_SHD_SCR3_TRDDAPD 0x0004 +#define BCM54XX_SHD_SCR3_RXCTXC_DIS 0x0100 /* 01010: Auto Power-Down */ #define BCM54XX_SHD_APD 0x0a @@ -253,7 +254,6 @@ #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN (1 << 0) #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) -#define BCM54810_SHD_SCR3_TRDDAPD 0x0100 /* BCM54612E Registers */ #define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) -- cgit v1.2.3 From 93e8990c24bee30696c02e8f6aed043333491a25 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 14 Feb 2021 15:16:23 +0100 Subject: net: phy: rename PHY_IGNORE_INTERRUPT to PHY_MAC_INTERRUPT Some internal PHY's have their events like link change reported by the MAC interrupt. We have PHY_IGNORE_INTERRUPT to deal with this scenario. I'm not too happy with this name. We don't ignore interrupts, typically there is no interrupt exposed at a PHY level. So let's rename it to PHY_MAC_INTERRUPT. This is in line with phy_mac_interrupt(), which is called from the MAC interrupt handler to handle PHY events. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Acked-by: Florian Fainelli Reviewed-by: Russell King Signed-off-by: David S. Miller --- Documentation/networking/phy.rst | 2 +- drivers/net/ethernet/broadcom/genet/bcmmii.c | 2 +- drivers/net/ethernet/realtek/r8169_main.c | 2 +- drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c | 4 ++-- drivers/net/mdio/mdio-moxart.c | 4 ++-- drivers/net/phy/icplus.c | 2 +- drivers/net/phy/phy.c | 2 +- drivers/net/phy/phy_device.c | 4 ++-- include/linux/phy.h | 10 +++++----- 9 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 399f17976a6c..70136cc9e25e 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -216,7 +216,7 @@ put into an unsupported state. Lastly, once the controller is ready to handle network traffic, you call phy_start(phydev). This tells the PAL that you are ready, and configures the PHY to connect to the network. If the MAC interrupt of your network driver -also handles PHY status changes, just set phydev->irq to PHY_IGNORE_INTERRUPT +also handles PHY status changes, just set phydev->irq to PHY_MAC_INTERRUPT before you call phy_start and use phy_mac_interrupt() from the network driver. If you don't want to use interrupts, set phydev->irq to PHY_POLL. phy_start() enables the PHY interrupts (if applicable) and starts the diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 17f997ef950f..5335244e4577 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -359,7 +359,7 @@ int bcmgenet_mii_probe(struct net_device *dev) * those versions of GENET. */ if (priv->internal_phy && !GENET_IS_V5(priv)) - dev->phydev->irq = PHY_IGNORE_INTERRUPT; + dev->phydev->irq = PHY_MAC_INTERRUPT; return 0; } diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 44a438bad153..cbc30df4e08a 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5056,7 +5056,7 @@ static int r8169_mdio_register(struct rtl8169_private *tp) new_bus->name = "r8169"; new_bus->priv = tp; new_bus->parent = &pdev->dev; - new_bus->irq[0] = PHY_IGNORE_INTERRUPT; + new_bus->irq[0] = PHY_MAC_INTERRUPT; snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x", pci_dev_id(pdev)); new_bus->read = r8169_mdio_read_reg; diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c index b1e7f7ab281c..fceb6d637235 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c @@ -203,8 +203,8 @@ int sxgbe_mdio_register(struct net_device *ndev) case PHY_POLL: irq_str = "POLL"; break; - case PHY_IGNORE_INTERRUPT: - irq_str = "IGNORE"; + case PHY_MAC_INTERRUPT: + irq_str = "MAC"; break; default: sprintf(irq_num, "%d", phy->irq); diff --git a/drivers/net/mdio/mdio-moxart.c b/drivers/net/mdio/mdio-moxart.c index b72c6d185175..f0cff584e176 100644 --- a/drivers/net/mdio/mdio-moxart.c +++ b/drivers/net/mdio/mdio-moxart.c @@ -125,7 +125,7 @@ static int moxart_mdio_probe(struct platform_device *pdev) snprintf(bus->id, MII_BUS_ID_SIZE, "%s-%d-mii", pdev->name, pdev->id); bus->parent = &pdev->dev; - /* Setting PHY_IGNORE_INTERRUPT here even if it has no effect, + /* Setting PHY_MAC_INTERRUPT here even if it has no effect, * of_mdiobus_register() sets these PHY_POLL. * Ideally, the interrupt from MAC controller could be used to * detect link state changes, not polling, i.e. if there was @@ -133,7 +133,7 @@ static int moxart_mdio_probe(struct platform_device *pdev) * interrupt handled in ethernet drivercode. */ for (i = 0; i < PHY_MAX_ADDR; i++) - bus->irq[i] = PHY_IGNORE_INTERRUPT; + bus->irq[i] = PHY_MAC_INTERRUPT; data = bus->priv; data->base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index 4e15d4d02488..3e431737c1ba 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -188,7 +188,7 @@ static int ip175c_read_status(struct phy_device *phydev) genphy_read_status(phydev); else /* Don't need to read status for switch ports */ - phydev->irq = PHY_IGNORE_INTERRUPT; + phydev->irq = PHY_MAC_INTERRUPT; return 0; } diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index fdb914b5b857..1be07e45d314 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1145,7 +1145,7 @@ void phy_state_machine(struct work_struct *work) } /* Only re-schedule a PHY state machine change if we are polling the - * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving + * PHY, if PHY_MAC_INTERRUPT is set, then we will be moving * between states from phy_mac_interrupt(). * * In state PHY_HALTED the PHY gets suspended, so rescheduling the diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 30a20a29ae05..05261698bf74 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1167,8 +1167,8 @@ char *phy_attached_info_irq(struct phy_device *phydev) case PHY_POLL: irq_str = "POLL"; break; - case PHY_IGNORE_INTERRUPT: - irq_str = "IGNORE"; + case PHY_MAC_INTERRUPT: + irq_str = "MAC"; break; default: snprintf(irq_num, sizeof(irq_num), "%d", phydev->irq); diff --git a/include/linux/phy.h b/include/linux/phy.h index c130788306c8..5d7c4084ade9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -71,11 +71,11 @@ extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, - * or not desired for this PHY. Set to PHY_IGNORE_INTERRUPT if - * the attached driver handles the interrupt + * or not desired for this PHY. Set to PHY_MAC_INTERRUPT if + * the attached MAC driver handles the interrupt */ #define PHY_POLL -1 -#define PHY_IGNORE_INTERRUPT -2 +#define PHY_MAC_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 @@ -1202,11 +1202,11 @@ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad, * @phydev: the phy_device struct * * NOTE: must be kept in sync with addition/removal of PHY_POLL and - * PHY_IGNORE_INTERRUPT + * PHY_MAC_INTERRUPT */ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) { - return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT; + return phydev->irq != PHY_POLL && phydev->irq != PHY_MAC_INTERRUPT; } /** -- cgit v1.2.3 From a6a217dddcd544f6b75f0e2a60b6e84c1d494b7e Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 9 Feb 2021 15:11:06 +0200 Subject: net/mlx5: Add new timestamp mode bits These fields declare which timestamp mode is supported by the device per RQ/SQ/QP. In addition add the ts_format field to the select the mode for RQ/SQ/QP. Link: https://lore.kernel.org/r/20210209131107.698833-2-leon@kernel.org Signed-off-by: Aharon Landau Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 54 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cf692fc17f41..436d6f421dfd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -932,11 +932,18 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 reserved_at_200[0x600]; }; +enum { + MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING = 0x0, + MLX5_QP_TIMESTAMP_FORMAT_CAP_REAL_TIME = 0x1, + MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2, +}; + struct mlx5_ifc_roce_cap_bits { u8 roce_apm[0x1]; u8 reserved_at_1[0x3]; u8 sw_r_roce_src_udp_port[0x1]; - u8 reserved_at_5[0x1b]; + u8 reserved_at_5[0x19]; + u8 qp_ts_format[0x2]; u8 reserved_at_20[0x60]; @@ -1253,6 +1260,18 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, }; +enum { + MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING = 0x0, + MLX5_SQ_TIMESTAMP_FORMAT_CAP_REAL_TIME = 0x1, + MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2, +}; + +enum { + MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING = 0x0, + MLX5_RQ_TIMESTAMP_FORMAT_CAP_REAL_TIME = 0x1, + MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x1f]; u8 vhca_resource_manager[0x1]; @@ -1564,7 +1583,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x4]; + u8 sq_ts_format[0x2]; + u8 rq_ts_format[0x2]; u8 steering_format_version[0x4]; u8 create_qp_start_hint[0x18]; @@ -2868,6 +2888,12 @@ enum { MLX5_QPC_CS_RES_UP_TO_64B = 0x2, }; +enum { + MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT = 0x1, + MLX5_QPC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + struct mlx5_ifc_qpc_bits { u8 state[0x4]; u8 lag_tx_port_affinity[0x4]; @@ -2896,7 +2922,9 @@ struct mlx5_ifc_qpc_bits { u8 log_rq_stride[0x3]; u8 no_sq[0x1]; u8 log_sq_size[0x4]; - u8 reserved_at_55[0x6]; + u8 reserved_at_55[0x3]; + u8 ts_format[0x2]; + u8 reserved_at_5a[0x1]; u8 rlky[0x1]; u8 ulp_stateless_offload_mode[0x4]; @@ -3312,6 +3340,12 @@ enum { MLX5_SQC_STATE_ERR = 0x3, }; +enum { + MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT = 0x1, + MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + struct mlx5_ifc_sqc_bits { u8 rlky[0x1]; u8 cd_master[0x1]; @@ -3323,7 +3357,9 @@ struct mlx5_ifc_sqc_bits { u8 reg_umr[0x1]; u8 allow_swp[0x1]; u8 hairpin[0x1]; - u8 reserved_at_f[0x11]; + u8 reserved_at_f[0xb]; + u8 ts_format[0x2]; + u8 reserved_at_1c[0x4]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; @@ -3414,6 +3450,12 @@ enum { MLX5_RQC_STATE_ERR = 0x3, }; +enum { + MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0, + MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT = 0x1, + MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME = 0x2, +}; + struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; u8 delay_drop_en[0x1]; @@ -3424,7 +3466,9 @@ struct mlx5_ifc_rqc_bits { u8 reserved_at_c[0x1]; u8 flush_in_error_en[0x1]; u8 hairpin[0x1]; - u8 reserved_at_f[0x11]; + u8 reserved_at_f[0xb]; + u8 ts_format[0x2]; + u8 reserved_at_1c[0x4]; u8 reserved_at_20[0x8]; u8 user_index[0x18]; -- cgit v1.2.3 From ae02d41551d6f2a035d3e63ce4415e1b2ba3a7e6 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Fri, 12 Feb 2021 14:30:38 -0800 Subject: net/mlx5: Add register layout to support real-time time-stamp Add needed structure layouts and defines for MTUTC (Management UTC) register. MTUTC will be used for cyc2time HW translation. In addition, add cyc2time modify capability bit and init segment HCA real time address. Finally, add capability bits indicating which time-stamping format is supported per SQ and RQ. Add ts_format in the queue's context layout to allow configuration. Signed-off-by: Eran Ben Elisha Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 5 ++++- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 30 ++++++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f1de49d64a98..fd694add6ff0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -578,7 +578,10 @@ struct mlx5_init_seg { __be32 internal_timer_l; __be32 rsvd3[2]; __be32 health_counter; - __be32 rsvd4[1019]; + __be32 rsvd4[11]; + __be32 real_time_h; + __be32 real_time_l; + __be32 rsvd5[1006]; __be64 ieee1588_clk; __be32 ieee1588_clk_type; __be32 clr_intx; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f93bfe7473aa..2eb7755143d7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -143,6 +143,7 @@ enum { MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, + MLX5_REG_MTUTC = 0x9055, MLX5_REG_MPEGC = 0x9056, MLX5_REG_MCQS = 0x9060, MLX5_REG_MCQI = 0x9061, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 436d6f421dfd..7059fcdf54a3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9150,6 +9150,28 @@ struct mlx5_ifc_mpegc_reg_bits { u8 reserved_at_60[0x100]; }; +enum { + MLX5_MTUTC_OPERATION_SET_TIME_IMMEDIATE = 0x1, + MLX5_MTUTC_OPERATION_ADJUST_TIME = 0x2, + MLX5_MTUTC_OPERATION_ADJUST_FREQ_UTC = 0x3, +}; + +struct mlx5_ifc_mtutc_reg_bits { + u8 reserved_at_0[0x1c]; + u8 operation[0x4]; + + u8 freq_adjustment[0x20]; + + u8 reserved_at_40[0x40]; + + u8 utc_sec[0x20]; + + u8 reserved_at_a0[0x2]; + u8 utc_nsec[0x1e]; + + u8 time_adjustment[0x20]; +}; + struct mlx5_ifc_pcam_enhanced_features_bits { u8 reserved_at_0[0x68]; u8 fec_50G_per_lane_in_pplm[0x1]; @@ -9208,7 +9230,9 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x6e]; + u8 reserved_at_0[0x6b]; + u8 ptpcyc2realtime_modify[0x1]; + u8 reserved_at_6c[0x2]; u8 pci_status_and_power[0x1]; u8 reserved_at_6f[0x5]; u8 mark_tx_action_cnp[0x1]; @@ -9231,7 +9255,8 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_95_to_87[0x9]; u8 mpegc[0x1]; - u8 regs_85_to_68[0x12]; + u8 mtutc[0x1]; + u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; u8 regs_63_to_32[0x20]; @@ -9964,6 +9989,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mcda_reg_bits mcda_reg; struct mlx5_ifc_mirc_reg_bits mirc_reg; struct mlx5_ifc_mfrl_reg_bits mfrl_reg; + struct mlx5_ifc_mtutc_reg_bits mtutc_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From d6f3dc8f509ce6288e2537eb4b0614ef444fd84a Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Fri, 12 Feb 2021 14:30:40 -0800 Subject: net/mlx5: Move all internal timer metadata into a dedicated struct Internal timer mode (SW clock) requires some PTP clock related metadata structs. Real time mode (HW clock) will not need these metadata structs. This separation emphasize the different interfaces for HW clock and SW clock. Signed-off-by: Eran Ben Elisha Signed-off-by: Aya Levin Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/lib/clock.c | 107 ++++++++++++--------- .../net/ethernet/mellanox/mlx5/core/lib/clock.h | 3 +- include/linux/mlx5/driver.h | 12 ++- 3 files changed, 71 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index aaf7d837a967..dcacaa451b53 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -89,7 +89,8 @@ static u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev, static u64 read_internal_timer(const struct cyclecounter *cc) { - struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles); + struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles); + struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer); struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); @@ -100,6 +101,7 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) { struct mlx5_ib_clock_info *clock_info = mdev->clock_info; struct mlx5_clock *clock = &mdev->clock; + struct mlx5_timer *timer; u32 sign; if (!clock_info) @@ -109,10 +111,11 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) smp_store_mb(clock_info->sign, sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); - clock_info->cycles = clock->tc.cycle_last; - clock_info->mult = clock->cycles.mult; - clock_info->nsec = clock->tc.nsec; - clock_info->frac = clock->tc.frac; + timer = &clock->timer; + clock_info->cycles = timer->tc.cycle_last; + clock_info->mult = timer->cycles.mult; + clock_info->nsec = timer->tc.nsec; + clock_info->frac = timer->tc.frac; smp_store_release(&clock_info->sign, sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); @@ -151,28 +154,32 @@ static void mlx5_timestamp_overflow(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct mlx5_core_dev *mdev; + struct mlx5_timer *timer; struct mlx5_clock *clock; unsigned long flags; - clock = container_of(dwork, struct mlx5_clock, overflow_work); + timer = container_of(dwork, struct mlx5_timer, overflow_work); + clock = container_of(timer, struct mlx5_clock, timer); mdev = container_of(clock, struct mlx5_core_dev, clock); + write_seqlock_irqsave(&clock->lock, flags); - timecounter_read(&clock->tc); + timecounter_read(&timer->tc); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); - schedule_delayed_work(&clock->overflow_work, clock->overflow_period); + schedule_delayed_work(&timer->overflow_work, timer->overflow_period); } static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; u64 ns = timespec64_to_ns(ts); struct mlx5_core_dev *mdev; unsigned long flags; mdev = container_of(clock, struct mlx5_core_dev, clock); write_seqlock_irqsave(&clock->lock, flags); - timecounter_init(&clock->tc, &clock->cycles, ns); + timecounter_init(&timer->tc, &timer->cycles, ns); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); @@ -183,6 +190,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; u64 cycles, ns; @@ -190,7 +198,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, mdev = container_of(clock, struct mlx5_core_dev, clock); write_seqlock_irqsave(&clock->lock, flags); cycles = mlx5_read_internal_timer(mdev, sts); - ns = timecounter_cyc2time(&clock->tc, cycles); + ns = timecounter_cyc2time(&timer->tc, cycles); write_sequnlock_irqrestore(&clock->lock, flags); *ts = ns_to_timespec64(ns); @@ -201,12 +209,13 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; mdev = container_of(clock, struct mlx5_core_dev, clock); write_seqlock_irqsave(&clock->lock, flags); - timecounter_adjtime(&clock->tc, delta); + timecounter_adjtime(&timer->tc, delta); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); @@ -216,27 +225,27 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; int neg_adj = 0; u32 diff; u64 adj; - if (delta < 0) { neg_adj = 1; delta = -delta; } - adj = clock->nominal_c_mult; + adj = timer->nominal_c_mult; adj *= delta; diff = div_u64(adj, 1000000000ULL); mdev = container_of(clock, struct mlx5_core_dev, clock); write_seqlock_irqsave(&clock->lock, flags); - timecounter_read(&clock->tc); - clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff : - clock->nominal_c_mult + diff; + timecounter_read(&timer->tc); + timer->cycles.mult = neg_adj ? timer->nominal_c_mult - diff : + timer->nominal_c_mult + diff; mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); @@ -313,6 +322,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); + struct mlx5_timer *timer = &clock->timer; u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u64 nsec_now, nsec_delta, time_stamp = 0; u64 cycles_now, cycles_delta; @@ -355,10 +365,10 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, ns = timespec64_to_ns(&ts); cycles_now = mlx5_read_internal_timer(mdev, NULL); write_seqlock_irqsave(&clock->lock, flags); - nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); + nsec_now = timecounter_cyc2time(&timer->tc, cycles_now); nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, - clock->cycles.mult); + cycles_delta = div64_u64(nsec_delta << timer->cycles.shift, + timer->cycles.mult); write_sequnlock_irqrestore(&clock->lock, flags); time_stamp = cycles_now + cycles_delta; field_select = MLX5_MTPPS_FS_PIN_MODE | @@ -541,6 +551,7 @@ static int mlx5_pps_event(struct notifier_block *nb, unsigned long type, void *data) { struct mlx5_clock *clock = mlx5_nb_cof(nb, struct mlx5_clock, pps_nb); + struct mlx5_timer *timer = &clock->timer; struct ptp_clock_event ptp_event; u64 cycles_now, cycles_delta; u64 nsec_now, nsec_delta, ns; @@ -575,10 +586,10 @@ static int mlx5_pps_event(struct notifier_block *nb, ts.tv_nsec = 0; ns = timespec64_to_ns(&ts); write_seqlock_irqsave(&clock->lock, flags); - nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); + nsec_now = timecounter_cyc2time(&timer->tc, cycles_now); nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, - clock->cycles.mult); + cycles_delta = div64_u64(nsec_delta << timer->cycles.shift, + timer->cycles.mult); clock->pps_info.start[pin] = cycles_now + cycles_delta; write_sequnlock_irqrestore(&clock->lock, flags); schedule_work(&clock->pps_info.out_work); @@ -594,17 +605,18 @@ static int mlx5_pps_event(struct notifier_block *nb, static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) { struct mlx5_clock *clock = &mdev->clock; + struct mlx5_timer *timer = &clock->timer; u32 dev_freq; dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz); - clock->cycles.read = read_internal_timer; - clock->cycles.shift = MLX5_CYCLES_SHIFT; - clock->cycles.mult = clocksource_khz2mult(dev_freq, - clock->cycles.shift); - clock->nominal_c_mult = clock->cycles.mult; - clock->cycles.mask = CLOCKSOURCE_MASK(41); - - timecounter_init(&clock->tc, &clock->cycles, + timer->cycles.read = read_internal_timer; + timer->cycles.shift = MLX5_CYCLES_SHIFT; + timer->cycles.mult = clocksource_khz2mult(dev_freq, + timer->cycles.shift); + timer->nominal_c_mult = timer->cycles.mult; + timer->cycles.mask = CLOCKSOURCE_MASK(41); + + timecounter_init(&timer->tc, &timer->cycles, ktime_to_ns(ktime_get_real())); } @@ -612,6 +624,7 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock) { struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_timer *timer = &clock->timer; u64 overflow_cycles; u64 frac = 0; u64 ns; @@ -623,29 +636,30 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock) * multiplied by clock multiplier where the result doesn't exceed * 64bits. */ - overflow_cycles = div64_u64(~0ULL >> 1, clock->cycles.mult); - overflow_cycles = min(overflow_cycles, div_u64(clock->cycles.mask, 3)); + overflow_cycles = div64_u64(~0ULL >> 1, timer->cycles.mult); + overflow_cycles = min(overflow_cycles, div_u64(timer->cycles.mask, 3)); - ns = cyclecounter_cyc2ns(&clock->cycles, overflow_cycles, + ns = cyclecounter_cyc2ns(&timer->cycles, overflow_cycles, frac, &frac); do_div(ns, NSEC_PER_SEC / HZ); - clock->overflow_period = ns; + timer->overflow_period = ns; - INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); - if (clock->overflow_period) - schedule_delayed_work(&clock->overflow_work, 0); + INIT_DELAYED_WORK(&timer->overflow_work, mlx5_timestamp_overflow); + if (timer->overflow_period) + schedule_delayed_work(&timer->overflow_work, 0); else mlx5_core_warn(mdev, "invalid overflow period, overflow_work is not scheduled\n"); if (clock_info) - clock_info->overflow_period = clock->overflow_period; + clock_info->overflow_period = timer->overflow_period; } static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) { struct mlx5_clock *clock = &mdev->clock; struct mlx5_ib_clock_info *info; + struct mlx5_timer *timer; mdev->clock_info = (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL); if (!mdev->clock_info) { @@ -654,13 +668,14 @@ static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) } info = mdev->clock_info; - - info->nsec = clock->tc.nsec; - info->cycles = clock->tc.cycle_last; - info->mask = clock->cycles.mask; - info->mult = clock->nominal_c_mult; - info->shift = clock->cycles.shift; - info->frac = clock->tc.frac; + timer = &clock->timer; + + info->nsec = timer->tc.nsec; + info->cycles = timer->tc.cycle_last; + info->mask = timer->cycles.mask; + info->mult = timer->nominal_c_mult; + info->shift = timer->cycles.shift; + info->frac = timer->tc.frac; } void mlx5_init_clock(struct mlx5_core_dev *mdev) @@ -714,7 +729,7 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) } cancel_work_sync(&clock->pps_info.out_work); - cancel_delayed_work_sync(&clock->overflow_work); + cancel_delayed_work_sync(&clock->timer.overflow_work); if (mdev->clock_info) { free_page((unsigned long)mdev->clock_info); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index 31600924bdc3..6e8804ebc773 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -45,12 +45,13 @@ static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, u64 timestamp) { + struct mlx5_timer *timer = &clock->timer; unsigned int seq; u64 nsec; do { seq = read_seqbegin(&clock->lock); - nsec = timecounter_cyc2time(&clock->tc, timestamp); + nsec = timecounter_cyc2time(&timer->tc, timestamp); } while (read_seqretry(&clock->lock, seq)); return ns_to_ktime(nsec); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2eb7755143d7..2f4d6182df1b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -662,18 +662,22 @@ struct mlx5_pps { u8 enabled; }; -struct mlx5_clock { - struct mlx5_nb pps_nb; - seqlock_t lock; +struct mlx5_timer { struct cyclecounter cycles; struct timecounter tc; - struct hwtstamp_config hwtstamp_config; u32 nominal_c_mult; unsigned long overflow_period; struct delayed_work overflow_work; +}; + +struct mlx5_clock { + struct mlx5_nb pps_nb; + seqlock_t lock; + struct hwtstamp_config hwtstamp_config; struct ptp_clock *ptp; struct ptp_clock_info ptp_info; struct mlx5_pps pps_info; + struct mlx5_timer timer; }; struct mlx5_dm; -- cgit v1.2.3 From 32aeba1f7a98b0c69d4a5704a7d9cea42ba856ba Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 16 Feb 2021 11:08:37 -0800 Subject: tg3: Remove unused PHY_BRCM flags The tg3 driver tried to communicate towards the PHY driver whether it wanted RGMII in-band signaling enabled or disabled however there is nothing that looks at those flags in drivers/net/phy/broadcom.c so this does do not anything. Suggested-by: Vladimir Oltean Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 6 ------ include/linux/brcmphy.h | 9 +++------ 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 8936c2bc6286..d2381929931b 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -1580,12 +1580,6 @@ static int tg3_mdio_init(struct tg3 *tp) PHY_BRCM_RX_REFCLK_UNUSED | PHY_BRCM_DIS_TXCRXC_NOENRGY | PHY_BRCM_AUTO_PWRDWN_ENABLE; - if (tg3_flag(tp, RGMII_INBAND_DISABLE)) - phydev->dev_flags |= PHY_BRCM_STD_IBND_DISABLE; - if (tg3_flag(tp, RGMII_EXT_IBND_RX_EN)) - phydev->dev_flags |= PHY_BRCM_EXT_IBND_RX_ENABLE; - if (tg3_flag(tp, RGMII_EXT_IBND_TX_EN)) - phydev->dev_flags |= PHY_BRCM_EXT_IBND_TX_ENABLE; fallthrough; case PHY_ID_RTL8211C: phydev->interface = PHY_INTERFACE_MODE_RGMII; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 16597d3fa011..8fe1d55371ac 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -63,12 +63,9 @@ #define PHY_BRCM_AUTO_PWRDWN_ENABLE 0x00000001 #define PHY_BRCM_RX_REFCLK_UNUSED 0x00000002 -#define PHY_BRCM_STD_IBND_DISABLE 0x00000004 -#define PHY_BRCM_EXT_IBND_RX_ENABLE 0x00000008 -#define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00000010 -#define PHY_BRCM_CLEAR_RGMII_MODE 0x00000020 -#define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00000040 -#define PHY_BRCM_EN_MASTER_MODE 0x00000080 +#define PHY_BRCM_CLEAR_RGMII_MODE 0x00000004 +#define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00000008 +#define PHY_BRCM_EN_MASTER_MODE 0x00000010 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) -- cgit v1.2.3 From 7331d1d4622ba7e668ec19cfba2ed7feb4a3084e Mon Sep 17 00:00:00 2001 From: Pavana Sharma Date: Tue, 16 Feb 2021 20:20:53 +0100 Subject: net: phy: Add 5GBASER interface mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 5GBASE-R phy interface mode Signed-off-by: Pavana Sharma Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Marek BehĂșn Signed-off-by: David S. Miller --- Documentation/networking/phy.rst | 6 ++++++ include/linux/phy.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 70136cc9e25e..06adfc2afcf0 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -267,6 +267,12 @@ Some of the interface modes are described below: duplex, pause or other settings. This is dependent on the MAC and/or PHY behaviour. +``PHY_INTERFACE_MODE_5GBASER`` + This is the IEEE 802.3 Clause 129 defined 5GBASE-R protocol. It is + identical to the 10GBASE-R protocol defined in Clause 49, with the + exception that it operates at half the frequency. Please refer to the + IEEE standard for the definition. + ``PHY_INTERFACE_MODE_10GBASER`` This is the IEEE 802.3 Clause 49 defined 10GBASE-R protocol used with various different mediums. Please refer to the IEEE standard for a diff --git a/include/linux/phy.h b/include/linux/phy.h index 5d7c4084ade9..0d537f59b77f 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -107,6 +107,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX + * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR @@ -139,6 +140,7 @@ typedef enum { PHY_INTERFACE_MODE_100BASEX, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, + PHY_INTERFACE_MODE_5GBASER, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ @@ -209,6 +211,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "1000base-x"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; + case PHY_INTERFACE_MODE_5GBASER: + return "5gbase-r"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: -- cgit v1.2.3 From 405be6b46b707590f8014d468f4b42f25c6064cb Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Feb 2021 22:41:58 +0100 Subject: switchdev: mrp: Remove CONFIG_BRIDGE_MRP Remove #IS_ENABLED(CONFIG_BRIDGE_MRP) from switchdev.h. This will simplify the code implements MRP callbacks and will be similar with the vlan filtering. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 9a5426b61ca5..465362d9d063 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -27,9 +27,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, -#if IS_ENABLED(CONFIG_BRIDGE_MRP) SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, -#endif }; struct switchdev_brport_flags { @@ -51,9 +49,7 @@ struct switchdev_attr { bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ bool mc_disabled; /* MC_DISABLED */ -#if IS_ENABLED(CONFIG_BRIDGE_MRP) u8 mrp_port_role; /* MRP_PORT_ROLE */ -#endif } u; }; @@ -62,7 +58,6 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, SWITCHDEV_OBJ_ID_HOST_MDB, -#if IS_ENABLED(CONFIG_BRIDGE_MRP) SWITCHDEV_OBJ_ID_MRP, SWITCHDEV_OBJ_ID_RING_TEST_MRP, SWITCHDEV_OBJ_ID_RING_ROLE_MRP, @@ -70,8 +65,6 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_IN_TEST_MRP, SWITCHDEV_OBJ_ID_IN_ROLE_MRP, SWITCHDEV_OBJ_ID_IN_STATE_MRP, - -#endif }; struct switchdev_obj { @@ -103,7 +96,6 @@ struct switchdev_obj_port_mdb { container_of((OBJ), struct switchdev_obj_port_mdb, obj) -#if IS_ENABLED(CONFIG_BRIDGE_MRP) /* SWITCHDEV_OBJ_ID_MRP */ struct switchdev_obj_mrp { struct switchdev_obj obj; @@ -183,8 +175,6 @@ struct switchdev_obj_in_state_mrp { #define SWITCHDEV_OBJ_IN_STATE_MRP(OBJ) \ container_of((OBJ), struct switchdev_obj_in_state_mrp, obj) -#endif - typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); enum switchdev_notifier_type { -- cgit v1.2.3 From c513efa20c5254ef74c4157a03d515abdc46c503 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Feb 2021 22:41:59 +0100 Subject: switchdev: mrp: Extend ring_role_mrp and in_role_mrp Add the member sw_backup to the structures switchdev_obj_ring_role_mrp and switchdev_obj_in_role_mrp. In this way the SW can call the driver in 2 ways, once when sw_backup is set to false, meaning that the driver should implement this completely in HW. And if that is not supported the SW will call again but with sw_backup set to true, meaning that the HW should help or allow the SW to run the protocol. For example when role is MRM, if the HW can't detect when it stops receiving MRP Test frames but it can trap these frames to CPU, then it needs to return -EOPNOTSUPP when sw_backup is false and return 0 when sw_backup is true. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 465362d9d063..b7fc7d0f54e2 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -127,6 +127,7 @@ struct switchdev_obj_ring_role_mrp { struct switchdev_obj obj; u8 ring_role; u32 ring_id; + u8 sw_backup; }; #define SWITCHDEV_OBJ_RING_ROLE_MRP(OBJ) \ @@ -161,6 +162,7 @@ struct switchdev_obj_in_role_mrp { u32 ring_id; u16 in_id; u8 in_role; + u8 sw_backup; }; #define SWITCHDEV_OBJ_IN_ROLE_MRP(OBJ) \ -- cgit v1.2.3 From d8ea7ff3995ead5193313c72c0d97c9c16c83be9 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Feb 2021 22:42:03 +0100 Subject: net: mscc: ocelot: Add support for MRP Add basic support for MRP. The HW will just trap all MRP frames on the ring ports to CPU and allow the SW to process them. In this way it is possible to for this node to behave both as MRM and MRC. Current limitations are: - it doesn't support Interconnect roles. - it supports only a single ring. - the HW should be able to do forwarding of MRP Test frames so the SW will not need to do this. So it would be able to have the role MRC without SW support. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/Makefile | 1 + drivers/net/ethernet/mscc/ocelot.c | 10 +- drivers/net/ethernet/mscc/ocelot_mrp.c | 175 +++++++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_net.c | 60 +++++++++++ include/linux/dsa/ocelot.h | 5 + include/soc/mscc/ocelot.h | 45 +++++++++ 6 files changed, 295 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mscc/ocelot_mrp.c (limited to 'include') diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile index 346bba2730ad..722c27694b21 100644 --- a/drivers/net/ethernet/mscc/Makefile +++ b/drivers/net/ethernet/mscc/Makefile @@ -8,6 +8,7 @@ mscc_ocelot_switch_lib-y := \ ocelot_flower.o \ ocelot_ptp.o \ ocelot_devlink.o +mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o mscc_ocelot-y := \ ocelot_vsc7514.o \ diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 5d13087c85d6..46e5c9136bac 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -687,7 +687,7 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh) int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) { struct skb_shared_hwtstamps *shhwtstamps; - u64 tod_in_ns, full_ts_in_ns; + u64 tod_in_ns, full_ts_in_ns, cpuq; u64 timestamp, src_port, len; u32 xfh[OCELOT_TAG_LEN / 4]; struct net_device *dev; @@ -704,6 +704,7 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) ocelot_xfh_get_src_port(xfh, &src_port); ocelot_xfh_get_len(xfh, &len); ocelot_xfh_get_rew_val(xfh, ×tamp); + ocelot_xfh_get_cpuq(xfh, &cpuq); if (WARN_ON(src_port >= ocelot->num_phys_ports)) return -EINVAL; @@ -770,6 +771,13 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) skb->offload_fwd_mark = 1; skb->protocol = eth_type_trans(skb, dev); + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + if (skb->protocol == cpu_to_be16(ETH_P_MRP) && + cpuq & BIT(OCELOT_MRP_CPUQ)) + skb->offload_fwd_mark = 0; +#endif + *nskb = skb; return 0; diff --git a/drivers/net/ethernet/mscc/ocelot_mrp.c b/drivers/net/ethernet/mscc/ocelot_mrp.c new file mode 100644 index 000000000000..683da320bfd8 --- /dev/null +++ b/drivers/net/ethernet/mscc/ocelot_mrp.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Microsemi Ocelot Switch driver + * + * This contains glue logic between the switchdev driver operations and the + * mscc_ocelot_switch_lib. + * + * Copyright (c) 2017, 2019 Microsemi Corporation + * Copyright 2020-2021 NXP Semiconductors + */ + +#include +#include +#include +#include +#include "ocelot.h" +#include "ocelot_vcap.h" + +static int ocelot_mrp_del_vcap(struct ocelot *ocelot, int port) +{ + struct ocelot_vcap_block *block_vcap_is2; + struct ocelot_vcap_filter *filter; + + block_vcap_is2 = &ocelot->block[VCAP_IS2]; + filter = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, port, + false); + if (!filter) + return 0; + + return ocelot_vcap_filter_del(ocelot, filter); +} + +int ocelot_mrp_add(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; + struct net_device *dev; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (mrp->p_port != dev && mrp->s_port != dev) + return 0; + + if (ocelot->mrp_ring_id != 0 && + ocelot->mrp_s_port && + ocelot->mrp_p_port) + return -EINVAL; + + if (mrp->p_port == dev) + ocelot->mrp_p_port = dev; + + if (mrp->s_port == dev) + ocelot->mrp_s_port = dev; + + ocelot->mrp_ring_id = mrp->ring_id; + + return 0; +} +EXPORT_SYMBOL(ocelot_mrp_add); + +int ocelot_mrp_del(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; + struct net_device *dev; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + return 0; + + if (ocelot->mrp_ring_id == 0 && + !ocelot->mrp_s_port && + !ocelot->mrp_p_port) + return -EINVAL; + + if (ocelot_mrp_del_vcap(ocelot, priv->chip_port)) + return -EINVAL; + + if (ocelot->mrp_p_port == dev) + ocelot->mrp_p_port = NULL; + + if (ocelot->mrp_s_port == dev) + ocelot->mrp_s_port = NULL; + + ocelot->mrp_ring_id = 0; + + return 0; +} +EXPORT_SYMBOL(ocelot_mrp_del); + +int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_vcap_filter *filter; + struct ocelot_port_private *priv; + struct net_device *dev; + int err; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (ocelot->mrp_ring_id != mrp->ring_id) + return -EINVAL; + + if (!mrp->sw_backup) + return -EOPNOTSUPP; + + if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + return 0; + + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); + if (!filter) + return -ENOMEM; + + filter->key_type = OCELOT_VCAP_KEY_ETYPE; + filter->prio = 1; + filter->id.cookie = priv->chip_port; + filter->id.tc_offload = false; + filter->block_id = VCAP_IS2; + filter->type = OCELOT_VCAP_FILTER_OFFLOAD; + filter->ingress_port_mask = BIT(priv->chip_port); + *(__be16 *)filter->key.etype.etype.value = htons(ETH_P_MRP); + *(__be16 *)filter->key.etype.etype.mask = htons(0xffff); + filter->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY; + filter->action.port_mask = 0x0; + filter->action.cpu_copy_ena = true; + filter->action.cpu_qu_num = OCELOT_MRP_CPUQ; + + err = ocelot_vcap_filter_add(ocelot, filter, NULL); + if (err) + kfree(filter); + + return err; +} +EXPORT_SYMBOL(ocelot_mrp_add_ring_role); + +int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port_private *priv; + struct net_device *dev; + + if (!ocelot_port) + return -EOPNOTSUPP; + + priv = container_of(ocelot_port, struct ocelot_port_private, port); + dev = priv->dev; + + if (ocelot->mrp_ring_id != mrp->ring_id) + return -EINVAL; + + if (!mrp->sw_backup) + return -EOPNOTSUPP; + + if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + return 0; + + return ocelot_mrp_del_vcap(ocelot, priv->chip_port); +} +EXPORT_SYMBOL(ocelot_mrp_del_ring_role); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 6518262532f0..12cb6867a2d0 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1010,6 +1010,52 @@ static int ocelot_port_obj_del_mdb(struct net_device *dev, return ocelot_port_mdb_del(ocelot, port, mdb); } +static int ocelot_port_obj_mrp_add(struct net_device *dev, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_add(ocelot, port, mrp); +} + +static int ocelot_port_obj_mrp_del(struct net_device *dev, + const struct switchdev_obj_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_del(ocelot, port, mrp); +} + +static int +ocelot_port_obj_mrp_add_ring_role(struct net_device *dev, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_add_ring_role(ocelot, port, mrp); +} + +static int +ocelot_port_obj_mrp_del_ring_role(struct net_device *dev, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot_port *ocelot_port = &priv->port; + struct ocelot *ocelot = ocelot_port->ocelot; + int port = priv->chip_port; + + return ocelot_mrp_del_ring_role(ocelot, port, mrp); +} + static int ocelot_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) @@ -1024,6 +1070,13 @@ static int ocelot_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); break; + case SWITCHDEV_OBJ_ID_MRP: + ret = ocelot_port_obj_mrp_add(dev, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + ret = ocelot_port_obj_mrp_add_ring_role(dev, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: return -EOPNOTSUPP; } @@ -1044,6 +1097,13 @@ static int ocelot_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: ret = ocelot_port_obj_del_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj)); break; + case SWITCHDEV_OBJ_ID_MRP: + ret = ocelot_port_obj_mrp_del(dev, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + ret = ocelot_port_obj_mrp_del_ring_role(dev, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: return -EOPNOTSUPP; } diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index c6bc45ae5e03..4265f328681a 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -160,6 +160,11 @@ static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port) packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); } +static inline void ocelot_xfh_get_cpuq(void *extraction, u64 *cpuq) +{ + packing(extraction, cpuq, 28, 20, OCELOT_TAG_LEN, UNPACK, 0); +} + static inline void ocelot_xfh_get_qos_class(void *extraction, u64 *qos_class) { packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 1f2d90976564..425ff29d9389 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -112,6 +112,8 @@ #define REG_RESERVED_ADDR 0xffffffff #define REG_RESERVED(reg) REG(reg, REG_RESERVED_ADDR) +#define OCELOT_MRP_CPUQ 7 + enum ocelot_target { ANA = 1, QS, @@ -677,6 +679,12 @@ struct ocelot { /* Protects the PTP clock */ spinlock_t ptp_clock_lock; struct ptp_pin_desc ptp_pins[OCELOT_PTP_PINS_NUM]; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + u16 mrp_ring_id; + struct net_device *mrp_p_port; + struct net_device *mrp_s_port; +#endif }; struct ocelot_policer { @@ -874,4 +882,41 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int ocelot_mrp_add(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp); +int ocelot_mrp_del(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp); +int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp); +int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp); +#else +static inline int ocelot_mrp_add(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + return -EOPNOTSUPP; +} + +static inline int ocelot_mrp_del(struct ocelot *ocelot, int port, + const struct switchdev_obj_mrp *mrp) +{ + return -EOPNOTSUPP; +} + +static inline int +ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + return -EOPNOTSUPP; +} + +static inline int +ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + return -EOPNOTSUPP; +} +#endif + #endif -- cgit v1.2.3 From c595c4330da06fff716337239a8d5e528341a502 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Feb 2021 22:42:04 +0100 Subject: net: dsa: add MRP support Add support for offloading MRP in HW. Currently implement the switchdev calls 'SWITCHDEV_OBJ_ID_MRP', 'SWITCHDEV_OBJ_ID_RING_ROLE_MRP', to allow to create MRP instances and to set the role of these instances. Add DSA_NOTIFIER_MRP_ADD/DEL and DSA_NOTIFIER_MRP_ADD/DEL_RING_ROLE which calls to .port_mrp_add/del and .port_mrp_add/del_ring_role in the DSA driver for the switch. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ++++++ net/dsa/dsa_priv.h | 26 +++++++++++++ net/dsa/port.c | 48 ++++++++++++++++++++++++ net/dsa/slave.c | 22 +++++++++++ net/dsa/switch.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 213 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 68f8159564a3..83a933e563fe 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -792,6 +792,18 @@ struct dsa_switch_ops { struct net_device *hsr); int (*port_hsr_leave)(struct dsa_switch *ds, int port, struct net_device *hsr); + + /* + * MRP integration + */ + int (*port_mrp_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp); + int (*port_mrp_del)(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp); + int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp); + int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index e9d1e76c42ba..2eeaa42f2e08 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -31,6 +31,10 @@ enum { DSA_NOTIFIER_VLAN_DEL, DSA_NOTIFIER_MTU, DSA_NOTIFIER_TAG_PROTO, + DSA_NOTIFIER_MRP_ADD, + DSA_NOTIFIER_MRP_DEL, + DSA_NOTIFIER_MRP_ADD_RING_ROLE, + DSA_NOTIFIER_MRP_DEL_RING_ROLE, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -91,6 +95,20 @@ struct dsa_notifier_tag_proto_info { const struct dsa_device_ops *tag_ops; }; +/* DSA_NOTIFIER_MRP_* */ +struct dsa_notifier_mrp_info { + const struct switchdev_obj_mrp *mrp; + int sw_index; + int port; +}; + +/* DSA_NOTIFIER_MRP_* */ +struct dsa_notifier_mrp_ring_role_info { + const struct switchdev_obj_ring_role_mrp *mrp; + int sw_index; + int port; +}; + struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; @@ -198,6 +216,14 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct netlink_ext_ack *extack); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); +int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); diff --git a/net/dsa/port.c b/net/dsa/port.c index 14a1d0d77657..c9c6d7ab3f47 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -564,6 +564,54 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp) +{ + struct dsa_notifier_mrp_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_ADD, &info); +} + +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp) +{ + struct dsa_notifier_mrp_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_DEL, &info); +} + +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct dsa_notifier_mrp_ring_role_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_ADD_RING_ROLE, &info); +} + +int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp) +{ + struct dsa_notifier_mrp_ring_role_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mrp = mrp, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MRP_DEL_RING_ROLE, &info); +} + void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5ecb43a1b6e0..491e3761b5f4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -404,6 +404,17 @@ static int dsa_slave_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_vlan_add(dev, obj, extack); break; + case SWITCHDEV_OBJ_ID_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_add_ring_role(dp, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: err = -EOPNOTSUPP; break; @@ -461,6 +472,17 @@ static int dsa_slave_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_VLAN: err = dsa_slave_vlan_del(dev, obj); break; + case SWITCHDEV_OBJ_ID_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + if (!dsa_port_offloads_netdev(dp, obj->orig_dev)) + return -EOPNOTSUPP; + err = dsa_port_mrp_del_ring_role(dp, + SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index db2a9b221988..4b5da89dc27a 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -372,6 +372,99 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, return 0; } +static bool dsa_switch_mrp_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mrp_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_mrp_add(struct dsa_switch *ds, + struct dsa_notifier_mrp_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_mrp_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mrp_match(ds, port, info)) { + err = ds->ops->port_mrp_add(ds, port, info->mrp); + if (err) + break; + } + } + + return err; +} + +static int dsa_switch_mrp_del(struct dsa_switch *ds, + struct dsa_notifier_mrp_info *info) +{ + if (!ds->ops->port_mrp_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_mrp_del(ds, info->port, info->mrp); + + return 0; +} + +static bool +dsa_switch_mrp_ring_role_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mrp_ring_role_info *info) +{ + if (ds->index == info->sw_index && port == info->port) + return true; + + if (dsa_is_dsa_port(ds, port)) + return true; + + return false; +} + +static int +dsa_switch_mrp_add_ring_role(struct dsa_switch *ds, + struct dsa_notifier_mrp_ring_role_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_mrp_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mrp_ring_role_match(ds, port, info)) { + err = ds->ops->port_mrp_add_ring_role(ds, port, + info->mrp); + if (err) + break; + } + } + + return err; +} + +static int +dsa_switch_mrp_del_ring_role(struct dsa_switch *ds, + struct dsa_notifier_mrp_ring_role_info *info) +{ + if (!ds->ops->port_mrp_del) + return -EOPNOTSUPP; + + if (ds->index == info->sw_index) + return ds->ops->port_mrp_del_ring_role(ds, info->port, + info->mrp); + + return 0; +} + static int dsa_switch_event(struct notifier_block *nb, unsigned long event, void *info) { @@ -427,6 +520,18 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_TAG_PROTO: err = dsa_switch_change_tag_proto(ds, info); break; + case DSA_NOTIFIER_MRP_ADD: + err = dsa_switch_mrp_add(ds, info); + break; + case DSA_NOTIFIER_MRP_DEL: + err = dsa_switch_mrp_del(ds, info); + break; + case DSA_NOTIFIER_MRP_ADD_RING_ROLE: + err = dsa_switch_mrp_add_ring_role(ds, info); + break; + case DSA_NOTIFIER_MRP_DEL_RING_ROLE: + err = dsa_switch_mrp_del_ring_role(ds, info); + break; default: err = -EOPNOTSUPP; break; -- cgit v1.2.3 From 396d7f23adf9e8c436dd81a69488b5b6a865acf8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 16 Feb 2021 18:22:00 +0200 Subject: net: sched: fix police ext initialization When police action is created by cls API tcf_exts_validate() first conditional that calls tcf_action_init_1() directly, the action idr is not updated according to latest changes in action API that require caller to commit newly created action to idr with tcf_idr_insert_many(). This results such action not being accessible through act API and causes crash reported by syzbot: ================================================================== BUG: KASAN: null-ptr-deref in instrument_atomic_read include/linux/instrumented.h:71 [inline] BUG: KASAN: null-ptr-deref in atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] BUG: KASAN: null-ptr-deref in __tcf_idr_release net/sched/act_api.c:178 [inline] BUG: KASAN: null-ptr-deref in tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598 Read of size 4 at addr 0000000000000010 by task kworker/u4:5/204 CPU: 0 PID: 204 Comm: kworker/u4:5 Not tainted 5.11.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 __kasan_report mm/kasan/report.c:400 [inline] kasan_report.cold+0x5f/0xd5 mm/kasan/report.c:413 check_memory_region_inline mm/kasan/generic.c:179 [inline] check_memory_region+0x13d/0x180 mm/kasan/generic.c:185 instrument_atomic_read include/linux/instrumented.h:71 [inline] atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] __tcf_idr_release net/sched/act_api.c:178 [inline] tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598 tc_action_net_exit include/net/act_api.h:151 [inline] police_exit_net+0x168/0x360 net/sched/act_police.c:390 ops_exit_list+0x10d/0x160 net/core/net_namespace.c:190 cleanup_net+0x4ea/0xb10 net/core/net_namespace.c:604 process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275 worker_thread+0x64c/0x1120 kernel/workqueue.c:2421 kthread+0x3b1/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 ================================================================== Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 204 Comm: kworker/u4:5 Tainted: G B 5.11.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 panic+0x306/0x73d kernel/panic.c:231 end_report+0x58/0x5e mm/kasan/report.c:100 __kasan_report mm/kasan/report.c:403 [inline] kasan_report.cold+0x67/0xd5 mm/kasan/report.c:413 check_memory_region_inline mm/kasan/generic.c:179 [inline] check_memory_region+0x13d/0x180 mm/kasan/generic.c:185 instrument_atomic_read include/linux/instrumented.h:71 [inline] atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] __tcf_idr_release net/sched/act_api.c:178 [inline] tcf_idrinfo_destroy+0x129/0x1d0 net/sched/act_api.c:598 tc_action_net_exit include/net/act_api.h:151 [inline] police_exit_net+0x168/0x360 net/sched/act_police.c:390 ops_exit_list+0x10d/0x160 net/core/net_namespace.c:190 cleanup_net+0x4ea/0xb10 net/core/net_namespace.c:604 process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275 worker_thread+0x64c/0x1120 kernel/workqueue.c:2421 kthread+0x3b1/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 Kernel Offset: disabled Fix the issue by calling tcf_idr_insert_many() after successful action initialization. Fixes: 0fedc63fadf0 ("net_sched: commit action insertions together") Reported-by: syzbot+151e3e714d34ae4ce7e8@syzkaller.appspotmail.com Signed-off-by: Vlad Buslov Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_api.c | 2 +- net/sched/cls_api.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 55dab604861f..57be7c5d273b 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -166,6 +166,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); +void tcf_idr_insert_many(struct tc_action *actions[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2e85b636b27b..ebc8f1413078 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -908,7 +908,7 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; -static void tcf_idr_insert_many(struct tc_action *actions[]) +void tcf_idr_insert_many(struct tc_action *actions[]) { int i; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 37b77bd30974..0b3900dd2354 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3053,6 +3053,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; + tcf_idr_insert_many(exts->actions); } else if (exts->action && tb[exts->action]) { int err; -- cgit v1.2.3 From 3afd0218992a8d1398e9791d6c2edd4c948ae7ee Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 16 Feb 2021 16:54:52 -0600 Subject: net: phy: broadcom: Set proper 1000BaseX/SGMII interface mode for BCM54616S The default configuration for the BCM54616S PHY may not match the desired mode when using 1000BaseX or SGMII interface modes, such as when it is on an SFP module. Add code to explicitly set the correct mode using programming sequences provided by Bel-Fuse: https://www.belfuse.com/resources/datasheets/powersolutions/ds-bps-sfp-1gbt-05-series.pdf https://www.belfuse.com/resources/datasheets/powersolutions/ds-bps-sfp-1gbt-06-series.pdf Signed-off-by: Robert Hancock Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 84 +++++++++++++++++++++++++++++++++++++++------- include/linux/brcmphy.h | 4 +++ 2 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 91fbd26c809e..8e7fc3368380 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -103,6 +103,64 @@ static int bcm54612e_config_init(struct phy_device *phydev) return 0; } +static int bcm54616s_config_init(struct phy_device *phydev) +{ + int rc, val; + + if (phydev->interface != PHY_INTERFACE_MODE_SGMII && + phydev->interface != PHY_INTERFACE_MODE_1000BASEX) + return 0; + + /* Ensure proper interface mode is selected. */ + /* Disable RGMII mode */ + val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC); + if (val < 0) + return val; + val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN; + val |= MII_BCM54XX_AUXCTL_MISC_WREN; + rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC, + val); + if (rc < 0) + return rc; + + /* Select 1000BASE-X register set (primary SerDes) */ + val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE); + if (val < 0) + return val; + val |= BCM54XX_SHD_MODE_1000BX; + rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val); + if (rc < 0) + return rc; + + /* Power down SerDes interface */ + rc = phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN); + if (rc < 0) + return rc; + + /* Select proper interface mode */ + val &= ~BCM54XX_SHD_INTF_SEL_MASK; + val |= phydev->interface == PHY_INTERFACE_MODE_SGMII ? + BCM54XX_SHD_INTF_SEL_SGMII : + BCM54XX_SHD_INTF_SEL_GBIC; + rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val); + if (rc < 0) + return rc; + + /* Power up SerDes interface */ + rc = phy_clear_bits(phydev, MII_BMCR, BMCR_PDOWN); + if (rc < 0) + return rc; + + /* Select copper register set */ + val &= ~BCM54XX_SHD_MODE_1000BX; + rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val); + if (rc < 0) + return rc; + + /* Power up copper interface */ + return phy_clear_bits(phydev, MII_BMCR, BMCR_PDOWN); +} + /* Needs SMDSP clock enabled via bcm54xx_phydsp_config() */ static int bcm50610_a0_workaround(struct phy_device *phydev) { @@ -283,15 +341,17 @@ static int bcm54xx_config_init(struct phy_device *phydev) bcm54xx_adjust_rxrefclk(phydev); - if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E) { + switch (BRCM_PHY_MODEL(phydev)) { + case PHY_ID_BCM54210E: err = bcm54210e_config_init(phydev); - if (err) - return err; - } else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54612E) { + break; + case PHY_ID_BCM54612E: err = bcm54612e_config_init(phydev); - if (err) - return err; - } else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) { + break; + case PHY_ID_BCM54616S: + err = bcm54616s_config_init(phydev); + break; + case PHY_ID_BCM54810: /* For BCM54810, we need to disable BroadR-Reach function */ val = bcm_phy_read_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL); @@ -299,9 +359,10 @@ static int bcm54xx_config_init(struct phy_device *phydev) err = bcm_phy_write_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL, val); - if (err < 0) - return err; + break; } + if (err) + return err; bcm54xx_phydsp_config(phydev); @@ -390,7 +451,7 @@ struct bcm54616s_phy_priv { static int bcm54616s_probe(struct phy_device *phydev) { struct bcm54616s_phy_priv *priv; - int val, intf_sel; + int val; priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -408,8 +469,7 @@ static int bcm54616s_probe(struct phy_device *phydev) * RGMII-1000Base-X is properly supported, but RGMII-100Base-FX * support is still missing as of now. */ - intf_sel = (val & BCM54XX_SHD_INTF_SEL_MASK) >> 1; - if (intf_sel == 1) { + if ((val & BCM54XX_SHD_INTF_SEL_MASK) == BCM54XX_SHD_INTF_SEL_RGMII) { val = bcm_phy_read_shadow(phydev, BCM54616S_SHD_100FX_CTRL); if (val < 0) return val; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 8fe1d55371ac..c2c2147dfeb8 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -129,6 +129,7 @@ #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC 0x07 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN 0x0010 +#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN 0x0080 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN 0x0100 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX 0x0200 #define MII_BCM54XX_AUXCTL_MISC_WREN 0x8000 @@ -216,6 +217,9 @@ /* 11111: Mode Control Register */ #define BCM54XX_SHD_MODE 0x1f #define BCM54XX_SHD_INTF_SEL_MASK GENMASK(2, 1) /* INTERF_SEL[1:0] */ +#define BCM54XX_SHD_INTF_SEL_RGMII 0x02 +#define BCM54XX_SHD_INTF_SEL_SGMII 0x04 +#define BCM54XX_SHD_INTF_SEL_GBIC 0x06 #define BCM54XX_SHD_MODE_1000BX BIT(0) /* Enable 1000-X registers */ /* -- cgit v1.2.3 From b834489bceccc64641684eee5e93275cdf5f465b Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Tue, 16 Feb 2021 16:54:53 -0600 Subject: net: phy: Add is_on_sfp_module flag and phy_on_sfp helper Add a flag and helper function to indicate that a PHY device is part of an SFP module, which is set on attach. This can be used by PHY drivers to handle SFP-specific quirks or behavior. Signed-off-by: Robert Hancock Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 ++ include/linux/phy.h | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 05261698bf74..d6ac3ed38197 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1377,6 +1377,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (phydev->sfp_bus_attached) dev->sfp_bus = phydev->sfp_bus; + else if (dev->sfp_bus) + phydev->is_on_sfp_module = true; } /* Some Ethernet drivers try to connect to a PHY device before diff --git a/include/linux/phy.h b/include/linux/phy.h index 0d537f59b77f..1a12e4436b5b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -492,6 +492,7 @@ struct macsec_ops; * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. + * @is_on_sfp_module: Set true if PHY is located on an SFP module. * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * @irq: IRQ number of the PHY's interrupt (-1 if none) @@ -565,6 +566,7 @@ struct phy_device { unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; + unsigned is_on_sfp_module:1; unsigned autoneg:1; /* The most recently read link state */ @@ -1296,6 +1298,15 @@ static inline bool phy_is_internal(struct phy_device *phydev) return phydev->is_internal; } +/** + * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module + * @phydev: the phy_device struct + */ +static inline bool phy_on_sfp(struct phy_device *phydev) +{ + return phydev->is_on_sfp_module; +} + /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) -- cgit v1.2.3 From 96313e1db8e5629cc2217616dca78f03e6463008 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 17 Feb 2021 12:25:57 -0800 Subject: net: mdio: Remove of_phy_attach() We have no in-tree users, also update the sfp-phylink.rst documentation to indicate that phy_attach_direct() is used instead of of_phy_attach(). Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/sfp-phylink.rst | 2 +- drivers/net/mdio/of_mdio.c | 30 ------------------------------ include/linux/of_mdio.h | 10 ---------- 3 files changed, 1 insertion(+), 41 deletions(-) (limited to 'include') diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst index 5aec7c8857d0..328f8d39eeb3 100644 --- a/Documentation/networking/sfp-phylink.rst +++ b/Documentation/networking/sfp-phylink.rst @@ -163,7 +163,7 @@ this documentation. err = phylink_of_phy_connect(priv->phylink, node, flags); For the most part, ``flags`` can be zero; these flags are passed to - the of_phy_attach() inside this function call if a PHY is specified + the phy_attach_direct() inside this function call if a PHY is specified in the DT node ``node``. ``node`` should be the DT node which contains the network phy property, diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c index 4daf94bb56a5..ea9d5855fb52 100644 --- a/drivers/net/mdio/of_mdio.c +++ b/drivers/net/mdio/of_mdio.c @@ -462,36 +462,6 @@ struct phy_device *of_phy_get_and_connect(struct net_device *dev, } EXPORT_SYMBOL(of_phy_get_and_connect); -/** - * of_phy_attach - Attach to a PHY without starting the state machine - * @dev: pointer to net_device claiming the phy - * @phy_np: Node pointer for the PHY - * @flags: flags to pass to the PHY - * @iface: PHY data interface type - * - * If successful, returns a pointer to the phy_device with the embedded - * struct device refcount incremented by one, or NULL on failure. The - * refcount must be dropped by calling phy_disconnect() or phy_detach(). - */ -struct phy_device *of_phy_attach(struct net_device *dev, - struct device_node *phy_np, u32 flags, - phy_interface_t iface) -{ - struct phy_device *phy = of_phy_find_device(phy_np); - int ret; - - if (!phy) - return NULL; - - ret = phy_attach_direct(dev, phy, flags, iface); - - /* refcount is held by phy_attach_direct() on success */ - put_device(&phy->mdio.dev); - - return ret ? NULL : phy; -} -EXPORT_SYMBOL(of_phy_attach); - /* * of_phy_is_fixed_link() and of_phy_register_fixed_link() must * support two DT bindings: diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index cfe8c607a628..2b05e7f7c238 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -26,9 +26,6 @@ of_phy_connect(struct net_device *dev, struct device_node *phy_np, struct phy_device * of_phy_get_and_connect(struct net_device *dev, struct device_node *np, void (*hndlr)(struct net_device *)); -struct phy_device * -of_phy_attach(struct net_device *dev, struct device_node *phy_np, - u32 flags, phy_interface_t iface); struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); int of_phy_register_fixed_link(struct device_node *np); @@ -100,13 +97,6 @@ of_phy_get_and_connect(struct net_device *dev, struct device_node *np, return NULL; } -static inline struct phy_device *of_phy_attach(struct net_device *dev, - struct device_node *phy_np, - u32 flags, phy_interface_t iface) -{ - return NULL; -} - static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np) { return NULL; -- cgit v1.2.3 From 20e07e2c3cf310578ef19fb4f1e64dc9832abd9d Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Wed, 17 Feb 2021 17:57:05 +0800 Subject: net: stmmac: Add PCI bus info to ethtool driver query output This patch populates the PCI bus info in the ethtool driver query data. Users will be able to view PCI bus info using 'ethtool -i '. Signed-off-by: Wong Vee Khee Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 4 ++++ include/linux/stmmac.h | 1 + 3 files changed, 6 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 1c9c67b641a2..751dfdeec41c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -236,6 +236,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, int ret; int i; + plat->pdev = pdev; plat->phy_addr = -1; plat->clk_csr = 5; plat->has_gmac = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 9e54f953634b..c5642985ef95 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -268,6 +268,10 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev, strlcpy(info->driver, MAC100_ETHTOOL_NAME, sizeof(info->driver)); + if (priv->plat->pdev) { + strlcpy(info->bus_info, pci_name(priv->plat->pdev), + sizeof(info->bus_info)); + } strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 15ca6b4167cc..a302982de2d7 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -202,5 +202,6 @@ struct plat_stmmacenet_data { bool vlan_fail_q_en; u8 vlan_fail_q; unsigned int eee_usecs_rate; + struct pci_dev *pdev; }; #endif -- cgit v1.2.3