From 9f460ae31c4435fd022c443a6029352217a16ac1 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 18 May 2021 21:00:27 +0200 Subject: batman-adv: Avoid WARN_ON timing related checks The soft/batadv interface for a queued OGM can be changed during the time the OGM was queued for transmission and when the OGM is actually transmitted by the worker. But WARN_ON must be used to denote kernel bugs and not to print simple warnings. A warning can simply be printed using pr_warn. Reported-by: Tetsuo Handa Reported-by: syzbot+c0b807de416427ff3dd1@syzkaller.appspotmail.com Fixes: ef0a937f7a14 ("batman-adv: consider outgoing interface in OGM sending") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 789f257be24f..fc8be49010b9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -409,8 +409,10 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) if (WARN_ON(!forw_packet->if_outgoing)) return; - if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) + if (forw_packet->if_outgoing->soft_iface != soft_iface) { + pr_warn("%s: soft interface switch for queued OGM\n", __func__); return; + } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) return; -- cgit v1.2.3 From 0ee4d55534f82a0624701d0bb9fc2304d4529086 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 16:47:17 +0200 Subject: mac80211: remove warning in ieee80211_get_sband() Syzbot reports that it's possible to hit this from userspace, by trying to add a station before any other connection setup has been done. Instead of trying to catch this in some other way simply remove the warning, that will appropriately reject the call from userspace. Reported-by: syzbot+7716dbc401d9a437890d@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20210517164715.f537da276d17.Id05f40ec8761d6a8cc2df87f1aa09c651988a586@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 214404a558fb..648696b49f89 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1442,7 +1442,7 @@ ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (WARN_ON_ONCE(!chanctx_conf)) { + if (!chanctx_conf) { rcu_read_unlock(); return NULL; } -- cgit v1.2.3 From bd18de517923903a177508fc8813f44e717b1c00 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 17:04:31 +0200 Subject: mac80211_hwsim: drop pending frames on stop Syzbot reports that we may be able to get into a situation where mac80211 has pending ACK frames on shutdown with hwsim. It appears that the reason for this is that syzbot uses the wmediumd hooks to intercept/injection frames, and may shut down hwsim, removing the radio(s), while frames are pending in the air simulation. Clean out the pending queue when the interface is stopped, after this the frames can't be reported back to mac80211 properly anyway. Reported-by: syzbot+a063bbf0b15737362592@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20210517170429.b0f85ab0eda1.Ie42a6ec6b940c971f3441286aeaaae2fe368e29a@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 51ce767eaf88..7a6fd46d0c6e 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1693,8 +1693,13 @@ static int mac80211_hwsim_start(struct ieee80211_hw *hw) static void mac80211_hwsim_stop(struct ieee80211_hw *hw) { struct mac80211_hwsim_data *data = hw->priv; + data->started = false; hrtimer_cancel(&data->beacon_timer); + + while (!skb_queue_empty(&data->pending)) + ieee80211_free_txskb(hw, skb_dequeue(&data->pending)); + wiphy_dbg(hw->wiphy, "%s\n", __func__); } -- cgit v1.2.3 From 34fb4db5abc1fe6708522cbf13f637e0eefb1a50 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 5 May 2021 13:28:29 -0700 Subject: mac80211: correct ieee80211_iterate_active_interfaces_mtx() locking comments Commit a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") dropped usage of RTNL here and replaced it with hw->wiphy->mutex. But we didn't update the comments. Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Signed-off-by: Brian Norris Link: https://lore.kernel.org/r/20210505202829.1039400-1-briannorris@chromium.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 445b66c6eb7e..e7c59b4e2c44 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5537,7 +5537,7 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. - * This version can only be used while holding the RTNL. + * This version can only be used while holding the wiphy mutex. * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags -- cgit v1.2.3 From a64b6a25dd9f984ed05fade603a00e2eae787d2f Mon Sep 17 00:00:00 2001 From: Du Cheng Date: Wed, 28 Apr 2021 14:39:41 +0800 Subject: cfg80211: call cfg80211_leave_ocb when switching away from OCB If the userland switches back-and-forth between NL80211_IFTYPE_OCB and NL80211_IFTYPE_ADHOC via send_msg(NL80211_CMD_SET_INTERFACE), there is a chance where the cleanup cfg80211_leave_ocb() is not called. This leads to initialization of in-use memory (e.g. init u.ibss while in-use by u.ocb) due to a shared struct/union within ieee80211_sub_if_data: struct ieee80211_sub_if_data { ... union { struct ieee80211_if_ap ap; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; // <- shares address struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; // <- shares address struct ieee80211_if_mntr mntr; struct ieee80211_if_nan nan; } u; ... } Therefore add handling of otype == NL80211_IFTYPE_OCB, during cfg80211_change_iface() to perform cleanup when leaving OCB mode. link to syzkaller bug: https://syzkaller.appspot.com/bug?id=0612dbfa595bf4b9b680ff7b4948257b8e3732d5 Reported-by: syzbot+105896fac213f26056f9@syzkaller.appspotmail.com Signed-off-by: Du Cheng Link: https://lore.kernel.org/r/20210428063941.105161-1-ducheng2@gmail.com Signed-off-by: Johannes Berg --- net/wireless/util.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index 7ec021a610ae..18dba3d7c638 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1059,6 +1059,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; + case NL80211_IFTYPE_OCB: + cfg80211_leave_ocb(rdev, dev); + break; default: break; } -- cgit v1.2.3 From b90f51e8e1f5014c01c82a7bf4c611643d0a8bcb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 26 Apr 2021 21:28:02 +0200 Subject: staging: rtl8723bs: fix monitor netdev register/unregister Due to the locking changes and callbacks happening inside cfg80211, we need to use cfg80211 versions of the register and unregister functions if called within cfg80211 methods, otherwise deadlocks occur. Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210426212801.3d902cc9e6f4.Ie0b1e0c545920c61400a4b7d0f384ea61feb645a@changeid Signed-off-by: Johannes Berg --- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index a6d731e959a2..36a1319ec4bf 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -2284,7 +2284,7 @@ static int rtw_cfg80211_add_monitor_if(struct adapter *padapter, char *name, str mon_wdev->iftype = NL80211_IFTYPE_MONITOR; mon_ndev->ieee80211_ptr = mon_wdev; - ret = register_netdevice(mon_ndev); + ret = cfg80211_register_netdevice(mon_ndev); if (ret) { goto out; } @@ -2360,7 +2360,7 @@ static int cfg80211_rtw_del_virtual_intf(struct wiphy *wiphy, adapter = rtw_netdev_priv(ndev); pwdev_priv = adapter_wdev_data(adapter); - unregister_netdevice(ndev); + cfg80211_unregister_netdevice(ndev); if (ndev == pwdev_priv->pmon_ndev) { pwdev_priv->pmon_ndev = NULL; -- cgit v1.2.3 From e298aa358f0ca658406d524b6639fe389cb6e11e Mon Sep 17 00:00:00 2001 From: Du Cheng Date: Mon, 10 May 2021 12:16:49 +0800 Subject: mac80211: fix skb length check in ieee80211_scan_rx() Replace hard-coded compile-time constants for header length check with dynamic determination based on the frame type. Otherwise, we hit a validation WARN_ON in cfg80211 later. Fixes: cd418ba63f0c ("mac80211: convert S1G beacon to scan results") Reported-by: syzbot+405843667e93b9790fc1@syzkaller.appspotmail.com Signed-off-by: Du Cheng Link: https://lore.kernel.org/r/20210510041649.589754-1-ducheng2@gmail.com [style fixes, reword commit message] Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index d4cc9ac2d703..6b50cb5e0e3c 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -251,13 +251,24 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; + size_t min_hdr_len = offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); + + if (!ieee80211_is_probe_resp(mgmt->frame_control) && + !ieee80211_is_beacon(mgmt->frame_control) && + !ieee80211_is_s1g_beacon(mgmt->frame_control)) + return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { - if (skb->len < 15) - return; - } else if (skb->len < 24 || - (!ieee80211_is_probe_resp(mgmt->frame_control) && - !ieee80211_is_beacon(mgmt->frame_control))) + if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) + min_hdr_len = offsetof(struct ieee80211_ext, + u.s1g_short_beacon.variable); + else + min_hdr_len = offsetof(struct ieee80211_ext, + u.s1g_beacon); + } + + if (skb->len < min_hdr_len) return; sdata1 = rcu_dereference(local->scan_sdata); -- cgit v1.2.3 From bddc0c411a45d3718ac535a070f349be8eca8d48 Mon Sep 17 00:00:00 2001 From: Mathy Vanhoef Date: Sun, 30 May 2021 15:32:26 +0200 Subject: mac80211: Fix NULL ptr deref for injected rate info The commit cb17ed29a7a5 ("mac80211: parse radiotap header when selecting Tx queue") moved the code to validate the radiotap header from ieee80211_monitor_start_xmit to ieee80211_parse_tx_radiotap. This made is possible to share more code with the new Tx queue selection code for injected frames. But at the same time, it now required the call of ieee80211_parse_tx_radiotap at the beginning of functions which wanted to handle the radiotap header. And this broke the rate parser for radiotap header parser. The radiotap parser for rates is operating most of the time only on the data in the actual radiotap header. But for the 802.11a/b/g rates, it must also know the selected band from the chandef information. But this information is only written to the ieee80211_tx_info at the end of the ieee80211_monitor_start_xmit - long after ieee80211_parse_tx_radiotap was already called. The info->band information was therefore always 0 (NL80211_BAND_2GHZ) when the parser code tried to access it. For a 5GHz only device, injecting a frame with 802.11a rates would cause a NULL pointer dereference because local->hw.wiphy->bands[NL80211_BAND_2GHZ] would most likely have been NULL when the radiotap parser searched for the correct rate index of the driver. Cc: stable@vger.kernel.org Reported-by: Ben Greear Fixes: cb17ed29a7a5 ("mac80211: parse radiotap header when selecting Tx queue") Signed-off-by: Mathy Vanhoef [sven@narfation.org: added commit message] Signed-off-by: Sven Eckelmann Link: https://lore.kernel.org/r/20210530133226.40587-1-sven@narfation.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 ++++++- net/mac80211/tx.c | 52 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e7c59b4e2c44..e89530d0d9c6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6392,7 +6392,12 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, /** * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header - * of injected frames + * of injected frames. + * + * To accurately parse and take into account rate and retransmission fields, + * you must initialize the chandef field in the ieee80211_tx_info structure + * of the skb before calling this function. + * * @skb: packet injected by userspace * @dev: the &struct device of this 802.11 device */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 0b719f3d2dec..2651498d05e8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2014,6 +2014,26 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, ieee80211_tx(sdata, sta, skb, false); } +static bool ieee80211_validate_radiotap_len(struct sk_buff *skb) +{ + struct ieee80211_radiotap_header *rthdr = + (struct ieee80211_radiotap_header *)skb->data; + + /* check for not even having the fixed radiotap header part */ + if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) + return false; /* too short to be possibly valid */ + + /* is it a header version we can trust to find length from? */ + if (unlikely(rthdr->it_version)) + return false; /* only version 0 is supported */ + + /* does the skb contain enough to deliver on the alleged length? */ + if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) + return false; /* skb too short for claimed rt header extent */ + + return true; +} + bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev) { @@ -2022,8 +2042,6 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_supported_band *sband = - local->hw.wiphy->bands[info->band]; int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, NULL); u16 txflags; @@ -2036,17 +2054,8 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, u8 vht_mcs = 0, vht_nss = 0; int i; - /* check for not even having the fixed radiotap header part */ - if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) - return false; /* too short to be possibly valid */ - - /* is it a header version we can trust to find length from? */ - if (unlikely(rthdr->it_version)) - return false; /* only version 0 is supported */ - - /* does the skb contain enough to deliver on the alleged length? */ - if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) - return false; /* skb too short for claimed rt header extent */ + if (!ieee80211_validate_radiotap_len(skb)) + return false; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; @@ -2186,6 +2195,9 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, return false; if (rate_found) { + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[info->band]; + info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { @@ -2199,7 +2211,7 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) { ieee80211_rate_set_vht(info->control.rates, vht_mcs, vht_nss); - } else { + } else if (sband) { for (i = 0; i < sband->n_bitrates; i++) { if (rate * 5 != sband->bitrates[i].bitrate) continue; @@ -2236,8 +2248,8 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; - /* Sanity-check and process the injection radiotap header */ - if (!ieee80211_parse_tx_radiotap(skb, dev)) + /* Sanity-check the length of the radiotap header */ + if (!ieee80211_validate_radiotap_len(skb)) goto fail; /* we now know there is a radiotap header with a length we can use */ @@ -2351,6 +2363,14 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, ieee80211_select_queue_80211(sdata, skb, hdr); skb_set_queue_mapping(skb, ieee80211_ac_from_tid(skb->priority)); + /* + * Process the radiotap header. This will now take into account the + * selected chandef above to accurately set injection rates and + * retransmissions. + */ + if (!ieee80211_parse_tx_radiotap(skb, dev)) + goto fail_rcu; + /* remove the injection radiotap header */ skb_pull(skb, len_rthdr); -- cgit v1.2.3 From 51c96a561f244e25a4a2afc7a48b92b4adf8050d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 6 Jun 2021 17:24:22 +0300 Subject: ethtool: Fix NULL pointer dereference during module EEPROM dump When get_module_eeprom_by_page() is not implemented by the driver, NULL pointer dereference can occur [1]. Fix by testing if get_module_eeprom_by_page() is implemented instead of get_module_info(). [1] BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] CPU: 0 PID: 251 Comm: ethtool Not tainted 5.13.0-rc3-custom-00940-g3822d0670c9d #989 Call Trace: eeprom_prepare_data+0x101/0x2d0 ethnl_default_doit+0xc2/0x290 genl_family_rcv_msg_doit+0xdc/0x140 genl_rcv_msg+0xd7/0x1d0 netlink_rcv_skb+0x49/0xf0 genl_rcv+0x1f/0x30 netlink_unicast+0x1f6/0x2c0 netlink_sendmsg+0x1f9/0x400 __sys_sendto+0xe1/0x130 __x64_sys_sendto+0x1b/0x20 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c97a31f66ebc ("ethtool: wire in generic SFP module access") Signed-off-by: Ido Schimmel Acked-by: Moshe Shemesh Signed-off-by: David S. Miller --- net/ethtool/eeprom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 2a6733a6449a..5d38e90895ac 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -95,7 +95,7 @@ static int get_module_eeprom_by_page(struct net_device *dev, if (dev->sfp_bus) return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack); - if (ops->get_module_info) + if (ops->get_module_eeprom_by_page) return ops->get_module_eeprom_by_page(dev, page_data, extack); return -EOPNOTSUPP; -- cgit v1.2.3 From 306b9228c097b4101c150ccd262372ded8348644 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sun, 6 Jun 2021 11:24:30 +0300 Subject: mlxsw: reg: Spectrum-3: Enforce lowest max-shaper burst size of 11 A max-shaper is the HW component responsible for delaying egress traffic above a configured transmission rate. Burst size is the amount of traffic that is allowed to pass without accounting. The burst size value needs to be such that it can be expressed as 2^BS * 512 bits, where BS lies in a certain ASIC-dependent range. mlxsw enforces that this holds before attempting to configure the shaper. The assumption for Spectrum-3 was that the lower limit of BS would be 5, like for Spectrum-1. But as of now, the limit is still 11. Therefore fix the driver accordingly, so that incorrect values are rejected early with a proper message. Fixes: 23effa2479ba ("mlxsw: reg: Add max_shaper_bs to QoS ETS Element Configuration") Reported-by: Maksym Yaremchuk Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 900b4bf5bb5b..2bc5a9003c6d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3907,7 +3907,7 @@ MLXSW_ITEM32(reg, qeec, max_shaper_bs, 0x1C, 0, 6); #define MLXSW_REG_QEEC_HIGHEST_SHAPER_BS 25 #define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP1 5 #define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP2 11 -#define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP3 5 +#define MLXSW_REG_QEEC_LOWEST_SHAPER_BS_SP3 11 static inline void mlxsw_reg_qeec_pack(char *payload, u8 local_port, enum mlxsw_reg_qeec_hr hr, u8 index, -- cgit v1.2.3 From d566ed04e42bbb7144cf52718b77ca5c791abc09 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sun, 6 Jun 2021 11:24:31 +0300 Subject: mlxsw: spectrum_qdisc: Pass handle, not band number to find_class() In mlxsw Qdisc offload, find_class() is an operation that yields a qdisc offload descriptor given a parental qdisc descriptor and a class handle. In __mlxsw_sp_qdisc_ets_graft() however, a band number is passed to that function instead of a handle. This can lead to a trigger of a WARN_ON with the following splat: WARNING: CPU: 3 PID: 808 at drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c:1356 __mlxsw_sp_qdisc_ets_graft+0x115/0x130 [mlxsw_spectrum] [...] Call Trace: mlxsw_sp_setup_tc_prio+0xe3/0x100 [mlxsw_spectrum] qdisc_offload_graft_helper+0x35/0xa0 prio_graft+0x176/0x290 [sch_prio] qdisc_graft+0xb3/0x540 tc_modify_qdisc+0x56a/0x8a0 rtnetlink_rcv_msg+0x12c/0x370 netlink_rcv_skb+0x49/0xf0 netlink_unicast+0x1f6/0x2b0 netlink_sendmsg+0x1fb/0x410 ____sys_sendmsg+0x1f3/0x220 ___sys_sendmsg+0x70/0xb0 __sys_sendmsg+0x54/0xa0 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae Since the parent handle is not passed with the offload information, compute it from the band number and qdisc handle. Fixes: 28052e618b04 ("mlxsw: spectrum_qdisc: Track children per qdisc") Reported-by: Maksym Yaremchuk Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c index 04672eb5c7f3..9958d503bf0e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c @@ -1332,6 +1332,7 @@ __mlxsw_sp_qdisc_ets_graft(struct mlxsw_sp_port *mlxsw_sp_port, u8 band, u32 child_handle) { struct mlxsw_sp_qdisc *old_qdisc; + u32 parent; if (band < mlxsw_sp_qdisc->num_classes && mlxsw_sp_qdisc->qdiscs[band].handle == child_handle) @@ -1352,7 +1353,9 @@ __mlxsw_sp_qdisc_ets_graft(struct mlxsw_sp_port *mlxsw_sp_port, if (old_qdisc) mlxsw_sp_qdisc_destroy(mlxsw_sp_port, old_qdisc); - mlxsw_sp_qdisc = mlxsw_sp_qdisc->ops->find_class(mlxsw_sp_qdisc, band); + parent = TC_H_MAKE(mlxsw_sp_qdisc->handle, band + 1); + mlxsw_sp_qdisc = mlxsw_sp_qdisc->ops->find_class(mlxsw_sp_qdisc, + parent); if (!WARN_ON(!mlxsw_sp_qdisc)) mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc); -- cgit v1.2.3 From 2fd8d84ce3095e8a7b5fe96532c91b1b9e07339c Mon Sep 17 00:00:00 2001 From: Mykola Kostenok Date: Sun, 6 Jun 2021 11:24:32 +0300 Subject: mlxsw: core: Set thermal zone polling delay argument to real value at init Thermal polling delay argument for modules and gearboxes thermal zones used to be initialized with zero value, while actual delay was used to be set by mlxsw_thermal_set_mode() by thermal operation callback set_mode(). After operations set_mode()/get_mode() have been removed by cited commits, modules and gearboxes thermal zones always have polling time set to zero and do not perform temperature monitoring. Set non-zero "polling_delay" in thermal_zone_device_register() routine, thus, the relevant thermal zones will perform thermal monitoring. Cc: Andrzej Pietrasiewicz Fixes: 5d7bd8aa7c35 ("thermal: Simplify or eliminate unnecessary set_mode() methods") Fixes: 1ee14820fd8e ("thermal: remove get_mode() operation of drivers") Signed-off-by: Mykola Kostenok Acked-by: Vadim Pasternak Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index dfea14399607..85f0ce285146 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -693,7 +693,8 @@ mlxsw_thermal_module_tz_init(struct mlxsw_thermal_module *module_tz) MLXSW_THERMAL_TRIP_MASK, module_tz, &mlxsw_thermal_module_ops, - NULL, 0, 0); + NULL, 0, + module_tz->parent->polling_delay); if (IS_ERR(module_tz->tzdev)) { err = PTR_ERR(module_tz->tzdev); return err; @@ -815,7 +816,8 @@ mlxsw_thermal_gearbox_tz_init(struct mlxsw_thermal_module *gearbox_tz) MLXSW_THERMAL_TRIP_MASK, gearbox_tz, &mlxsw_thermal_gearbox_ops, - NULL, 0, 0); + NULL, 0, + gearbox_tz->parent->polling_delay); if (IS_ERR(gearbox_tz->tzdev)) return PTR_ERR(gearbox_tz->tzdev); -- cgit v1.2.3 From a47c397bb29fce1751dc755246a2c8deeca5e38f Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Mon, 7 Jun 2021 21:46:23 +0300 Subject: revert "net: kcm: fix memory leak in kcm_sendmsg" In commit c47cc304990a ("net: kcm: fix memory leak in kcm_sendmsg") I misunderstood the root case of the memory leak and came up with completely broken fix. So, simply revert this commit to avoid GPF reported by syzbot. Im so sorry for this situation. Fixes: c47cc304990a ("net: kcm: fix memory leak in kcm_sendmsg") Reported-by: syzbot+65badd5e74ec62cb67dc@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 1c572c8daced..6201965bd822 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1066,11 +1066,6 @@ out_error: goto partial_message; } - if (skb_has_frag_list(head)) { - kfree_skb_list(skb_shinfo(head)->frag_list); - skb_shinfo(head)->frag_list = NULL; - } - if (head != kcm->seq_skb) kfree_skb(head); -- cgit v1.2.3 From 7a6b1ab7475fd6478eeaf5c9d1163e7a18125c8f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Jun 2021 11:35:30 -0600 Subject: neighbour: allow NUD_NOARP entries to be forced GCed IFF_POINTOPOINT interfaces use NUD_NOARP entries for IPv6. It's possible to fill up the neighbour table with enough entries that it will overflow for valid connections after that. This behaviour is more prevalent after commit 58956317c8de ("neighbor: Improve garbage collection") is applied, as it prevents removal from entries that are not NUD_FAILED, unless they are more than 5s old. Fixes: 58956317c8de (neighbor: Improve garbage collection) Reported-by: Kasper Dupont Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 98f20efbfadf..bf774575ad71 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -238,6 +238,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || + (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || time_after(tref, n->updated)) -- cgit v1.2.3 From 11fc79fc9f2e395aa39fa5baccae62767c5d8280 Mon Sep 17 00:00:00 2001 From: Kev Jackson Date: Mon, 7 Jun 2021 14:08:35 +0100 Subject: libbpf: Fixes incorrect rx_ring_setup_done When calling xsk_socket__create_shared(), the logic at line 1097 marks a boolean flag true within the xsk_umem structure to track setup progress in order to support multiple calls to the function. However, instead of marking umem->tx_ring_setup_done, the code incorrectly sets umem->rx_ring_setup_done. This leads to improper behaviour when creating and destroying xsk and umem structures. Multiple calls to this function is documented as supported. Fixes: ca7a83e2487a ("libbpf: Only create rx and tx XDP rings when necessary") Signed-off-by: Kev Jackson Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/YL4aU4f3Aaik7CN0@linux-dev --- tools/lib/bpf/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 6061431ee04c..e9b619aa0cdf 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -1094,7 +1094,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, goto out_put_ctx; } if (xsk->fd == umem->fd) - umem->rx_ring_setup_done = true; + umem->tx_ring_setup_done = true; } err = xsk_get_mmap_offsets(xsk->fd, &off); -- cgit v1.2.3 From d5befb224edbe53056c2c18999d630dafb4a08b9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 16:03:23 +0200 Subject: mac80211: fix deadlock in AP/VLAN handling Syzbot reports that when you have AP_VLAN interfaces that are up and close the AP interface they belong to, we get a deadlock. No surprise - since we dev_close() them with the wiphy mutex held, which goes back into the netdev notifier in cfg80211 and tries to acquire the wiphy mutex there. To fix this, we need to do two things: 1) prevent changing iftype while AP_VLANs are up, we can't easily fix this case since cfg80211 already calls us with the wiphy mutex held, but change_interface() is relatively rare in drivers anyway, so changing iftype isn't used much (and userspace has to fall back to down/change/up anyway) 2) pull the dev_close() loop over VLANs out of the wiphy mutex section in the normal stop case Cc: stable@vger.kernel.org Reported-by: syzbot+452ea4fbbef700ff0a56@syzkaller.appspotmail.com Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Link: https://lore.kernel.org/r/20210517160322.9b8f356c0222.I392cb0e2fa5a1a94cf2e637555d702c7e512c1ff@changeid Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 2e2f73a4aa73..137fa4c50e07 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -476,14 +476,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do GFP_KERNEL); } - /* APs need special treatment */ if (sdata->vif.type == NL80211_IFTYPE_AP) { - struct ieee80211_sub_if_data *vlan, *tmpsdata; - - /* down all dependent devices, that is VLANs */ - list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, - u.vlan.list) - dev_close(vlan->dev); WARN_ON(!list_empty(&sdata->u.ap.vlans)); } else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { /* remove all packets in parent bc_buf pointing to this dev */ @@ -641,6 +634,15 @@ static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + /* close all dependent VLAN interfaces before locking wiphy */ + if (sdata->vif.type == NL80211_IFTYPE_AP) { + struct ieee80211_sub_if_data *vlan, *tmpsdata; + + list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, + u.vlan.list) + dev_close(vlan->dev); + } + wiphy_lock(sdata->local->hw.wiphy); ieee80211_do_stop(sdata, true); wiphy_unlock(sdata->local->hw.wiphy); @@ -1591,6 +1593,9 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: + if (!list_empty(&sdata->u.ap.vlans)) + return -EBUSY; + break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_OCB: -- cgit v1.2.3 From d612c3f3fae221e7ea736d196581c2217304bbbc Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 8 Jun 2021 09:51:58 +0800 Subject: net: ipv4: fix memory leak in netlbl_cipsov4_add_std Reported by syzkaller: BUG: memory leak unreferenced object 0xffff888105df7000 (size 64): comm "syz-executor842", pid 360, jiffies 4294824824 (age 22.546s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000e67ed558>] kmalloc include/linux/slab.h:590 [inline] [<00000000e67ed558>] kzalloc include/linux/slab.h:720 [inline] [<00000000e67ed558>] netlbl_cipsov4_add_std net/netlabel/netlabel_cipso_v4.c:145 [inline] [<00000000e67ed558>] netlbl_cipsov4_add+0x390/0x2340 net/netlabel/netlabel_cipso_v4.c:416 [<0000000006040154>] genl_family_rcv_msg_doit.isra.0+0x20e/0x320 net/netlink/genetlink.c:739 [<00000000204d7a1c>] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] [<00000000204d7a1c>] genl_rcv_msg+0x2bf/0x4f0 net/netlink/genetlink.c:800 [<00000000c0d6a995>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504 [<00000000d78b9d2c>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:811 [<000000009733081b>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000009733081b>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340 [<00000000d5fd43b8>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929 [<000000000a2d1e40>] sock_sendmsg_nosec net/socket.c:654 [inline] [<000000000a2d1e40>] sock_sendmsg+0x139/0x170 net/socket.c:674 [<00000000321d1969>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350 [<00000000964e16bc>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404 [<000000001615e288>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433 [<000000004ee8b6a5>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47 [<00000000171c7cee>] entry_SYSCALL_64_after_hwframe+0x44/0xae The memory of doi_def->map.std pointing is allocated in netlbl_cipsov4_add_std, but no place has freed it. It should be freed in cipso_v4_doi_free which frees the cipso DOI resource. Fixes: 96cb8e3313c7a ("[NetLabel]: CIPSOv4 and Unlabeled packet integration") Reported-by: Hulk Robot Signed-off-by: Nanyong Sun Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index bfaf327e9d12..e0480c6cebaa 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -472,6 +472,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); break; } kfree(doi_def); -- cgit v1.2.3 From 5ac6b198d7e312bd10ebe7d58c64690dc59cc49a Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 8 Jun 2021 09:53:15 +0800 Subject: net: ipv4: Remove unneed BUG() function When 'nla_parse_nested_deprecated' failed, it's no need to BUG() here, return -EINVAL is ok. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 2e35f68da40a..1c6429c353a9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1989,7 +1989,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) - BUG(); + return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b0ef65eb9bd2..701eb82acd1c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5827,7 +5827,7 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla, return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) - BUG(); + return -EINVAL; if (tb[IFLA_INET6_TOKEN]) { err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]), -- cgit v1.2.3 From d439aa33a9b917cfbca8a528f13367aff974aeb7 Mon Sep 17 00:00:00 2001 From: gushengxian Date: Mon, 7 Jun 2021 19:19:32 -0700 Subject: net: appletalk: fix the usage of preposition The preposition "for" should be changed to preposition "of". Signed-off-by: gushengxian Signed-off-by: David S. Miller --- net/appletalk/aarp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index be18af481d7d..c7236daa2415 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -768,7 +768,7 @@ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, if (a && a->status & ATIF_PROBE) { a->status |= ATIF_PROBE_FAIL; /* - * we do not respond to probe or request packets for + * we do not respond to probe or request packets of * this address while we are probing this address */ goto unlock; -- cgit v1.2.3 From 9bb392f62447d73cc7dd7562413a2cd9104c82f8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 8 Jun 2021 16:59:51 +0200 Subject: vrf: fix maximum MTU My initial goal was to fix the default MTU, which is set to 65536, ie above the maximum defined in the driver: 65535 (ETH_MAX_MTU). In fact, it's seems more consistent, wrt min_mtu, to set the max_mtu to IP6_MAX_MTU (65535 + sizeof(struct ipv6hdr)) and use it by default. Let's also, for consistency, set the mtu in vrf_setup(). This function calls ether_setup(), which set the mtu to 1500. Thus, the whole mtu config is done in the same function. Before the patch: $ ip link add blue type vrf table 1234 $ ip link list blue 9: blue: mtu 65536 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether fa:f5:27:70:24:2a brd ff:ff:ff:ff:ff:ff $ ip link set dev blue mtu 65535 $ ip link set dev blue mtu 65536 Error: mtu greater than device maximum. Fixes: 5055376a3b44 ("net: vrf: Fix ping failed when vrf mtu is set to 0") CC: Miaohe Lin Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 503e2fd7ce51..28a6c4cfe9b8 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1183,9 +1183,6 @@ static int vrf_dev_init(struct net_device *dev) dev->flags = IFF_MASTER | IFF_NOARP; - /* MTU is irrelevant for VRF device; set to 64k similar to lo */ - dev->mtu = 64 * 1024; - /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); @@ -1685,7 +1682,8 @@ static void vrf_setup(struct net_device *dev) * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; - dev->max_mtu = ETH_MAX_MTU; + dev->max_mtu = IP6_MAX_MTU; + dev->mtu = dev->max_mtu; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], -- cgit v1.2.3 From 49bfcbfd989a8f1f23e705759a6bb099de2cff9f Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 8 Jun 2021 11:06:41 +0300 Subject: net: rds: fix memory leak in rds_recvmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzbot reported memory leak in rds. The problem was in unputted refcount in case of error. int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { ... if (!rds_next_incoming(rs, &inc)) { ... } After this "if" inc refcount incremented and if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; goto out; } ... out: return ret; } in case of rds_cmsg_recv() fail the refcount won't be decremented. And it's easy to see from ftrace log, that rds_inc_addref() don't have rds_inc_put() pair in rds_recvmsg() after rds_cmsg_recv() 1) | rds_recvmsg() { 1) 3.721 us | rds_inc_addref(); 1) 3.853 us | rds_message_inc_copy_to_user(); 1) + 10.395 us | rds_cmsg_recv(); 1) + 34.260 us | } Fixes: bdbe6fbc6a2f ("RDS: recv.c") Reported-and-tested-by: syzbot+5134cdf021c4ed5aaa5f@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Reviewed-by: Håkon Bugge Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/recv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/recv.c b/net/rds/recv.c index 4db109fb6ec2..5b426dc3634d 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -714,7 +714,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; - goto out; + break; } rds_recvmsg_zcookie(rs, msg); -- cgit v1.2.3 From 1650bdb1c516c248fb06f6d076559ff6437a5853 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 8 Jun 2021 14:15:35 +0300 Subject: net: dsa: felix: re-enable TX flow control in ocelot_port_flush() Because flow control is set up statically in ocelot_init_port(), and not in phylink_mac_link_up(), what happens is that after the blamed commit, the flow control remains disabled after the port flushing procedure. Fixes: eb4733d7cffc ("net: dsa: felix: implement port flushing on .phylink_mac_link_down") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 0c4283319d7f..adfb9781799e 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -379,6 +379,7 @@ static u32 ocelot_read_eq_avail(struct ocelot *ocelot, int port) int ocelot_port_flush(struct ocelot *ocelot, int port) { + unsigned int pause_ena; int err, val; /* Disable dequeuing from the egress queues */ @@ -387,6 +388,7 @@ int ocelot_port_flush(struct ocelot *ocelot, int port) QSYS_PORT_MODE, port); /* Disable flow control */ + ocelot_fields_read(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, &pause_ena); ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 0); /* Disable priority flow control */ @@ -422,6 +424,9 @@ int ocelot_port_flush(struct ocelot *ocelot, int port) /* Clear flushing again. */ ocelot_rmw_gix(ocelot, 0, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG, port); + /* Re-enable flow control */ + ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, pause_ena); + return err; } EXPORT_SYMBOL(ocelot_port_flush); -- cgit v1.2.3 From 504fd6a5390c30b1b7670768e314dd5d473da06a Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Tue, 8 Jun 2021 19:42:54 +0300 Subject: net: ena: fix DMA mapping function issues in XDP This patch fixes several bugs found when (DMA/LLQ) mapping a packet for transmission. The mapping procedure makes the transmitted packet accessible by the device. When using LLQ, this requires copying the packet's header to push header (which would be passed to LLQ) and creating DMA mapping for the payload (if the packet doesn't fit the maximum push length). When not using LLQ, we map the whole packet with DMA. The following bugs are fixed in the code: 1. Add support for non-LLQ machines: The ena_xdp_tx_map_frame() function assumed that LLQ is supported, and never mapped the whole packet using DMA. On some instances, which don't support LLQ, this causes loss of traffic. 2. Wrong DMA buffer length passed to device: When using LLQ, the first 'tx_max_header_size' bytes of the packet would be copied to push header. The rest of the packet would be copied to a DMA'd buffer. 3. Freeing the XDP buffer twice in case of a mapping error: In case a buffer DMA mapping fails, the function uses xdp_return_frame_rx_napi() to free the RX buffer and returns from the function with an error. XDP frames that fail to xmit get freed by the kernel and so there is no need for this call. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Shay Agroskin Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 54 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 881f88754bf6..52571486705e 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -236,36 +236,48 @@ static int ena_xdp_io_poll(struct napi_struct *napi, int budget) static int ena_xdp_tx_map_frame(struct ena_ring *xdp_ring, struct ena_tx_buffer *tx_info, struct xdp_frame *xdpf, - void **push_hdr, - u32 *push_len) + struct ena_com_tx_ctx *ena_tx_ctx) { struct ena_adapter *adapter = xdp_ring->adapter; struct ena_com_buf *ena_buf; - dma_addr_t dma = 0; + int push_len = 0; + dma_addr_t dma; + void *data; u32 size; tx_info->xdpf = xdpf; + data = tx_info->xdpf->data; size = tx_info->xdpf->len; - ena_buf = tx_info->bufs; - /* llq push buffer */ - *push_len = min_t(u32, size, xdp_ring->tx_max_header_size); - *push_hdr = tx_info->xdpf->data; + if (xdp_ring->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) { + /* Designate part of the packet for LLQ */ + push_len = min_t(u32, size, xdp_ring->tx_max_header_size); + + ena_tx_ctx->push_header = data; + + size -= push_len; + data += push_len; + } + + ena_tx_ctx->header_len = push_len; - if (size - *push_len > 0) { + if (size > 0) { dma = dma_map_single(xdp_ring->dev, - *push_hdr + *push_len, - size - *push_len, + data, + size, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(xdp_ring->dev, dma))) goto error_report_dma_error; - tx_info->map_linear_data = 1; - tx_info->num_of_bufs = 1; - } + tx_info->map_linear_data = 0; - ena_buf->paddr = dma; - ena_buf->len = size; + ena_buf = tx_info->bufs; + ena_buf->paddr = dma; + ena_buf->len = size; + + ena_tx_ctx->ena_bufs = ena_buf; + ena_tx_ctx->num_bufs = tx_info->num_of_bufs = 1; + } return 0; @@ -274,10 +286,6 @@ error_report_dma_error: &xdp_ring->syncp); netif_warn(adapter, tx_queued, adapter->netdev, "Failed to map xdp buff\n"); - xdp_return_frame_rx_napi(tx_info->xdpf); - tx_info->xdpf = NULL; - tx_info->num_of_bufs = 0; - return -EINVAL; } @@ -289,8 +297,6 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, struct ena_com_tx_ctx ena_tx_ctx = {}; struct ena_tx_buffer *tx_info; u16 next_to_use, req_id; - void *push_hdr; - u32 push_len; int rc; next_to_use = xdp_ring->next_to_use; @@ -298,15 +304,11 @@ static int ena_xdp_xmit_frame(struct ena_ring *xdp_ring, tx_info = &xdp_ring->tx_buffer_info[req_id]; tx_info->num_of_bufs = 0; - rc = ena_xdp_tx_map_frame(xdp_ring, tx_info, xdpf, &push_hdr, &push_len); + rc = ena_xdp_tx_map_frame(xdp_ring, tx_info, xdpf, &ena_tx_ctx); if (unlikely(rc)) return rc; - ena_tx_ctx.ena_bufs = tx_info->bufs; - ena_tx_ctx.push_header = push_hdr; - ena_tx_ctx.num_bufs = tx_info->num_of_bufs; ena_tx_ctx.req_id = req_id; - ena_tx_ctx.header_len = push_len; rc = ena_xmit_common(dev, xdp_ring, -- cgit v1.2.3 From f2386cf7c5f4ff5d7b584f5d92014edd7df6c676 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Tue, 8 Jun 2021 23:21:07 +0200 Subject: net: lantiq: disable interrupt before sheduling NAPI This patch fixes TX hangs with threaded NAPI enabled. The scheduled NAPI seems to be executed in parallel with the interrupt on second thread. Sometimes it happens that ltq_dma_disable_irq() is executed after xrx200_tx_housekeeping(). The symptom is that TX interrupts are disabled in the DMA controller. As a result, the TX hangs after a few seconds of the iperf test. Scheduling NAPI after disabling interrupts fixes this issue. Tested on Lantiq xRX200 (BT Home Hub 5A). Fixes: 9423361da523 ("net: lantiq: Disable IRQs only if NAPI gets scheduled ") Signed-off-by: Aleksander Jan Bajkowski Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/ethernet/lantiq_xrx200.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/lantiq_xrx200.c b/drivers/net/ethernet/lantiq_xrx200.c index 36dc3e5f6218..0e10d8aeffe1 100644 --- a/drivers/net/ethernet/lantiq_xrx200.c +++ b/drivers/net/ethernet/lantiq_xrx200.c @@ -352,8 +352,8 @@ static irqreturn_t xrx200_dma_irq(int irq, void *ptr) struct xrx200_chan *ch = ptr; if (napi_schedule_prep(&ch->napi)) { - __napi_schedule(&ch->napi); ltq_dma_disable_irq(&ch->dma); + __napi_schedule(&ch->napi); } ltq_dma_ack_irq(&ch->dma); -- cgit v1.2.3 From adaed1b9daf5a045be71e923e04b5069d2bee664 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Jun 2021 11:32:27 +0200 Subject: mac80211: fix 'reset' debugfs locking cfg80211 now calls suspend/resume with the wiphy lock held, and while there's a problem with that needing to be fixed, we should do the same in debugfs. Cc: stable@vger.kernel.org Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Link: https://lore.kernel.org/r/20210608113226.14020430e449.I78e19db0a55a8295a376e15ac4cf77dbb4c6fb51@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 9245c0421bda..b5ff61b6448a 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -4,7 +4,7 @@ * * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2019, 2021 Intel Corporation */ #include @@ -389,8 +389,10 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf, struct ieee80211_local *local = file->private_data; rtnl_lock(); + wiphy_lock(local->hw.wiphy); __ieee80211_suspend(&local->hw, NULL); __ieee80211_resume(&local->hw); + wiphy_unlock(local->hw.wiphy); rtnl_unlock(); return count; -- cgit v1.2.3 From 43076c1e074359f11c85d7d1b85ede1bbb8ee6b9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Jun 2021 11:32:28 +0200 Subject: cfg80211: fix phy80211 symlink creation When I moved around the code here, I neglected that we could still call register_netdev() or similar without the wiphy mutex held, which then calls cfg80211_register_wdev() - that's also done from cfg80211_register_netdevice(), but the phy80211 symlink creation was only there. Now, the symlink isn't needed for a *pure* wdev, but a netdev not registered via cfg80211_register_wdev() should still have the symlink, so move the creation to the right place. Cc: stable@vger.kernel.org Fixes: 2fe8ef106238 ("cfg80211: change netdev registration/unregistration semantics") Link: https://lore.kernel.org/r/20210608113226.a5dc4c1e488c.Ia42fe663cefe47b0883af78c98f284c5555bbe5d@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index 6fbf7537faf5..8d0883e81093 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1340,6 +1340,11 @@ void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, rdev->devlist_generation++; wdev->registered = true; + if (wdev->netdev && + sysfs_create_link(&wdev->netdev->dev.kobj, &rdev->wiphy.dev.kobj, + "phy80211")) + pr_err("failed to add phy80211 symlink to netdev!\n"); + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } @@ -1365,14 +1370,6 @@ int cfg80211_register_netdevice(struct net_device *dev) if (ret) goto out; - if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, - "phy80211")) { - pr_err("failed to add phy80211 symlink to netdev!\n"); - unregister_netdevice(dev); - ret = -EINVAL; - goto out; - } - cfg80211_register_wdev(rdev, wdev); ret = 0; out: -- cgit v1.2.3 From 65bec836da8394b1d56bdec2c478dcac21cf12a4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Jun 2021 11:32:29 +0200 Subject: cfg80211: shut down interfaces on failed resume If resume fails, we should shut down all interfaces as the hardware is probably dead. This was/is already done now in mac80211, but we need to change that due to locking issues, so move it here and do it without the wiphy lock held. Cc: stable@vger.kernel.org Fixes: 2fe8ef106238 ("cfg80211: change netdev registration/unregistration semantics") Link: https://lore.kernel.org/r/20210608113226.d564ca69de7c.I2e3c3e5d410b72a4f63bade4fb075df041b3d92f@changeid Signed-off-by: Johannes Berg --- net/wireless/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 9b959e3b09c6..0c3f05c9be27 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -133,6 +133,10 @@ static int wiphy_resume(struct device *dev) if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); wiphy_unlock(&rdev->wiphy); + + if (ret) + cfg80211_shutdown_all_interfaces(&rdev->wiphy); + rtnl_unlock(); return ret; -- cgit v1.2.3 From f5baf287f5da5641099ad5c809b3b4ebfc08506d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Jun 2021 11:32:30 +0200 Subject: mac80211: move interface shutdown out of wiphy lock When reconfiguration fails, we shut down everything, but we cannot call cfg80211_shutdown_all_interfaces() with the wiphy mutex held. Since cfg80211 now calls it on resume errors, we only need to do likewise for where we call reconfig (whether directly or indirectly), but not under the wiphy lock. Cc: stable@vger.kernel.org Fixes: 2fe8ef106238 ("cfg80211: change netdev registration/unregistration semantics") Link: https://lore.kernel.org/r/20210608113226.78233c80f548.Iecc104aceb89f0568f50e9670a9cb191a1c8887b@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 7 ++++++- net/mac80211/main.c | 7 ++++++- net/mac80211/util.c | 2 -- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b5ff61b6448a..fc34ae2b604c 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -387,12 +387,17 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; + int ret; rtnl_lock(); wiphy_lock(local->hw.wiphy); __ieee80211_suspend(&local->hw, NULL); - __ieee80211_resume(&local->hw); + ret = __ieee80211_resume(&local->hw); wiphy_unlock(local->hw.wiphy); + + if (ret) + cfg80211_shutdown_all_interfaces(local->hw.wiphy); + rtnl_unlock(); return count; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 62145e5f9628..f33a3acd7f96 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -252,6 +252,7 @@ static void ieee80211_restart_work(struct work_struct *work) struct ieee80211_local *local = container_of(work, struct ieee80211_local, restart_work); struct ieee80211_sub_if_data *sdata; + int ret; /* wait for scan work complete */ flush_workqueue(local->workqueue); @@ -301,8 +302,12 @@ static void ieee80211_restart_work(struct work_struct *work) /* wait for all packet processing to be done */ synchronize_net(); - ieee80211_reconfig(local); + ret = ieee80211_reconfig(local); wiphy_unlock(local->hw.wiphy); + + if (ret) + cfg80211_shutdown_all_interfaces(local->hw.wiphy); + rtnl_unlock(); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 0a0481f5af48..93d96a4f9c3e 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2178,8 +2178,6 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) list_for_each_entry(ctx, &local->chanctx_list, list) ctx->driver_present = false; mutex_unlock(&local->chanctx_mtx); - - cfg80211_shutdown_all_interfaces(local->hw.wiphy); } static void ieee80211_assign_chanctx(struct ieee80211_local *local, -- cgit v1.2.3 From a9799541ca34652d9996e45f80e8e03144c12949 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 9 Jun 2021 16:13:06 +0200 Subject: mac80211: drop multicast fragments These are not permitted by the spec, just drop them. Link: https://lore.kernel.org/r/20210609161305.23def022b750.Ibd6dd3cdce573dae262fcdc47f8ac52b883a9c50@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1bb43edd47b6..af0ef456eb0f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2240,17 +2240,15 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) sc = le16_to_cpu(hdr->seq_ctrl); frag = sc & IEEE80211_SCTL_FRAG; - if (is_multicast_ether_addr(hdr->addr1)) { - I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount); - goto out_no_led; - } - if (rx->sta) cache = &rx->sta->frags; if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) goto out; + if (is_multicast_ether_addr(hdr->addr1)) + return RX_DROP_MONITOR; + I802_DEBUG_INC(rx->local->rx_handlers_fragments); if (skb_linearize(rx->skb)) @@ -2376,7 +2374,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) out: ieee80211_led_rx(rx->local); - out_no_led: if (rx->sta) rx->sta->rx_stats.packets++; return RX_CONTINUE; -- cgit v1.2.3 From ad9f151e560b016b6ad3280b48e42fa11e1a5440 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 4 Jun 2021 03:07:28 +0200 Subject: netfilter: nf_tables: initialize set before expression setup nft_set_elem_expr_alloc() needs an initialized set if expression sets on the NFT_EXPR_GC flag. Move set fields initialization before expression setup. [4512935.019450] ================================================================== [4512935.019456] BUG: KASAN: null-ptr-deref in nft_set_elem_expr_alloc+0x84/0xd0 [nf_tables] [4512935.019487] Read of size 8 at addr 0000000000000070 by task nft/23532 [4512935.019494] CPU: 1 PID: 23532 Comm: nft Not tainted 5.12.0-rc4+ #48 [...] [4512935.019502] Call Trace: [4512935.019505] dump_stack+0x89/0xb4 [4512935.019512] ? nft_set_elem_expr_alloc+0x84/0xd0 [nf_tables] [4512935.019536] ? nft_set_elem_expr_alloc+0x84/0xd0 [nf_tables] [4512935.019560] kasan_report.cold.12+0x5f/0xd8 [4512935.019566] ? nft_set_elem_expr_alloc+0x84/0xd0 [nf_tables] [4512935.019590] nft_set_elem_expr_alloc+0x84/0xd0 [nf_tables] [4512935.019615] nf_tables_newset+0xc7f/0x1460 [nf_tables] Reported-by: syzbot+ce96ca2b1d0b37c6422d@syzkaller.appspotmail.com Fixes: 65038428b2c6 ("netfilter: nf_tables: allow to specify stateful expression in set definition") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 85 ++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 72bc759179ef..bf4d6ec9fc55 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4364,13 +4364,45 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_set_alloc_name(&ctx, set, name); kfree(name); if (err < 0) - goto err_set_alloc_name; + goto err_set_name; + + udata = NULL; + if (udlen) { + udata = set->data + size; + nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen); + } + + INIT_LIST_HEAD(&set->bindings); + INIT_LIST_HEAD(&set->catchall_list); + set->table = table; + write_pnet(&set->net, net); + set->ops = ops; + set->ktype = ktype; + set->klen = desc.klen; + set->dtype = dtype; + set->objtype = objtype; + set->dlen = desc.dlen; + set->flags = flags; + set->size = desc.size; + set->policy = policy; + set->udlen = udlen; + set->udata = udata; + set->timeout = timeout; + set->gc_int = gc_int; + + set->field_count = desc.field_count; + for (i = 0; i < desc.field_count; i++) + set->field_len[i] = desc.field_len[i]; + + err = ops->init(set, &desc, nla); + if (err < 0) + goto err_set_init; if (nla[NFTA_SET_EXPR]) { expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); if (IS_ERR(expr)) { err = PTR_ERR(expr); - goto err_set_alloc_name; + goto err_set_expr_alloc; } set->exprs[0] = expr; set->num_exprs++; @@ -4381,75 +4413,44 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_EXPR)) { err = -EINVAL; - goto err_set_alloc_name; + goto err_set_expr_alloc; } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { err = -E2BIG; - goto err_set_init; + goto err_set_expr_alloc; } if (nla_type(tmp) != NFTA_LIST_ELEM) { err = -EINVAL; - goto err_set_init; + goto err_set_expr_alloc; } expr = nft_set_elem_expr_alloc(&ctx, set, tmp); if (IS_ERR(expr)) { err = PTR_ERR(expr); - goto err_set_init; + goto err_set_expr_alloc; } set->exprs[i++] = expr; set->num_exprs++; } } - udata = NULL; - if (udlen) { - udata = set->data + size; - nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen); - } - - INIT_LIST_HEAD(&set->bindings); - INIT_LIST_HEAD(&set->catchall_list); - set->table = table; - write_pnet(&set->net, net); - set->ops = ops; - set->ktype = ktype; - set->klen = desc.klen; - set->dtype = dtype; - set->objtype = objtype; - set->dlen = desc.dlen; - set->flags = flags; - set->size = desc.size; - set->policy = policy; - set->udlen = udlen; - set->udata = udata; - set->timeout = timeout; - set->gc_int = gc_int; set->handle = nf_tables_alloc_handle(table); - set->field_count = desc.field_count; - for (i = 0; i < desc.field_count; i++) - set->field_len[i] = desc.field_len[i]; - - err = ops->init(set, &desc, nla); - if (err < 0) - goto err_set_init; - err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err_set_trans; + goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; -err_set_trans: - ops->destroy(set); -err_set_init: +err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); -err_set_alloc_name: + + ops->destroy(set); +err_set_init: kfree(set->name); err_set_name: kvfree(set); -- cgit v1.2.3 From 82944421243e5988898f54266687586ba07d889e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Jun 2021 13:48:17 +0200 Subject: selftests: netfilter: add fib test case There is a bug report on netfilter.org bugzilla pointing to fib expression dropping ipv6 DAD packets. Add a test case that demonstrates this problem. Next patch excludes icmpv6 packets coming from any to linklocal. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- tools/testing/selftests/netfilter/nft_fib.sh | 221 +++++++++++++++++++++++++++ 2 files changed, 222 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nft_fib.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3171069a6b46..cd6430b39982 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for netfilter selftests -TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ +TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh new file mode 100755 index 000000000000..6caf6ac8c285 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_fib.sh @@ -0,0 +1,221 @@ +#!/bin/bash +# +# This tests the fib expression. +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsrouter="nsrouter-$sfx" +timeout=4 + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +cleanup() +{ + ip netns del ${ns1} + ip netns del ${ns2} + ip netns del ${nsrouter} + + [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add ${nsrouter} +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +trap cleanup EXIT + +dmesg | grep -q ' nft_rpfilter: ' +if [ $? -eq 0 ]; then + dmesg -c | grep ' nft_rpfilter: ' + echo "WARN: a previous test run has failed" 1>&2 +fi + +sysctl -q net.netfilter.nf_log_all_netns=1 +ip netns add ${ns1} +ip netns add ${ns2} + +load_ruleset() { + local netns=$1 + +ip netns exec ${netns} nft -f /dev/stdin <&2 + ip netns exec ${ns} nft list table inet filter + return 1 + fi + + if [ $want -gt 0 ]; then + echo "PASS: fib expression did drop packets for $address" + fi + + return 0 +} + +load_ruleset ${nsrouter} +load_ruleset ${ns1} +load_ruleset ${ns2} + +ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} + +ip -net ${nsrouter} link set lo up +ip -net ${nsrouter} link set veth0 up +ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 + +ip -net ${nsrouter} link set veth1 up +ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set eth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set eth0 up + +ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} route add default via 10.0.1.1 +ip -net ${ns1} route add default via dead:1::1 + +ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} route add default via 10.0.2.1 +ip -net ${ns2} route add default via dead:2::1 + +test_ping() { + local daddr4=$1 + local daddr6=$2 + + ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null + ret=$? + if [ $ret -ne 0 ];then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 + return 1 + fi + + ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null + ret=$? + if [ $ret -ne 0 ];then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 + return 1 + fi + + return 0 +} + +ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +sleep 3 + +test_ping 10.0.2.1 dead:2::1 || exit 1 +check_drops || exit 1 + +test_ping 10.0.2.99 dead:2::99 || exit 1 +check_drops || exit 1 + +echo "PASS: fib expression did not cause unwanted packet drops" + +ip netns exec ${nsrouter} nft flush table inet filter + +ip -net ${ns1} route del default +ip -net ${ns1} -6 route del default + +ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr del dead:1::99/64 dev eth0 + +ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns1} addr add dead:2::99/64 dev eth0 + +ip -net ${ns1} route add default via 10.0.2.1 +ip -net ${ns1} -6 route add default via dead:2::1 + +ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 + +# switch to ruleset that doesn't log, this time +# its expected that this does drop the packets. +load_ruleset_count ${nsrouter} + +# ns1 has a default route, but nsrouter does not. +# must not check return value, ping to 1.1.1.1 will +# fail. +check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 +check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 + +ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null +check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 + +sleep 2 +ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null +check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 + +exit 0 -- cgit v1.2.3 From 12f36e9bf678a81d030ca1b693dcda62b55af7c5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Jun 2021 13:48:18 +0200 Subject: netfilter: nft_fib_ipv6: skip ipv6 packets from any to link-local The ip6tables rpfilter match has an extra check to skip packets with "::" source address. Extend this to ipv6 fib expression. Else ipv6 duplicate address detection packets will fail rpf route check -- lookup returns -ENETUNREACH. While at it, extend the prerouting check to also cover the ingress hook. Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1543 Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nft_fib_ipv6.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index e204163c7036..92f3235fa287 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -135,6 +135,17 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, } EXPORT_SYMBOL_GPL(nft_fib6_eval_type); +static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const struct ipv6hdr *iph) +{ + if (likely(next != IPPROTO_ICMPV6)) + return false; + + if (ipv6_addr_type(&iph->saddr) != IPV6_ADDR_ANY) + return false; + + return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL; +} + void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -163,10 +174,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && - nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; + if (nft_hook(pkt) == NF_INET_PRE_ROUTING || + nft_hook(pkt) == NF_INET_INGRESS) { + if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) || + nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } } *dest = 0; -- cgit v1.2.3 From ebc5399ea1dfcddac31974091086a3379141899b Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 20 May 2021 08:34:59 +0200 Subject: ice: add ndo_bpf callback for safe mode netdev ops ice driver requires a programmable pipeline firmware package in order to have a support for advanced features. Otherwise, driver falls back to so called 'safe mode'. For that mode, ndo_bpf callback is not exposed and when user tries to load XDP program, the following happens: $ sudo ./xdp1 enp179s0f1 libbpf: Kernel error message: Underlying driver does not support XDP in native mode link set xdp fd failed which is sort of confusing, as there is a native XDP support, but not in the current mode. Improve the user experience by providing the specific ndo_bpf callback dedicated for safe mode which will make use of extack to explicitly let the user know that the DDP package is missing and that's the reason that the XDP can't be loaded onto interface currently. Cc: Jamal Hadi Salim Fixes: efc2214b6047 ("ice: Add support for XDP") Signed-off-by: Maciej Fijalkowski Tested-by: Kiran Bhandare Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 4ee85a217c6f..0eb2307325d3 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2555,6 +2555,20 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, return (ret || xdp_ring_err) ? -ENOMEM : 0; } +/** + * ice_xdp_safe_mode - XDP handler for safe mode + * @dev: netdevice + * @xdp: XDP command + */ +static int ice_xdp_safe_mode(struct net_device __always_unused *dev, + struct netdev_bpf *xdp) +{ + NL_SET_ERR_MSG_MOD(xdp->extack, + "Please provide working DDP firmware package in order to use XDP\n" + "Refer to Documentation/networking/device_drivers/ethernet/intel/ice.rst"); + return -EOPNOTSUPP; +} + /** * ice_xdp - implements XDP handler * @dev: netdevice @@ -6937,6 +6951,7 @@ static const struct net_device_ops ice_netdev_safe_mode_ops = { .ndo_change_mtu = ice_change_mtu, .ndo_get_stats64 = ice_get_stats64, .ndo_tx_timeout = ice_tx_timeout, + .ndo_bpf = ice_xdp_safe_mode, }; static const struct net_device_ops ice_netdev_ops = { -- cgit v1.2.3 From 2e84f6b3773f43263124c76499c0c4ec3f40aa9b Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 20 May 2021 08:35:00 +0200 Subject: ice: parameterize functions responsible for Tx ring management Commit ae15e0ba1b33 ("ice: Change number of XDP Tx queues to match number of Rx queues") tried to address the incorrect setting of XDP queue count that was based on the Tx queue count, whereas in theory we should provide the XDP queue per Rx queue. However, the routines that setup and destroy the set of Tx resources are still based on the vsi->num_txq. Ice supports the asynchronous Tx/Rx queue count, so for a setup where vsi->num_txq > vsi->num_rxq, ice_vsi_stop_tx_rings and ice_vsi_cfg_txqs will be accessing the vsi->xdp_rings out of the bounds. Parameterize two mentioned functions so they get the size of Tx resources array as the input. Fixes: ae15e0ba1b33 ("ice: Change number of XDP Tx queues to match number of Rx queues") Signed-off-by: Maciej Fijalkowski Tested-by: Kiran Bhandare Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lib.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index d70ee573fde5..27f9dac8719c 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1717,12 +1717,13 @@ setup_rings: * ice_vsi_cfg_txqs - Configure the VSI for Tx * @vsi: the VSI being configured * @rings: Tx ring array to be configured + * @count: number of Tx ring array elements * * Return 0 on success and a negative value on error * Configure the Tx VSI for operation. */ static int -ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings) +ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings, u16 count) { struct ice_aqc_add_tx_qgrp *qg_buf; u16 q_idx = 0; @@ -1734,7 +1735,7 @@ ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings) qg_buf->num_txqs = 1; - for (q_idx = 0; q_idx < vsi->num_txq; q_idx++) { + for (q_idx = 0; q_idx < count; q_idx++) { err = ice_vsi_cfg_txq(vsi, rings[q_idx], qg_buf); if (err) goto err_cfg_txqs; @@ -1754,7 +1755,7 @@ err_cfg_txqs: */ int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi) { - return ice_vsi_cfg_txqs(vsi, vsi->tx_rings); + return ice_vsi_cfg_txqs(vsi, vsi->tx_rings, vsi->num_txq); } /** @@ -1769,7 +1770,7 @@ int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi) int ret; int i; - ret = ice_vsi_cfg_txqs(vsi, vsi->xdp_rings); + ret = ice_vsi_cfg_txqs(vsi, vsi->xdp_rings, vsi->num_xdp_txq); if (ret) return ret; @@ -2009,17 +2010,18 @@ int ice_vsi_stop_all_rx_rings(struct ice_vsi *vsi) * @rst_src: reset source * @rel_vmvf_num: Relative ID of VF/VM * @rings: Tx ring array to be stopped + * @count: number of Tx ring array elements */ static int ice_vsi_stop_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, - u16 rel_vmvf_num, struct ice_ring **rings) + u16 rel_vmvf_num, struct ice_ring **rings, u16 count) { u16 q_idx; if (vsi->num_txq > ICE_LAN_TXQ_MAX_QDIS) return -EINVAL; - for (q_idx = 0; q_idx < vsi->num_txq; q_idx++) { + for (q_idx = 0; q_idx < count; q_idx++) { struct ice_txq_meta txq_meta = { }; int status; @@ -2047,7 +2049,7 @@ int ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, u16 rel_vmvf_num) { - return ice_vsi_stop_tx_rings(vsi, rst_src, rel_vmvf_num, vsi->tx_rings); + return ice_vsi_stop_tx_rings(vsi, rst_src, rel_vmvf_num, vsi->tx_rings, vsi->num_txq); } /** @@ -2056,7 +2058,7 @@ ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, */ int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi) { - return ice_vsi_stop_tx_rings(vsi, ICE_NO_RESET, 0, vsi->xdp_rings); + return ice_vsi_stop_tx_rings(vsi, ICE_NO_RESET, 0, vsi->xdp_rings, vsi->num_xdp_txq); } /** -- cgit v1.2.3 From 80ec82e3d2c1fab42eeb730aaa7985494a963d3f Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Wed, 9 Jun 2021 03:34:25 +0100 Subject: net: ethtool: clear heap allocations for ethtool function Several ethtool functions leave heap uncleared (potentially) by drivers. This will leave the unused portion of heap unchanged and might copy the full contents back to userspace. Signed-off-by: Austin Kim Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 3fa7a394eabf..baa5d10043cb 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1421,7 +1421,7 @@ static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, if (eeprom.offset + eeprom.len > total_len) return -EINVAL; - data = kmalloc(PAGE_SIZE, GFP_USER); + data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; @@ -1486,7 +1486,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) return -EINVAL; - data = kmalloc(PAGE_SIZE, GFP_USER); + data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; @@ -1765,7 +1765,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) return -EFAULT; test.len = test_len; - data = kmalloc_array(test_len, sizeof(u64), GFP_USER); + data = kcalloc(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM; @@ -2293,7 +2293,7 @@ static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) ret = ethtool_tunable_valid(&tuna); if (ret) return ret; - data = kmalloc(tuna.len, GFP_USER); + data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; ret = ops->get_tunable(dev, &tuna, data); @@ -2485,7 +2485,7 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr) ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; - data = kmalloc(tuna.len, GFP_USER); + data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; if (phy_drv_tunable) { -- cgit v1.2.3 From dcd01eeac14486b56a790f5cce9b823440ba5b34 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jun 2021 00:59:45 -0700 Subject: inet: annotate data race in inet_send_prepare() and inet_dgram_connect() Both functions are known to be racy when reading inet_num as we do not want to grab locks for the common case the socket has been bound already. The race is resolved in inet_autobind() by reading again inet_num under the socket lock. syzbot reported: BUG: KCSAN: data-race in inet_send_prepare / udp_lib_get_port write to 0xffff88812cba150e of 2 bytes by task 24135 on cpu 0: udp_lib_get_port+0x4b2/0xe20 net/ipv4/udp.c:308 udp_v6_get_port+0x5e/0x70 net/ipv6/udp.c:89 inet_autobind net/ipv4/af_inet.c:183 [inline] inet_send_prepare+0xd0/0x210 net/ipv4/af_inet.c:807 inet6_sendmsg+0x29/0x80 net/ipv6/af_inet6.c:639 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88812cba150e of 2 bytes by task 24132 on cpu 1: inet_send_prepare+0x21/0x210 net/ipv4/af_inet.c:806 inet6_sendmsg+0x29/0x80 net/ipv6/af_inet6.c:639 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000 -> 0x9db4 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24132 Comm: syz-executor.2 Not tainted 5.13.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f17870ee558b..2f94d221c00e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -575,7 +575,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, return err; } - if (!inet_sk(sk)->inet_num && inet_autobind(sk)) + if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); } @@ -803,7 +803,7 @@ int inet_send_prepare(struct sock *sk) sock_rps_record_flow(sk); /* We may need to bind the socket. */ - if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && + if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && inet_autobind(sk)) return -EAGAIN; -- cgit v1.2.3 From a8b897c7bcd47f4147d066e22cc01d1026d7640e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Jun 2021 11:49:01 +0200 Subject: udp: fix race between close() and udp_abort() Kaustubh reported and diagnosed a panic in udp_lib_lookup(). The root cause is udp_abort() racing with close(). Both racing functions acquire the socket lock, but udp{v6}_destroy_sock() release it before performing destructive actions. We can't easily extend the socket lock scope to avoid the race, instead use the SOCK_DEAD flag to prevent udp_abort from doing any action when the critical race happens. Diagnosed-and-tested-by: Kaustubh Pandey Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 10 ++++++++++ net/ipv6/udp.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 15f5504adf5b..1307ad0d3b9e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2607,6 +2607,9 @@ void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_branch_unlikely(&udp_encap_needed_key)) { @@ -2857,10 +2860,17 @@ int udp_abort(struct sock *sk, int err) { lock_sock(sk); + /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing + * with close() + */ + if (sock_flag(sk, SOCK_DEAD)) + goto out; + sk->sk_err = err; sk->sk_error_report(sk); __udp_disconnect(sk, 0); +out: release_sock(sk); return 0; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 199b080d418a..3fcd86f4dfdc 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1598,6 +1598,9 @@ void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); lock_sock(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_v6_flush_pending_frames(sk); release_sock(sk); -- cgit v1.2.3 From d2e381c4963663bca6f30c3b996fa4dbafe8fcb5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 9 Jun 2021 14:17:53 +0300 Subject: rtnetlink: Fix regression in bridge VLAN configuration Cited commit started returning errors when notification info is not filled by the bridge driver, resulting in the following regression: # ip link add name br1 type bridge vlan_filtering 1 # bridge vlan add dev br1 vid 555 self pvid untagged RTNETLINK answers: Invalid argument As long as the bridge driver does not fill notification info for the bridge device itself, an empty notification should not be considered as an error. This is explained in commit 59ccaaaa49b5 ("bridge: dont send notification when skb->len == 0 in rtnl_bridge_notify"). Fix by removing the error and add a comment to avoid future bugs. Fixes: a8db57c1d285 ("rtnetlink: Fix missing error code in rtnl_bridge_notify()") Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3e84279c4123..ec931b080156 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4842,10 +4842,12 @@ static int rtnl_bridge_notify(struct net_device *dev) if (err < 0) goto errout; - if (!skb->len) { - err = -EINVAL; + /* Notification info is only filled for bridge ports, not the bridge + * device itself. Therefore, a zero notification length is valid and + * should not result in an error. + */ + if (!skb->len) goto errout; - } rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; -- cgit v1.2.3 From 13c62f5371e3eb4fc3400cfa26e64ca75f888008 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 9 Jun 2021 11:23:56 -0300 Subject: net/sched: act_ct: handle DNAT tuple collision This this the counterpart of 8aa7b526dc0b ("openvswitch: handle DNAT tuple collision") for act_ct. From that commit changelog: """ With multiple DNAT rules it's possible that after destination translation the resulting tuples collide. ... Netfilter handles this case by allocating a null binding for SNAT at egress by default. Perform the same operation in openvswitch for DNAT if no explicit SNAT is requested by the user and allocate a null binding for SNAT for packets in the "original" direction. """ Fixes: 95219afbb980 ("act_ct: support asymmetric conntrack") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sched/act_ct.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 18edd9ad1410..a656baa321fe 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -904,14 +904,19 @@ static int tcf_ct_act_nat(struct sk_buff *skb, } err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); - if (err == NF_ACCEPT && - ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = ct_nat_execute(skb, ct, ctinfo, range, + maniptype); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = ct_nat_execute(skb, ct, ctinfo, NULL, + NF_NAT_MANIP_SRC); + } } return err; #else -- cgit v1.2.3 From 2bf8d2ae3480da06e64dad3b326ebd2e40c0be86 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 4 Jun 2021 18:08:27 +0800 Subject: net/mlx5e: Fix an error code in mlx5e_arfs_create_tables() When the code execute 'if (!priv->fs.arfs->wq)', the value of err is 0. So, we use -ENOMEM to indicate that the function create_singlethread_workqueue() return NULL. Clean up smatch warning: drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c:373 mlx5e_arfs_create_tables() warn: missing error code 'err'. Reported-by: Abaci Robot Fixes: f6755b80d693 ("net/mlx5e: Dynamic alloc arfs table for netdev when needed") Signed-off-by: Yang Li Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index 5cd466ec6492..25403af32859 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -356,7 +356,7 @@ err: int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) { - int err = 0; + int err = -ENOMEM; int i; if (!(priv->netdev->hw_features & NETIF_F_NTUPLE)) -- cgit v1.2.3 From fb1a3132ee1ac968316e45d21a48703a6db0b6c3 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 31 May 2021 16:28:39 +0300 Subject: net/mlx5e: Fix use-after-free of encap entry in neigh update handler Function mlx5e_rep_neigh_update() wasn't updated to accommodate rtnl lock removal from TC filter update path and properly handle concurrent encap entry insertion/deletion which can lead to following use-after-free: [23827.464923] ================================================================== [23827.469446] BUG: KASAN: use-after-free in mlx5e_encap_take+0x72/0x140 [mlx5_core] [23827.470971] Read of size 4 at addr ffff8881d132228c by task kworker/u20:6/21635 [23827.472251] [23827.472615] CPU: 9 PID: 21635 Comm: kworker/u20:6 Not tainted 5.13.0-rc3+ #5 [23827.473788] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [23827.475639] Workqueue: mlx5e mlx5e_rep_neigh_update [mlx5_core] [23827.476731] Call Trace: [23827.477260] dump_stack+0xbb/0x107 [23827.477906] print_address_description.constprop.0+0x18/0x140 [23827.478896] ? mlx5e_encap_take+0x72/0x140 [mlx5_core] [23827.479879] ? mlx5e_encap_take+0x72/0x140 [mlx5_core] [23827.480905] kasan_report.cold+0x7c/0xd8 [23827.481701] ? mlx5e_encap_take+0x72/0x140 [mlx5_core] [23827.482744] kasan_check_range+0x145/0x1a0 [23827.493112] mlx5e_encap_take+0x72/0x140 [mlx5_core] [23827.494054] ? mlx5e_tc_tun_encap_info_equal_generic+0x140/0x140 [mlx5_core] [23827.495296] mlx5e_rep_neigh_update+0x41e/0x5e0 [mlx5_core] [23827.496338] ? mlx5e_rep_neigh_entry_release+0xb80/0xb80 [mlx5_core] [23827.497486] ? read_word_at_a_time+0xe/0x20 [23827.498250] ? strscpy+0xa0/0x2a0 [23827.498889] process_one_work+0x8ac/0x14e0 [23827.499638] ? lockdep_hardirqs_on_prepare+0x400/0x400 [23827.500537] ? pwq_dec_nr_in_flight+0x2c0/0x2c0 [23827.501359] ? rwlock_bug.part.0+0x90/0x90 [23827.502116] worker_thread+0x53b/0x1220 [23827.502831] ? process_one_work+0x14e0/0x14e0 [23827.503627] kthread+0x328/0x3f0 [23827.504254] ? _raw_spin_unlock_irq+0x24/0x40 [23827.505065] ? __kthread_bind_mask+0x90/0x90 [23827.505912] ret_from_fork+0x1f/0x30 [23827.506621] [23827.506987] Allocated by task 28248: [23827.507694] kasan_save_stack+0x1b/0x40 [23827.508476] __kasan_kmalloc+0x7c/0x90 [23827.509197] mlx5e_attach_encap+0xde1/0x1d40 [mlx5_core] [23827.510194] mlx5e_tc_add_fdb_flow+0x397/0xc40 [mlx5_core] [23827.511218] __mlx5e_add_fdb_flow+0x519/0xb30 [mlx5_core] [23827.512234] mlx5e_configure_flower+0x191c/0x4870 [mlx5_core] [23827.513298] tc_setup_cb_add+0x1d5/0x420 [23827.514023] fl_hw_replace_filter+0x382/0x6a0 [cls_flower] [23827.514975] fl_change+0x2ceb/0x4a51 [cls_flower] [23827.515821] tc_new_tfilter+0x89a/0x2070 [23827.516548] rtnetlink_rcv_msg+0x644/0x8c0 [23827.517300] netlink_rcv_skb+0x11d/0x340 [23827.518021] netlink_unicast+0x42b/0x700 [23827.518742] netlink_sendmsg+0x743/0xc20 [23827.519467] sock_sendmsg+0xb2/0xe0 [23827.520131] ____sys_sendmsg+0x590/0x770 [23827.520851] ___sys_sendmsg+0xd8/0x160 [23827.521552] __sys_sendmsg+0xb7/0x140 [23827.522238] do_syscall_64+0x3a/0x70 [23827.522907] entry_SYSCALL_64_after_hwframe+0x44/0xae [23827.523797] [23827.524163] Freed by task 25948: [23827.524780] kasan_save_stack+0x1b/0x40 [23827.525488] kasan_set_track+0x1c/0x30 [23827.526187] kasan_set_free_info+0x20/0x30 [23827.526968] __kasan_slab_free+0xed/0x130 [23827.527709] slab_free_freelist_hook+0xcf/0x1d0 [23827.528528] kmem_cache_free_bulk+0x33a/0x6e0 [23827.529317] kfree_rcu_work+0x55f/0xb70 [23827.530024] process_one_work+0x8ac/0x14e0 [23827.530770] worker_thread+0x53b/0x1220 [23827.531480] kthread+0x328/0x3f0 [23827.532114] ret_from_fork+0x1f/0x30 [23827.532785] [23827.533147] Last potentially related work creation: [23827.534007] kasan_save_stack+0x1b/0x40 [23827.534710] kasan_record_aux_stack+0xab/0xc0 [23827.535492] kvfree_call_rcu+0x31/0x7b0 [23827.536206] mlx5e_tc_del_fdb_flow+0x577/0xef0 [mlx5_core] [23827.537305] mlx5e_flow_put+0x49/0x80 [mlx5_core] [23827.538290] mlx5e_delete_flower+0x6d1/0xe60 [mlx5_core] [23827.539300] tc_setup_cb_destroy+0x18e/0x2f0 [23827.540144] fl_hw_destroy_filter+0x1d2/0x310 [cls_flower] [23827.541148] __fl_delete+0x4dc/0x660 [cls_flower] [23827.541985] fl_delete+0x97/0x160 [cls_flower] [23827.542782] tc_del_tfilter+0x7ab/0x13d0 [23827.543503] rtnetlink_rcv_msg+0x644/0x8c0 [23827.544257] netlink_rcv_skb+0x11d/0x340 [23827.544981] netlink_unicast+0x42b/0x700 [23827.545700] netlink_sendmsg+0x743/0xc20 [23827.546424] sock_sendmsg+0xb2/0xe0 [23827.547084] ____sys_sendmsg+0x590/0x770 [23827.547850] ___sys_sendmsg+0xd8/0x160 [23827.548606] __sys_sendmsg+0xb7/0x140 [23827.549303] do_syscall_64+0x3a/0x70 [23827.549969] entry_SYSCALL_64_after_hwframe+0x44/0xae [23827.550853] [23827.551217] The buggy address belongs to the object at ffff8881d1322200 [23827.551217] which belongs to the cache kmalloc-256 of size 256 [23827.553341] The buggy address is located 140 bytes inside of [23827.553341] 256-byte region [ffff8881d1322200, ffff8881d1322300) [23827.555747] The buggy address belongs to the page: [23827.556847] page:00000000898762aa refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1d1320 [23827.558651] head:00000000898762aa order:2 compound_mapcount:0 compound_pincount:0 [23827.559961] flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) [23827.561243] raw: 002ffff800010200 dead000000000100 dead000000000122 ffff888100042b40 [23827.562653] raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 [23827.564112] page dumped because: kasan: bad access detected [23827.565439] [23827.565932] Memory state around the buggy address: [23827.566917] ffff8881d1322180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [23827.568485] ffff8881d1322200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [23827.569818] >ffff8881d1322280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [23827.571143] ^ [23827.571879] ffff8881d1322300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [23827.573283] ffff8881d1322380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [23827.574654] ================================================================== Most of the necessary logic is already correctly implemented by mlx5e_get_next_valid_encap() helper that is used in neigh stats update handler. Make the handler generic by renaming it to mlx5e_get_next_matching_encap() and use callback to test whether flow is matching instead of hardcoded check for 'valid' flag value. Implement mlx5e_get_next_valid_encap() by calling mlx5e_get_next_matching_encap() with callback that tests encap MLX5_ENCAP_ENTRY_VALID flag. Implement new mlx5e_get_next_init_encap() helper by calling mlx5e_get_next_matching_encap() with callback that tests encap completion result to be non-error and use it in mlx5e_rep_neigh_update() to safely iterate over nhe->encap_list. Remove encap completion logic from mlx5e_rep_update_flows() since the encap entries passed to this function are already guaranteed to be properly initialized by similar code in mlx5e_get_next_init_encap(). Fixes: 2a1f1768fa17 ("net/mlx5e: Refactor neigh update for concurrent execution") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/rep/neigh.c | 15 ++++------ .../net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 6 +--- .../ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 33 ++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 3 ++ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c index be0ee03de721..2e9bee4e5209 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c @@ -129,10 +129,9 @@ static void mlx5e_rep_neigh_update(struct work_struct *work) work); struct mlx5e_neigh_hash_entry *nhe = update_work->nhe; struct neighbour *n = update_work->n; + struct mlx5e_encap_entry *e = NULL; bool neigh_connected, same_dev; - struct mlx5e_encap_entry *e; unsigned char ha[ETH_ALEN]; - struct mlx5e_priv *priv; u8 nud_state, dead; rtnl_lock(); @@ -156,14 +155,12 @@ static void mlx5e_rep_neigh_update(struct work_struct *work) if (!same_dev) goto out; - list_for_each_entry(e, &nhe->encap_list, encap_list) { - if (!mlx5e_encap_take(e)) - continue; + /* mlx5e_get_next_init_encap() releases previous encap before returning + * the next one. + */ + while ((e = mlx5e_get_next_init_encap(nhe, e)) != NULL) + mlx5e_rep_update_flows(netdev_priv(e->out_dev), e, neigh_connected, ha); - priv = netdev_priv(e->out_dev); - mlx5e_rep_update_flows(priv, e, neigh_connected, ha); - mlx5e_encap_put(priv, e); - } out: rtnl_unlock(); mlx5e_release_neigh_update_work(update_work); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index 311382261840..85eaadc989df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -94,13 +94,9 @@ void mlx5e_rep_update_flows(struct mlx5e_priv *priv, ASSERT_RTNL(); - /* wait for encap to be fully initialized */ - wait_for_completion(&e->res_ready); - mutex_lock(&esw->offloads.encap_tbl_lock); encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); - if (e->compl_result < 0 || (encap_connected == neigh_connected && - ether_addr_equal(e->h_dest, ha))) + if (encap_connected == neigh_connected && ether_addr_equal(e->h_dest, ha)) goto unlock; mlx5e_take_all_encap_flows(e, &flow_list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index f1fb11680d20..490131e06efb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -251,9 +251,12 @@ static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r, mlx5e_take_tmp_flow(flow, flow_list, 0); } +typedef bool (match_cb)(struct mlx5e_encap_entry *); + static struct mlx5e_encap_entry * -mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe, - struct mlx5e_encap_entry *e) +mlx5e_get_next_matching_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e, + match_cb match) { struct mlx5e_encap_entry *next = NULL; @@ -288,7 +291,7 @@ retry: /* wait for encap to be fully initialized */ wait_for_completion(&next->res_ready); /* continue searching if encap entry is not in valid state after completion */ - if (!(next->flags & MLX5_ENCAP_ENTRY_VALID)) { + if (!match(next)) { e = next; goto retry; } @@ -296,6 +299,30 @@ retry: return next; } +static bool mlx5e_encap_valid(struct mlx5e_encap_entry *e) +{ + return e->flags & MLX5_ENCAP_ENTRY_VALID; +} + +static struct mlx5e_encap_entry * +mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e) +{ + return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_valid); +} + +static bool mlx5e_encap_initialized(struct mlx5e_encap_entry *e) +{ + return e->compl_result >= 0; +} + +struct mlx5e_encap_entry * +mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e) +{ + return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_initialized); +} + void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) { struct mlx5e_neigh *m_neigh = &nhe->m_neigh; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 25c091795bcd..17027536efba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -178,6 +178,9 @@ void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *f void mlx5e_put_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list); struct mlx5e_neigh_hash_entry; +struct mlx5e_encap_entry * +mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e); void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe); void mlx5e_tc_reoffload_flows_work(struct work_struct *work); -- cgit v1.2.3 From 8ad893e516a77209a1818a2072d2027d87db809f Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 28 May 2021 13:20:32 -0500 Subject: net/mlx5e: Remove dependency in IPsec initialization flows Currently, IPsec feature is disabled because mlx5e_build_nic_netdev is required to be called after mlx5e_ipsec_init. This requirement is invalid as mlx5e_build_nic_netdev and mlx5e_ipsec_init initialize independent resources. Remove ipsec pointer check in mlx5e_build_nic_netdev so that the two functions can be called at any order. Fixes: 547eede070eb ("net/mlx5e: IPSec, Innova IPSec offload infrastructure") Signed-off-by: Huy Nguyen Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 3d45341e2216..26f7fab109d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -532,9 +532,6 @@ void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) struct mlx5_core_dev *mdev = priv->mdev; struct net_device *netdev = priv->netdev; - if (!priv->ipsec) - return; - if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_ESP) || !MLX5_CAP_ETH(mdev, swp)) { mlx5_core_dbg(mdev, "mlx5e: ESP and SWP offload not supported\n"); -- cgit v1.2.3 From a3e5fd9314dfc4314a9567cde96e1aef83a7458a Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Wed, 26 May 2021 13:45:10 +0300 Subject: net/mlx5e: Fix page reclaim for dead peer hairpin When adding a hairpin flow, a firmware-side send queue is created for the peer net device, which claims some host memory pages for its internal ring buffer. If the peer net device is removed/unbound before the hairpin flow is deleted, then the send queue is not destroyed which leads to a stack trace on pci device remove: [ 748.005230] mlx5_core 0000:08:00.2: wait_func:1094:(pid 12985): MANAGE_PAGES(0x108) timeout. Will cause a leak of a command resource [ 748.005231] mlx5_core 0000:08:00.2: reclaim_pages:514:(pid 12985): failed reclaiming pages: err -110 [ 748.001835] mlx5_core 0000:08:00.2: mlx5_reclaim_root_pages:653:(pid 12985): failed reclaiming pages (-110) for func id 0x0 [ 748.002171] ------------[ cut here ]------------ [ 748.001177] FW pages counter is 4 after reclaiming all pages [ 748.001186] WARNING: CPU: 1 PID: 12985 at drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:685 mlx5_reclaim_startup_pages+0x34b/0x460 [mlx5_core] [ +0.002771] Modules linked in: cls_flower mlx5_ib mlx5_core ptp pps_core act_mirred sch_ingress openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi rdma_cm ib_umad ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay fuse [last unloaded: pps_core] [ 748.007225] CPU: 1 PID: 12985 Comm: tee Not tainted 5.12.0+ #1 [ 748.001376] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 748.002315] RIP: 0010:mlx5_reclaim_startup_pages+0x34b/0x460 [mlx5_core] [ 748.001679] Code: 28 00 00 00 0f 85 22 01 00 00 48 81 c4 b0 00 00 00 31 c0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 c7 c7 40 cc 19 a1 e8 9f 71 0e e2 <0f> 0b e9 30 ff ff ff 48 c7 c7 a0 cc 19 a1 e8 8c 71 0e e2 0f 0b e9 [ 748.003781] RSP: 0018:ffff88815220faf8 EFLAGS: 00010286 [ 748.001149] RAX: 0000000000000000 RBX: ffff8881b4900280 RCX: 0000000000000000 [ 748.001445] RDX: 0000000000000027 RSI: 0000000000000004 RDI: ffffed102a441f51 [ 748.001614] RBP: 00000000000032b9 R08: 0000000000000001 R09: ffffed1054a15ee8 [ 748.001446] R10: ffff8882a50af73b R11: ffffed1054a15ee7 R12: fffffbfff07c1e30 [ 748.001447] R13: dffffc0000000000 R14: ffff8881b492cba8 R15: 0000000000000000 [ 748.001429] FS: 00007f58bd08b580(0000) GS:ffff8882a5080000(0000) knlGS:0000000000000000 [ 748.001695] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 748.001309] CR2: 000055a026351740 CR3: 00000001d3b48006 CR4: 0000000000370ea0 [ 748.001506] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 748.001483] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 748.001654] Call Trace: [ 748.000576] ? mlx5_satisfy_startup_pages+0x290/0x290 [mlx5_core] [ 748.001416] ? mlx5_cmd_teardown_hca+0xa2/0xd0 [mlx5_core] [ 748.001354] ? mlx5_cmd_init_hca+0x280/0x280 [mlx5_core] [ 748.001203] mlx5_function_teardown+0x30/0x60 [mlx5_core] [ 748.001275] mlx5_uninit_one+0xa7/0xc0 [mlx5_core] [ 748.001200] remove_one+0x5f/0xc0 [mlx5_core] [ 748.001075] pci_device_remove+0x9f/0x1d0 [ 748.000833] device_release_driver_internal+0x1e0/0x490 [ 748.001207] unbind_store+0x19f/0x200 [ 748.000942] ? sysfs_file_ops+0x170/0x170 [ 748.001000] kernfs_fop_write_iter+0x2bc/0x450 [ 748.000970] new_sync_write+0x373/0x610 [ 748.001124] ? new_sync_read+0x600/0x600 [ 748.001057] ? lock_acquire+0x4d6/0x700 [ 748.000908] ? lockdep_hardirqs_on_prepare+0x400/0x400 [ 748.001126] ? fd_install+0x1c9/0x4d0 [ 748.000951] vfs_write+0x4d0/0x800 [ 748.000804] ksys_write+0xf9/0x1d0 [ 748.000868] ? __x64_sys_read+0xb0/0xb0 [ 748.000811] ? filp_open+0x50/0x50 [ 748.000919] ? syscall_enter_from_user_mode+0x1d/0x50 [ 748.001223] do_syscall_64+0x3f/0x80 [ 748.000892] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 748.001026] RIP: 0033:0x7f58bcfb22f7 [ 748.000944] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 [ 748.003925] RSP: 002b:00007fffd7f2aaa8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 748.001732] RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007f58bcfb22f7 [ 748.001426] RDX: 000000000000000d RSI: 00007fffd7f2abc0 RDI: 0000000000000003 [ 748.001746] RBP: 00007fffd7f2abc0 R08: 0000000000000000 R09: 0000000000000001 [ 748.001631] R10: 00000000000001b6 R11: 0000000000000246 R12: 000000000000000d [ 748.001537] R13: 00005597ac2c24a0 R14: 000000000000000d R15: 00007f58bd084700 [ 748.001564] irq event stamp: 0 [ 748.000787] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [ 748.001399] hardirqs last disabled at (0): [] copy_process+0x146f/0x5eb0 [ 748.001854] softirqs last enabled at (0): [] copy_process+0x14ae/0x5eb0 [ 748.013431] softirqs last disabled at (0): [<0000000000000000>] 0x0 [ 748.001492] ---[ end trace a6fabd773d1c51ae ]--- Fix by destroying the send queue of a hairpin peer net device that is being removed/unbound, which returns the allocated ring buffer pages to the host. Fixes: 4d8fcf216c90 ("net/mlx5e: Avoid unbounded peer devices when unpairing TC hairpin rules") Signed-off-by: Dima Chumak Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 30 +++++++++++++++++----- include/linux/mlx5/transobj.h | 1 + 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index dd64878e5b38..d4b0f270b6bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4765,7 +4765,7 @@ static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv, list_for_each_entry_safe(hpe, tmp, &init_wait_list, dead_peer_wait_list) { wait_for_completion(&hpe->res_ready); if (!IS_ERR_OR_NULL(hpe->hp) && hpe->peer_vhca_id == peer_vhca_id) - hpe->hp->pair->peer_gone = true; + mlx5_core_hairpin_clear_dead_peer(hpe->hp->pair); mlx5e_hairpin_put(priv, hpe); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index 01cc00ad8acf..b6931bbe52d2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -424,6 +424,15 @@ err_modify_sq: return err; } +static void mlx5_hairpin_unpair_peer_sq(struct mlx5_hairpin *hp) +{ + int i; + + for (i = 0; i < hp->num_channels; i++) + mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY, + MLX5_SQC_STATE_RST, 0, 0); +} + static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp) { int i; @@ -432,13 +441,9 @@ static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp) for (i = 0; i < hp->num_channels; i++) mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_RST, 0, 0); - /* unset peer SQs */ - if (hp->peer_gone) - return; - for (i = 0; i < hp->num_channels; i++) - mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY, - MLX5_SQC_STATE_RST, 0, 0); + if (!hp->peer_gone) + mlx5_hairpin_unpair_peer_sq(hp); } struct mlx5_hairpin * @@ -485,3 +490,16 @@ void mlx5_core_hairpin_destroy(struct mlx5_hairpin *hp) mlx5_hairpin_destroy_queues(hp); kfree(hp); } + +void mlx5_core_hairpin_clear_dead_peer(struct mlx5_hairpin *hp) +{ + int i; + + mlx5_hairpin_unpair_peer_sq(hp); + + /* destroy peer SQ */ + for (i = 0; i < hp->num_channels; i++) + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]); + + hp->peer_gone = true; +} diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index 028f442530cf..60ffeb6b67ae 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -85,4 +85,5 @@ mlx5_core_hairpin_create(struct mlx5_core_dev *func_mdev, struct mlx5_hairpin_params *params); void mlx5_core_hairpin_destroy(struct mlx5_hairpin *pair); +void mlx5_core_hairpin_clear_dead_peer(struct mlx5_hairpin *hp); #endif /* __TRANSOBJ_H__ */ -- cgit v1.2.3 From c189716b2a7c1d2d8658e269735273caa1c38b54 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 6 Jun 2021 11:20:46 +0300 Subject: net/mlx5: Consider RoCE cap before init RDMA resources Check if RoCE is supported by the device before enable it in the vport context and create all the RDMA steering objects. Fixes: 80f09dfc237f ("net/mlx5: Eswitch, enable RoCE loopback traffic") Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/rdma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c index 441b5453acae..540cf05f6373 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c @@ -156,6 +156,9 @@ void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) { int err; + if (!MLX5_CAP_GEN(dev, roce)) + return; + err = mlx5_nic_vport_enable_roce(dev); if (err) { mlx5_core_err(dev, "Failed to enable RoCE: %d\n", err); -- cgit v1.2.3 From 4aaf96ac8b45d8e2e019b6b53cce65a73c4ace2c Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 6 Jun 2021 11:23:41 +0300 Subject: net/mlx5: DR, Don't use SW steering when RoCE is not supported SW steering uses RC QP to write/read to/from ICM, hence it's not supported when RoCE is not supported as well. Fixes: 70605ea545e8 ("net/mlx5: DR, Expose APIs for direct rule managing") Signed-off-by: Maor Gottlieb Reviewed-by: Alex Vesker Reviewed-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index 612b0ac31db2..9737565cd8d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -124,10 +124,11 @@ int mlx5dr_action_destroy(struct mlx5dr_action *action); static inline bool mlx5dr_is_supported(struct mlx5_core_dev *dev) { - return MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner) || - (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner_v2) && - (MLX5_CAP_GEN(dev, steering_format_version) <= - MLX5_STEERING_FORMAT_CONNECTX_6DX)); + return MLX5_CAP_GEN(dev, roce) && + (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner) || + (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner_v2) && + (MLX5_CAP_GEN(dev, steering_format_version) <= + MLX5_STEERING_FORMAT_CONNECTX_6DX))); } /* buddy functions & structure */ -- cgit v1.2.3 From 11f5ac3e05c134d333afe6f84ab10e22bc0a5d5a Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 28 Apr 2021 19:39:26 +0800 Subject: net/mlx5e: Verify dev is present in get devlink port ndo When changing eswitch mode, the netdev is detached from the hardware resources. So verify dev is present in get devlink port ndo. Otherwise, we will hit the following panic: [241535.973539] RIP: 0010:__devlink_port_phys_port_name_get+0x13/0x1b0 [241535.976471] RSP: 0018:ffff9eaf0ae1b7c8 EFLAGS: 00010292 [241535.977471] RAX: 000000000002d370 RBX: 000000000002d370 RCX: 0000000000000000 [241535.978479] RDX: 0000000000000010 RSI: ffff9eaf0ae1b858 RDI: 000000000002d370 [241535.979482] RBP: ffff9eaf0ae1b7e0 R08: 000000000000002a R09: ffff8888d54d13da [241535.980486] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8888e6700000 [241535.981491] R13: ffff9eaf0ae1b858 R14: 0000000000000010 R15: 0000000000000000 [241535.982489] FS: 00007fd374ef3740(0000) GS:ffff88909ea00000(0000) knlGS:0000000000000000 [241535.983494] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [241535.984487] CR2: 000000000002d444 CR3: 000000089fd26006 CR4: 00000000003706e0 [241535.985502] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [241535.986499] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [241535.987477] Call Trace: [241535.988426] ? nla_put_64bit+0x71/0xa0 [241535.989368] devlink_compat_phys_port_name_get+0x50/0xa0 [241535.990312] dev_get_phys_port_name+0x4b/0x60 [241535.991252] rtnl_fill_ifinfo+0x57b/0xcb0 [241535.992192] rtnl_dump_ifinfo+0x58f/0x6d0 [241535.993123] ? ksize+0x14/0x20 [241535.994033] ? __alloc_skb+0xe8/0x250 [241535.994935] netlink_dump+0x17c/0x300 [241535.995821] netlink_recvmsg+0x1de/0x2c0 [241535.996677] sock_recvmsg+0x70/0x80 [241535.997518] ____sys_recvmsg+0x9b/0x1b0 [241535.998360] ? iovec_from_user+0x82/0x120 [241535.999202] ? __import_iovec+0x2c/0x130 [241536.000031] ___sys_recvmsg+0x94/0x130 [241536.000850] ? __handle_mm_fault+0x56d/0x6e0 [241536.001668] __sys_recvmsg+0x5f/0xb0 [241536.002464] ? syscall_enter_from_user_mode+0x2b/0x80 [241536.003242] __x64_sys_recvmsg+0x1f/0x30 [241536.004008] do_syscall_64+0x38/0x50 [241536.004767] entry_SYSCALL_64_after_hwframe+0x44/0xae [241536.005532] RIP: 0033:0x7fd375014f47 Fixes: 2ff349c5edfe ("net/mlx5e: Verify dev is present in some ndos") Signed-off-by: Roi Dayan Signed-off-by: Chris Mi Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index 0dd7615e5931..bc33eaada3b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -64,6 +64,8 @@ struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev) struct mlx5e_priv *priv = netdev_priv(dev); struct devlink_port *port; + if (!netif_device_present(dev)) + return NULL; port = mlx5e_devlink_get_dl_port(priv); if (port->registered) return port; -- cgit v1.2.3 From 9ae8c18c5e4d8814d3b405a07712fa5464070e3e Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 29 Apr 2021 10:03:20 +0300 Subject: net/mlx5e: Don't update netdev RQs with PTP-RQ Since the driver opens the PTP-RQ under channel 0, it appears to the stack as if the SKB was received on rxq0. So from thew stack POV there are still the same number of RX queues. Fixes: 960fbfe222a4 ("net/mlx5e: Allow coexistence of CQE compression and HW TS PTP") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ec6bafe7a2e5..263adc82b4e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2705,8 +2705,6 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) nch = priv->channels.params.num_channels; ntc = priv->channels.params.num_tc; num_rxqs = nch * priv->profile->rq_groups; - if (priv->channels.params.ptp_rx) - num_rxqs++; mlx5e_netdev_set_tcs(netdev, nch, ntc); -- cgit v1.2.3 From a6ee6f5f1082c416f9bfffbae1a87feff8a6ab3d Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 19 Apr 2021 11:58:31 +0300 Subject: net/mlx5e: Fix select queue to consider SKBTX_HW_TSTAMP Steering packets to PTP-SQ should be done only if the SKB has SKBTX_HW_TSTAMP set in the tx_flags. While here, take the function into a header and inline it. Set the whole condition to select the PTP-SQ to unlikely. Fixes: 24c22dd0918b ("net/mlx5e: Add states to PTP channel") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h | 22 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 25 +++--------------------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index d907c1acd4d5..778e229310a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB // Copyright (c) 2020 Mellanox Technologies -#include #include "en/ptp.h" #include "en/txrx.h" #include "en/params.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index ab935cce952b..c96668bd701c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -6,6 +6,7 @@ #include "en.h" #include "en_stats.h" +#include struct mlx5e_ptpsq { struct mlx5e_txqsq txqsq; @@ -43,6 +44,27 @@ struct mlx5e_ptp { DECLARE_BITMAP(state, MLX5E_PTP_STATE_NUM_STATES); }; +static inline bool mlx5e_use_ptpsq(struct sk_buff *skb) +{ + struct flow_keys fk; + + if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + return false; + + if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) + return false; + + if (fk.basic.n_proto == htons(ETH_P_1588)) + return true; + + if (fk.basic.n_proto != htons(ETH_P_IP) && + fk.basic.n_proto != htons(ETH_P_IPV6)) + return false; + + return (fk.basic.ip_proto == IPPROTO_UDP && + fk.ports.dst == htons(PTP_EV_PORT)); +} + int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params, u8 lag_port, struct mlx5e_ptp **cp); void mlx5e_ptp_close(struct mlx5e_ptp *c); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 8ba62671f5f1..320fe0cda917 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include "en.h" @@ -67,24 +66,6 @@ static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb } #endif -static bool mlx5e_use_ptpsq(struct sk_buff *skb) -{ - struct flow_keys fk; - - if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) - return false; - - if (fk.basic.n_proto == htons(ETH_P_1588)) - return true; - - if (fk.basic.n_proto != htons(ETH_P_IP) && - fk.basic.n_proto != htons(ETH_P_IPV6)) - return false; - - return (fk.basic.ip_proto == IPPROTO_UDP && - fk.ports.dst == htons(PTP_EV_PORT)); -} - static u16 mlx5e_select_ptpsq(struct net_device *dev, struct sk_buff *skb) { struct mlx5e_priv *priv = netdev_priv(dev); @@ -145,9 +126,9 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, } ptp_channel = READ_ONCE(priv->channels.ptp); - if (unlikely(ptp_channel) && - test_bit(MLX5E_PTP_STATE_TX, ptp_channel->state) && - mlx5e_use_ptpsq(skb)) + if (unlikely(ptp_channel && + test_bit(MLX5E_PTP_STATE_TX, ptp_channel->state) && + mlx5e_use_ptpsq(skb))) return mlx5e_select_ptpsq(dev, skb); txq_ix = netdev_pick_tx(dev, skb, NULL); -- cgit v1.2.3 From 7a545077cb6701957e84c7f158630bb5c984e648 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 25 Feb 2021 12:27:53 +0200 Subject: Revert "net/mlx5: Arm only EQs with EQEs" In the scenario described below, an EQ can remain in FIRED state which can result in missing an interrupt generation. The scenario: device mlx5_core driver ------ ---------------- EQ1.eqe generated EQ1.MSI-X sent EQ1.state = FIRED EQ2.eqe generated mlx5_irq() polls - eq1_eqes() arm eq1 polls - eq2_eqes() arm eq2 EQ2.MSI-X sent EQ2.state = FIRED mlx5_irq() polls - eq2_eqes() -- no eqes found driver skips EQ arming; ->EQ2 remains fired, misses generating interrupt. Hence, always arm the EQ by reverting the cited commit in fixes tag. Fixes: d894892dda25 ("net/mlx5: Arm only EQs with EQEs") Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 77c0ca655975..940333410267 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -136,7 +136,7 @@ static int mlx5_eq_comp_int(struct notifier_block *nb, eqe = next_eqe_sw(eq); if (!eqe) - return 0; + goto out; do { struct mlx5_core_cq *cq; @@ -161,6 +161,8 @@ static int mlx5_eq_comp_int(struct notifier_block *nb, ++eq->cons_index; } while ((++num_eqes < MLX5_EQ_POLLING_BUDGET) && (eqe = next_eqe_sw(eq))); + +out: eq_update_ci(eq, 1); if (cqn != -1) @@ -248,9 +250,9 @@ static int mlx5_eq_async_int(struct notifier_block *nb, ++eq->cons_index; } while ((++num_eqes < MLX5_EQ_POLLING_BUDGET) && (eqe = next_eqe_sw(eq))); - eq_update_ci(eq, 1); out: + eq_update_ci(eq, 1); mlx5_eq_async_int_unlock(eq_async, recovery, &flags); return unlikely(recovery) ? num_eqes : 0; -- cgit v1.2.3 From 6d6727dddc7f93fcc155cb8d0c49c29ae0e71122 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 10 May 2021 14:34:58 +0300 Subject: net/mlx5e: Block offload of outer header csum for UDP tunnels The device is able to offload either the outer header csum or inner header csum. The driver utilizes the inner csum offload. Hence, block setting of tx-udp_tnl-csum-segmentation and set it to off[fixed]. Fixes: b49663c8fb49 ("net/mlx5e: Add support for UDP tunnel segmentation with outer checksum offload") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 263adc82b4e1..d4167f7be99c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4822,13 +4822,9 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) } if (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev)) { - netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_UDP_TUNNEL_CSUM; - netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_UDP_TUNNEL_CSUM; - netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; - netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; + netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL; + netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL; } if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_GRE)) { -- cgit v1.2.3 From 54e1217b90486c94b26f24dcee1ee5ef5372f832 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 26 May 2021 10:40:36 +0300 Subject: net/mlx5e: Block offload of outer header csum for GRE tunnel The device is able to offload either the outer header csum or inner header csum. The driver utilizes the inner csum offload. So, prohibit setting of tx-gre-csum-segmentation and let it be: off[fixed]. Fixes: 2729984149e6 ("net/mlx5e: Support TSO and TX checksum offloads for GRE tunnels") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d4167f7be99c..d26b8ed51195 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4828,12 +4828,9 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) } if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_GRE)) { - netdev->hw_features |= NETIF_F_GSO_GRE | - NETIF_F_GSO_GRE_CSUM; - netdev->hw_enc_features |= NETIF_F_GSO_GRE | - NETIF_F_GSO_GRE_CSUM; - netdev->gso_partial_features |= NETIF_F_GSO_GRE | - NETIF_F_GSO_GRE_CSUM; + netdev->hw_features |= NETIF_F_GSO_GRE; + netdev->hw_enc_features |= NETIF_F_GSO_GRE; + netdev->gso_partial_features |= NETIF_F_GSO_GRE; } if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_IPIP)) { -- cgit v1.2.3 From 3bdd5ee0ec8c14131d560da492e6df452c6fdd75 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 9 Jun 2021 18:41:57 -0400 Subject: skbuff: fix incorrect msg_zerocopy copy notifications msg_zerocopy signals if a send operation required copying with a flag in serr->ee.ee_code. This field can be incorrect as of the below commit, as a result of both structs uarg and serr pointing into the same skb->cb[]. uarg->zerocopy must be read before skb->cb[] is reinitialized to hold serr. Similar to other fields len, hi and lo, use a local variable to temporarily hold the value. This was not a problem before, when the value was passed as a function argument. Fixes: 75518851a2a0 ("skbuff: Push status and refcounts into sock_zerocopy_callback") Reported-by: Talal Ahmad Signed-off-by: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ad22870298c..bbc3b4b62032 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1253,6 +1253,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg) struct sock *sk = skb->sk; struct sk_buff_head *q; unsigned long flags; + bool is_zerocopy; u32 lo, hi; u16 len; @@ -1267,6 +1268,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg) len = uarg->len; lo = uarg->id; hi = uarg->id + len - 1; + is_zerocopy = uarg->zerocopy; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); @@ -1274,7 +1276,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg) serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; - if (!uarg->zerocopy) + if (!is_zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; -- cgit v1.2.3 From 9d44fa3e50cc91691896934d106c86e4027e61ca Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Thu, 10 Jun 2021 09:41:36 +0800 Subject: ping: Check return value of function 'ping_queue_rcv_skb' Function 'ping_queue_rcv_skb' not always return success, which will also return fail. If not check the wrong return value of it, lead to function `ping_rcv` return success. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller --- net/ipv4/ping.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 1c9f71a37258..95a718397fd1 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -954,6 +954,7 @@ bool ping_rcv(struct sk_buff *skb) struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + bool rc = false; /* We assume the packet has already been checked by icmp_rcv */ @@ -968,14 +969,15 @@ bool ping_rcv(struct sk_buff *skb) struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); - if (skb2) - ping_queue_rcv_skb(sk, skb2); + if (skb2 && !ping_queue_rcv_skb(sk, skb2)) + rc = true; sock_put(sk); - return true; } - pr_debug("no socket, dropping\n"); - return false; + if (!rc) + pr_debug("no socket, dropping\n"); + + return rc; } EXPORT_SYMBOL_GPL(ping_rcv); -- cgit v1.2.3 From 58e2071742e38f29f051b709a5cca014ba51166f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 10 Jun 2021 15:04:10 +0300 Subject: net: bridge: fix vlan tunnel dst null pointer dereference This patch fixes a tunnel_dst null pointer dereference due to lockless access in the tunnel egress path. When deleting a vlan tunnel the tunnel_dst pointer is set to NULL without waiting a grace period (i.e. while it's still usable) and packets egressing are dereferencing it without checking. Use READ/WRITE_ONCE to annotate the lockless use of tunnel_id, use RCU for accessing tunnel_dst and make sure it is read only once and checked in the egress path. The dst is already properly RCU protected so we don't need to do anything fancy than to make sure tunnel_id and tunnel_dst are read only once and checked in the egress path. Cc: stable@vger.kernel.org Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 4 ++-- net/bridge/br_vlan_tunnel.c | 38 ++++++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7ce8a77cc6b6..e013d33f1c7c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -90,8 +90,8 @@ struct bridge_mcast_stats { #endif struct br_tunnel_info { - __be64 tunnel_id; - struct metadata_dst *tunnel_dst; + __be64 tunnel_id; + struct metadata_dst __rcu *tunnel_dst; }; /* private vlan flags */ diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 0d3a8c01552e..03de461a0d44 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -41,26 +41,33 @@ static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, br_vlan_tunnel_rht_params); } +static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan) +{ + struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst); + + WRITE_ONCE(vlan->tinfo.tunnel_id, 0); + RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL); + dst_release(&tdst->dst); +} + void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan) { - if (!vlan->tinfo.tunnel_dst) + if (!rcu_access_pointer(vlan->tinfo.tunnel_dst)) return; rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); - vlan->tinfo.tunnel_id = 0; - dst_release(&vlan->tinfo.tunnel_dst->dst); - vlan->tinfo.tunnel_dst = NULL; + vlan_tunnel_info_release(vlan); } static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u32 tun_id) { - struct metadata_dst *metadata = NULL; + struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); int err; - if (vlan->tinfo.tunnel_dst) + if (metadata) return -EEXIST; metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, @@ -69,8 +76,8 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, return -EINVAL; metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; - vlan->tinfo.tunnel_dst = metadata; - vlan->tinfo.tunnel_id = key; + rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata); + WRITE_ONCE(vlan->tinfo.tunnel_id, key); err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); @@ -79,9 +86,7 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, return 0; out: - dst_release(&vlan->tinfo.tunnel_dst->dst); - vlan->tinfo.tunnel_dst = NULL; - vlan->tinfo.tunnel_id = 0; + vlan_tunnel_info_release(vlan); return err; } @@ -182,12 +187,15 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { + struct metadata_dst *tunnel_dst; + __be64 tunnel_id; int err; - if (!vlan || !vlan->tinfo.tunnel_id) + if (!vlan) return 0; - if (unlikely(!skb_vlan_tag_present(skb))) + tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); + if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0; skb_dst_drop(skb); @@ -195,7 +203,9 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err; - skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst)); + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); + if (tunnel_dst) + skb_dst_set(skb, dst_clone(&tunnel_dst->dst)); return 0; } -- cgit v1.2.3 From cfc579f9d89af4ada58c69b03bcaa4887840f3b3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 10 Jun 2021 15:04:11 +0300 Subject: net: bridge: fix vlan tunnel dst refcnt when egressing The egress tunnel code uses dst_clone() and directly sets the result which is wrong because the entry might have 0 refcnt or be already deleted, causing number of problems. It also triggers the WARN_ON() in dst_hold()[1] when a refcnt couldn't be taken. Fix it by using dst_hold_safe() and checking if a reference was actually taken before setting the dst. [1] dmesg WARN_ON log and following refcnt errors WARNING: CPU: 5 PID: 38 at include/net/dst.h:230 br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge] Modules linked in: 8021q garp mrp bridge stp llc bonding ipv6 virtio_net CPU: 5 PID: 38 Comm: ksoftirqd/5 Kdump: loaded Tainted: G W 5.13.0-rc3+ #360 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge] Code: e8 85 bc 01 e1 45 84 f6 74 90 45 31 f6 85 db 48 c7 c7 a0 02 19 a0 41 0f 94 c6 31 c9 31 d2 44 89 f6 e8 64 bc 01 e1 85 db 75 02 <0f> 0b 31 c9 31 d2 44 89 f6 48 c7 c7 70 02 19 a0 e8 4b bc 01 e1 49 RSP: 0018:ffff8881003d39e8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffa01902a0 RBP: ffff8881040c6700 R08: 0000000000000000 R09: 0000000000000001 R10: 2ce93d0054fe0d00 R11: 54fe0d00000e0000 R12: ffff888109515000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000401 FS: 0000000000000000(0000) GS:ffff88822bf40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f42ba70f030 CR3: 0000000109926000 CR4: 00000000000006e0 Call Trace: br_handle_vlan+0xbc/0xca [bridge] __br_forward+0x23/0x164 [bridge] deliver_clone+0x41/0x48 [bridge] br_handle_frame_finish+0x36f/0x3aa [bridge] ? skb_dst+0x2e/0x38 [bridge] ? br_handle_ingress_vlan_tunnel+0x3e/0x1c8 [bridge] ? br_handle_frame_finish+0x3aa/0x3aa [bridge] br_handle_frame+0x2c3/0x377 [bridge] ? __skb_pull+0x33/0x51 ? vlan_do_receive+0x4f/0x36a ? br_handle_frame_finish+0x3aa/0x3aa [bridge] __netif_receive_skb_core+0x539/0x7c6 ? __list_del_entry_valid+0x16e/0x1c2 __netif_receive_skb_list_core+0x6d/0xd6 netif_receive_skb_list_internal+0x1d9/0x1fa gro_normal_list+0x22/0x3e dev_gro_receive+0x55b/0x600 ? detach_buf_split+0x58/0x140 napi_gro_receive+0x94/0x12e virtnet_poll+0x15d/0x315 [virtio_net] __napi_poll+0x2c/0x1c9 net_rx_action+0xe6/0x1fb __do_softirq+0x115/0x2d8 run_ksoftirqd+0x18/0x20 smpboot_thread_fn+0x183/0x19c ? smpboot_unregister_percpu_thread+0x66/0x66 kthread+0x10a/0x10f ? kthread_mod_delayed_work+0xb6/0xb6 ret_from_fork+0x22/0x30 ---[ end trace 49f61b07f775fd2b ]--- dst_release: dst:00000000c02d677a refcnt:-1 dst_release underflow Cc: stable@vger.kernel.org Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_vlan_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 03de461a0d44..01017448ebde 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -204,8 +204,8 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, return err; tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); - if (tunnel_dst) - skb_dst_set(skb, dst_clone(&tunnel_dst->dst)); + if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) + skb_dst_set(skb, &tunnel_dst->dst); return 0; } -- cgit v1.2.3 From f13ef10059ccf5f4ed201cd050176df62ec25bb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jun 2021 07:27:37 -0700 Subject: net: annotate data race in sock_error() sock_error() is known to be racy. The code avoids an atomic operation is sk_err is zero, and this field could be changed under us, this is fine. Sysbot reported: BUG: KCSAN: data-race in sock_alloc_send_pskb / unix_release_sock write to 0xffff888131855630 of 4 bytes by task 9365 on cpu 1: unix_release_sock+0x2e9/0x6e0 net/unix/af_unix.c:550 unix_release+0x2f/0x50 net/unix/af_unix.c:859 __sock_release net/socket.c:599 [inline] sock_close+0x6c/0x150 net/socket.c:1258 __fput+0x25b/0x4e0 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0xae/0x130 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:174 [inline] exit_to_user_mode_prepare+0x156/0x190 kernel/entry/common.c:208 __syscall_exit_to_user_mode_work kernel/entry/common.c:290 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:301 do_syscall_64+0x56/0x90 arch/x86/entry/common.c:57 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888131855630 of 4 bytes by task 9385 on cpu 0: sock_error include/net/sock.h:2269 [inline] sock_alloc_send_pskb+0xe4/0x4e0 net/core/sock.c:2336 unix_dgram_sendmsg+0x478/0x1610 net/unix/af_unix.c:1671 unix_seqpacket_sendmsg+0xc2/0x100 net/unix/af_unix.c:2055 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 __sys_sendmsg_sock+0x25/0x30 net/socket.c:2416 io_sendmsg fs/io_uring.c:4367 [inline] io_issue_sqe+0x231a/0x6750 fs/io_uring.c:6135 __io_queue_sqe+0xe9/0x360 fs/io_uring.c:6414 __io_req_task_submit fs/io_uring.c:2039 [inline] io_async_task_func+0x312/0x590 fs/io_uring.c:5074 __tctx_task_work fs/io_uring.c:1910 [inline] tctx_task_work+0x1d4/0x3d0 fs/io_uring.c:1924 task_work_run+0xae/0x130 kernel/task_work.c:164 tracehook_notify_signal include/linux/tracehook.h:212 [inline] handle_signal_work kernel/entry/common.c:145 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0xf8/0x190 kernel/entry/common.c:208 __syscall_exit_to_user_mode_work kernel/entry/common.c:290 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:301 do_syscall_64+0x56/0x90 arch/x86/entry/common.c:57 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0x00000068 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 9385 Comm: syz-executor.3 Not tainted 5.13.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index 0e962d8bc73b..2fc513aa114c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2266,8 +2266,13 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk); static inline int sock_error(struct sock *sk) { int err; - if (likely(!sk->sk_err)) + + /* Avoid an atomic operation for the common case. + * This is racy since another cpu/thread can change sk_err under us. + */ + if (likely(data_race(!sk->sk_err))) return 0; + err = xchg(&sk->sk_err, 0); return -err; } -- cgit v1.2.3 From b71eaed8c04f72a919a9c44e83e4ee254e69e7f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jun 2021 07:44:11 -0700 Subject: inet: annotate date races around sk->sk_txhash UDP sendmsg() path can be lockless, it is possible for another thread to re-connect an change sk->sk_txhash under us. There is no serious impact, but we can use READ_ONCE()/WRITE_ONCE() pair to document the race. BUG: KCSAN: data-race in __ip4_datagram_connect / skb_set_owner_w write to 0xffff88813397920c of 4 bytes by task 30997 on cpu 1: sk_set_txhash include/net/sock.h:1937 [inline] __ip4_datagram_connect+0x69e/0x710 net/ipv4/datagram.c:75 __ip6_datagram_connect+0x551/0x840 net/ipv6/datagram.c:189 ip6_datagram_connect+0x2a/0x40 net/ipv6/datagram.c:272 inet_dgram_connect+0xfd/0x180 net/ipv4/af_inet.c:580 __sys_connect_file net/socket.c:1837 [inline] __sys_connect+0x245/0x280 net/socket.c:1854 __do_sys_connect net/socket.c:1864 [inline] __se_sys_connect net/socket.c:1861 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1861 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88813397920c of 4 bytes by task 31039 on cpu 0: skb_set_hash_from_sk include/net/sock.h:2211 [inline] skb_set_owner_w+0x118/0x220 net/core/sock.c:2101 sock_alloc_send_pskb+0x452/0x4e0 net/core/sock.c:2359 sock_alloc_send_skb+0x2d/0x40 net/core/sock.c:2373 __ip6_append_data+0x1743/0x21a0 net/ipv6/ip6_output.c:1621 ip6_make_skb+0x258/0x420 net/ipv6/ip6_output.c:1983 udpv6_sendmsg+0x160a/0x16b0 net/ipv6/udp.c:1527 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:642 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xbca3c43d -> 0xfdb309e0 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 31039 Comm: syz-executor.2 Not tainted 5.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 2fc513aa114c..7a7058f4f265 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1934,7 +1934,8 @@ static inline u32 net_tx_rndhash(void) static inline void sk_set_txhash(struct sock *sk) { - sk->sk_txhash = net_tx_rndhash(); + /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ + WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) @@ -2206,9 +2207,12 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock, static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { - if (sk->sk_txhash) { + /* This pairs with WRITE_ONCE() in sk_set_txhash() */ + u32 txhash = READ_ONCE(sk->sk_txhash); + + if (txhash) { skb->l4_hash = 1; - skb->hash = sk->sk_txhash; + skb->hash = txhash; } } -- cgit v1.2.3 From d1b5bee4c8be01585033be9b3a8878789285285f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jun 2021 09:00:12 -0700 Subject: net/packet: annotate data race in packet_sendmsg() There is a known race in packet_sendmsg(), addressed in commit 32d3182cd2cd ("net/packet: fix race in tpacket_snd()") Now we have data_race(), we can use it to avoid a future KCSAN warning, as syzbot loves stressing af_packet sockets :) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ae906eb4b269..74e6e45a8e84 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3034,10 +3034,13 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - if (po->tx_ring.pg_vec) + /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. + * tpacket_snd() will redo the check safely. + */ + if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); - else - return packet_snd(sock, msg, len); + + return packet_snd(sock, msg, len); } /* -- cgit v1.2.3 From 5fc177ab759418c9537433e63301096e733fb915 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 10 Jun 2021 19:40:29 +0300 Subject: netfilter: synproxy: Fix out of bounds when parsing TCP options The TCP option parser in synproxy (synproxy_parse_options) could read one byte out of bounds. When the length is 1, the execution flow gets into the loop, reads one byte of the opcode, and if the opcode is neither TCPOPT_EOL nor TCPOPT_NOP, it reads one more byte, which exceeds the length of 1. This fix is inspired by commit 9609dad263f8 ("ipv4: tcp_input: fix stack out of bounds when parsing TCP options."). v2 changes: Added an early return when length < 0 to avoid calling skb_header_pointer with negative length. Cc: Young Xiao <92siuyang@gmail.com> Fixes: 48b1de4c110a ("netfilter: add SYNPROXY core/target") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Florian Westphal Signed-off-by: David S. Miller --- net/netfilter/nf_synproxy_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index b100c04a0e43..3d6d49420db8 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -31,6 +31,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr; + if (unlikely(length < 0)) + return false; + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); if (ptr == NULL) return false; @@ -47,6 +50,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, length--; continue; default: + if (length < 2) + return true; opsize = *ptr++; if (opsize < 2) return true; -- cgit v1.2.3 From 07718be265680dcf496347d475ce1a5442f55ad7 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 10 Jun 2021 19:40:30 +0300 Subject: mptcp: Fix out of bounds when parsing TCP options The TCP option parser in mptcp (mptcp_get_options) could read one byte out of bounds. When the length is 1, the execution flow gets into the loop, reads one byte of the opcode, and if the opcode is neither TCPOPT_EOL nor TCPOPT_NOP, it reads one more byte, which exceeds the length of 1. This fix is inspired by commit 9609dad263f8 ("ipv4: tcp_input: fix stack out of bounds when parsing TCP options."). Cc: Young Xiao <92siuyang@gmail.com> Fixes: cec37a6e41aa ("mptcp: Handle MP_CAPABLE options for outgoing connections") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 6b825fb3fa83..9b263f27ce9b 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -356,6 +356,8 @@ void mptcp_get_options(const struct sk_buff *skb, length--; continue; default: + if (length < 2) + return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; -- cgit v1.2.3 From ba91c49dedbde758ba0b72f57ac90b06ddf8e548 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 10 Jun 2021 19:40:31 +0300 Subject: sch_cake: Fix out of bounds when parsing TCP options and header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TCP option parser in cake qdisc (cake_get_tcpopt and cake_tcph_may_drop) could read one byte out of bounds. When the length is 1, the execution flow gets into the loop, reads one byte of the opcode, and if the opcode is neither TCPOPT_EOL nor TCPOPT_NOP, it reads one more byte, which exceeds the length of 1. This fix is inspired by commit 9609dad263f8 ("ipv4: tcp_input: fix stack out of bounds when parsing TCP options."). v2 changes: Added doff validation in cake_get_tcphdr to avoid parsing garbage as TCP header. Although it wasn't strictly an out-of-bounds access (memory was allocated), garbage values could be read where CAKE expected the TCP header if doff was smaller than 5. Cc: Young Xiao <92siuyang@gmail.com> Fixes: 8b7138814f29 ("sch_cake: Add optional ACK filter") Signed-off-by: Maxim Mikityanskiy Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 7d37638ee1c7..5c15968b5155 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -943,7 +943,7 @@ static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, } tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (!tcph) + if (!tcph || tcph->doff < 5) return NULL; return skb_header_pointer(skb, offset, @@ -967,6 +967,8 @@ static const void *cake_get_tcpopt(const struct tcphdr *tcph, length--; continue; } + if (length < 2) + break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; @@ -1104,6 +1106,8 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph, length--; continue; } + if (length < 2) + break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; -- cgit v1.2.3 From 72f961320d5d15bfcb26dbe3edaa3f7d25fd2c8a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:40 -0700 Subject: mptcp: try harder to borrow memory from subflow under pressure If the host is under sever memory pressure, and RX forward memory allocation for the msk fails, we try to borrow the required memory from the ingress subflow. The current attempt is a bit flaky: if skb->truesize is less than SK_MEM_QUANTUM, the ssk will not release any memory, and the next schedule will fail again. Instead, directly move the required amount of pages from the ssk to the msk, if available Fixes: 9c3f94e1681b ("mptcp: add missing memory scheduling in the rx path") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5edc686faff1..534cf500521d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -280,11 +280,13 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, /* try to fetch required memory from subflow */ if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - if (ssk->sk_forward_alloc < skb->truesize) - goto drop; - __sk_mem_reclaim(ssk, skb->truesize); - if (!sk_rmem_schedule(sk, skb, skb->truesize)) + int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT; + + if (ssk->sk_forward_alloc < amount) goto drop; + + ssk->sk_forward_alloc -= amount; + sk->sk_forward_alloc += amount; } /* the skb map_seq accounts for the skb offset: -- cgit v1.2.3 From 99d1055ce2469dca3dd14be0991ff8133e25e3d0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:41 -0700 Subject: mptcp: wake-up readers only for in sequence data Currently we rely on the subflow->data_avail field, which is subject to races: ssk1 skb len = 500 DSS(seq=1, len=1000, off=0) # data_avail == MPTCP_SUBFLOW_DATA_AVAIL ssk2 skb len = 500 DSS(seq = 501, len=1000) # data_avail == MPTCP_SUBFLOW_DATA_AVAIL ssk1 skb len = 500 DSS(seq = 1, len=1000, off =500) # still data_avail == MPTCP_SUBFLOW_DATA_AVAIL, # as the skb is covered by a pre-existing map, # which was in-sequence at reception time. Instead we can explicitly check if some has been received in-sequence, propagating the info from __mptcp_move_skbs_from_subflow(). Additionally add the 'ONCE' annotation to the 'data_avail' memory access, as msk will read it outside the subflow socket lock. Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 33 ++++++++++++--------------------- net/mptcp/protocol.h | 1 - net/mptcp/subflow.c | 23 +++++++++-------------- 3 files changed, 21 insertions(+), 36 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 534cf500521d..f6e62a6dc9fb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -670,15 +670,13 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - mptcp_data_lock(sk); + return false; __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); @@ -690,7 +688,7 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); - mptcp_data_unlock(sk); + return moved > 0; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) @@ -698,7 +696,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); int sk_rbuf, ssk_rbuf; - bool wake; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing @@ -707,28 +704,22 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (unlikely(subflow->disposable)) return; - /* move_skbs_to_msk below can legitly clear the data_avail flag, - * but we will need later to properly woke the reader, cache its - * value - */ - wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL; - if (wake) - set_bit(MPTCP_DATA_READY, &msk->flags); - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) sk_rbuf = ssk_rbuf; - /* over limit? can't append more skbs to msk */ + /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) - goto wake; - - move_skbs_to_msk(msk, ssk); + return; -wake: - if (wake) + /* Wake-up the reader only for in-sequence data */ + mptcp_data_lock(sk); + if (move_skbs_to_msk(msk, ssk)) { + set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); + } + mptcp_data_unlock(sk); } static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) @@ -860,7 +851,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) sock_owned_by_me(sk); mptcp_for_each_subflow(msk, subflow) { - if (subflow->data_avail) + if (READ_ONCE(subflow->data_avail)) return mptcp_subflow_tcp_sock(subflow); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0c6f99c67345..385796f0ef19 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -362,7 +362,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk) enum mptcp_data_avail { MPTCP_SUBFLOW_NODATA, MPTCP_SUBFLOW_DATA_AVAIL, - MPTCP_SUBFLOW_OOO_DATA }; struct mptcp_delegated_action { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ef3d037f984a..ebb898acd65a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1000,7 +1000,7 @@ static bool subflow_check_data_avail(struct sock *ssk) struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) - subflow->data_avail = 0; + WRITE_ONCE(subflow->data_avail, 0); if (subflow->data_avail) return true; @@ -1039,18 +1039,13 @@ static bool subflow_check_data_avail(struct sock *ssk) ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, ack_seq); - if (ack_seq == old_ack) { - subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; - break; - } else if (after64(ack_seq, old_ack)) { - subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA; - break; + if (unlikely(before64(ack_seq, old_ack))) { + mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); + continue; } - /* only accept in-sequence mapping. Old values are spurious - * retransmission - */ - mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + break; } return true; @@ -1070,7 +1065,7 @@ fallback: subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); - subflow->data_avail = 0; + WRITE_ONCE(subflow->data_avail, 0); return false; } @@ -1080,7 +1075,7 @@ fallback: subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; - subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } @@ -1092,7 +1087,7 @@ bool mptcp_subflow_data_available(struct sock *sk) if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; - subflow->data_avail = 0; + WRITE_ONCE(subflow->data_avail, 0); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, -- cgit v1.2.3 From 61e710227e97172355d5f150d5c78c64175d9fb2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:42 -0700 Subject: mptcp: do not warn on bad input from the network warn_bad_map() produces a kernel WARN on bad input coming from the network. Use pr_debug() to avoid spamming the system log. Additionally, when the right bound check fails, warn_bad_map() reports the wrong ssn value, let's fix it. Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/107 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ebb898acd65a..e05e05ec9687 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -784,10 +784,10 @@ static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); } -static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { - WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", - ssn, subflow->map_subflow_seq, subflow->map_data_len); + pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) @@ -812,13 +812,13 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) /* Mapping covers data later in the subflow stream, * currently unsupported. */ - warn_bad_map(subflow, ssn); + dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ - warn_bad_map(subflow, ssn + skb->len); + dbg_bad_map(subflow, ssn); return false; } return true; -- cgit v1.2.3 From 2395da0e17935ce9158cdfae433962bdb6cbfa67 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:43 -0700 Subject: selftests: mptcp: enable syncookie only in absence of reorders Syncookie validation may fail for OoO packets, causing spurious resets and self-tests failures, so let's force syncookie only for tests iteration with no OoO. Fixes: fed61c4b584c ("selftests: mptcp: make 2nd net namespace use tcp syn cookies unconditionally") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/198 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 9ca5f1ba461e..2b495dc8d78e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -197,9 +197,6 @@ ip -net "$ns4" link set ns4eth3 up ip -net "$ns4" route add default via 10.0.3.2 ip -net "$ns4" route add default via dead:beef:3::2 -# use TCP syn cookies, even if no flooding was detected. -ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2 - set_ethtool_flags() { local ns="$1" local dev="$2" @@ -737,6 +734,14 @@ for sender in $ns1 $ns2 $ns3 $ns4;do exit $ret fi + # ns1<->ns2 is not subject to reordering/tc delays. Use it to test + # mptcp syncookie support. + if [ $sender = $ns1 ]; then + ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2 + else + ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1 + fi + run_tests "$ns2" $sender 10.0.1.2 run_tests "$ns2" $sender dead:beef:1::2 run_tests "$ns2" $sender 10.0.2.1 -- cgit v1.2.3 From 499ada5073361c631f2a3c4a8aed44d53b6f82ec Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:44 -0700 Subject: mptcp: fix soft lookup in subflow_error_report() Maxim reported a soft lookup in subflow_error_report(): watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:0] RIP: 0010:native_queued_spin_lock_slowpath RSP: 0018:ffffa859c0003bc0 EFLAGS: 00000202 RAX: 0000000000000101 RBX: 0000000000000001 RCX: 0000000000000000 RDX: ffff9195c2772d88 RSI: 0000000000000000 RDI: ffff9195c2772d88 RBP: ffff9195c2772d00 R08: 00000000000067b0 R09: c6e31da9eb1e44f4 R10: ffff9195ef379700 R11: ffff9195edb50710 R12: ffff9195c2772d88 R13: ffff9195f500e3d0 R14: ffff9195ef379700 R15: ffff9195ef379700 FS: 0000000000000000(0000) GS:ffff91961f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c000407000 CR3: 0000000002988000 CR4: 00000000000006f0 Call Trace: _raw_spin_lock_bh subflow_error_report mptcp_subflow_data_available __mptcp_move_skbs_from_subflow mptcp_data_ready tcp_data_queue tcp_rcv_established tcp_v4_do_rcv tcp_v4_rcv ip_protocol_deliver_rcu ip_local_deliver_finish __netif_receive_skb_one_core netif_receive_skb rtl8139_poll 8139too __napi_poll net_rx_action __do_softirq __irq_exit_rcu common_interrupt The calling function - mptcp_subflow_data_available() - can be invoked from different contexts: - plain ssk socket lock - ssk socket lock + mptcp_data_lock - ssk socket lock + mptcp_data_lock + msk socket lock. Since subflow_error_report() tries to acquire the mptcp_data_lock, the latter two call chains will cause soft lookup. This change addresses the issue moving the error reporting call to outer functions, where the held locks list is known and the we can acquire only the needed one. Reported-by: Maxim Galaganov Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/199 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++++++ net/mptcp/subflow.c | 75 +++++++++++++++++++++++++++------------------------- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f6e62a6dc9fb..632350018fb6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -680,6 +680,12 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); + if (unlikely(ssk->sk_err)) { + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &msk->flags); + } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but @@ -1948,6 +1954,9 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); tcp_cleanup_rbuf(ssk, moved); + + if (unlikely(ssk->sk_err)) + __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); } while (!done); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e05e05ec9687..be1de4084196 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1060,7 +1060,6 @@ fallback: * subflow_error_report() will introduce the appropriate barriers */ ssk->sk_err = EBADMSG; - ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; @@ -1115,41 +1114,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) *full_space = tcp_full_space(sk); } -static void subflow_data_ready(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - u16 state = 1 << inet_sk_state_load(sk); - struct sock *parent = subflow->conn; - struct mptcp_sock *msk; - - msk = mptcp_sk(parent); - if (state & TCPF_LISTEN) { - /* MPJ subflow are removed from accept queue before reaching here, - * avoid stray wakeups - */ - if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) - return; - - set_bit(MPTCP_DATA_READY, &msk->flags); - parent->sk_data_ready(parent); - return; - } - - WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && - !subflow->mp_join && !(state & TCPF_CLOSE)); - - if (mptcp_subflow_data_available(sk)) - mptcp_data_ready(parent, sk); -} - -static void subflow_write_space(struct sock *ssk) -{ - struct sock *sk = mptcp_subflow_ctx(ssk)->conn; - - mptcp_propagate_sndbuf(sk, ssk); - mptcp_write_space(sk); -} - void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; @@ -1190,6 +1154,43 @@ static void subflow_error_report(struct sock *ssk) mptcp_data_unlock(sk); } +static void subflow_data_ready(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + u16 state = 1 << inet_sk_state_load(sk); + struct sock *parent = subflow->conn; + struct mptcp_sock *msk; + + msk = mptcp_sk(parent); + if (state & TCPF_LISTEN) { + /* MPJ subflow are removed from accept queue before reaching here, + * avoid stray wakeups + */ + if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) + return; + + set_bit(MPTCP_DATA_READY, &msk->flags); + parent->sk_data_ready(parent); + return; + } + + WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && + !subflow->mp_join && !(state & TCPF_CLOSE)); + + if (mptcp_subflow_data_available(sk)) + mptcp_data_ready(parent, sk); + else if (unlikely(sk->sk_err)) + subflow_error_report(sk); +} + +static void subflow_write_space(struct sock *ssk) +{ + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + mptcp_propagate_sndbuf(sk, ssk); + mptcp_write_space(sk); +} + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@ -1500,6 +1501,8 @@ static void subflow_state_change(struct sock *sk) */ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); + else if (unlikely(sk->sk_err)) + subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); -- cgit v1.2.3 From da9ef50f545f86ffe6ff786174d26500c4db737a Mon Sep 17 00:00:00 2001 From: Praneeth Bajjuri Date: Wed, 9 Jun 2021 19:43:42 -0500 Subject: net: phy: dp83867: perform soft reset and retain established link Current logic is performing hard reset and causing the programmed registers to be wiped out. as per datasheet: https://www.ti.com/lit/ds/symlink/dp83867cr.pdf 8.6.26 Control Register (CTRL) do SW_RESTART to perform a reset not including the registers, If performed when link is already present, it will drop the link and trigger re-auto negotiation. Signed-off-by: Praneeth Bajjuri Signed-off-by: Geet Modi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 9bd9a5c0b1db..6bbc81ad295f 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -826,16 +826,12 @@ static int dp83867_phy_reset(struct phy_device *phydev) { int err; - err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESET); + err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESTART); if (err < 0) return err; usleep_range(10, 20); - /* After reset FORCE_LINK_GOOD bit is set. Although the - * default value should be unset. Disable FORCE_LINK_GOOD - * for the phy to work properly. - */ return phy_modify(phydev, MII_DP83867_PHYCTRL, DP83867_PHYCR_FORCE_LINK_GOOD, 0); } -- cgit v1.2.3 From 33e381448cf7a05d76ac0b47d4a6531ecd0e5c53 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 11 Jun 2021 08:13:39 +0200 Subject: alx: Fix an error handling path in 'alx_probe()' If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: ab69bde6b2e9 ("alx: add a simple AR816x/AR817x device driver") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/alx/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index b3d74332ed33..7748b276e5fd 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -1849,6 +1849,7 @@ out_free_netdev: free_netdev(netdev); out_pci_release: pci_release_mem_regions(pdev); + pci_disable_pcie_error_reporting(pdev); out_pci_disable: pci_disable_device(pdev); return err; -- cgit v1.2.3 From 42a2039753a7f758ba5c85cb199fcf10dc2111eb Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Fri, 11 Jun 2021 12:17:45 +0530 Subject: cxgb4: fix endianness when flashing boot image Boot images are copied to memory and updated with current underlying device ID before flashing them to adapter. Ensure the updated images are always flashed in Big Endian to allow the firmware to read the new images during boot properly. Fixes: 550883558f17 ("cxgb4: add support to flash boot image") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 44 ++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 9428ef1f04a8..1293505025c1 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -3060,16 +3060,19 @@ int t4_read_flash(struct adapter *adapter, unsigned int addr, * @addr: the start address to write * @n: length of data to write in bytes * @data: the data to write + * @byte_oriented: whether to store data as bytes or as words * * Writes up to a page of data (256 bytes) to the serial flash starting * at the given address. All the data must be written to the same page. + * If @byte_oriented is set the write data is stored as byte stream + * (i.e. matches what on disk), otherwise in big-endian. */ static int t4_write_flash(struct adapter *adapter, unsigned int addr, - unsigned int n, const u8 *data) + unsigned int n, const u8 *data, bool byte_oriented) { - int ret; - u32 buf[64]; unsigned int i, c, left, val, offset = addr & 0xff; + u32 buf[64]; + int ret; if (addr >= adapter->params.sf_size || offset + n > SF_PAGE_SIZE) return -EINVAL; @@ -3080,10 +3083,14 @@ static int t4_write_flash(struct adapter *adapter, unsigned int addr, (ret = sf1_write(adapter, 4, 1, 1, val)) != 0) goto unlock; - for (left = n; left; left -= c) { + for (left = n; left; left -= c, data += c) { c = min(left, 4U); - for (val = 0, i = 0; i < c; ++i) - val = (val << 8) + *data++; + for (val = 0, i = 0; i < c; ++i) { + if (byte_oriented) + val = (val << 8) + data[i]; + else + val = (val << 8) + data[c - i - 1]; + } ret = sf1_write(adapter, c, c != left, 1, val); if (ret) @@ -3096,7 +3103,8 @@ static int t4_write_flash(struct adapter *adapter, unsigned int addr, t4_write_reg(adapter, SF_OP_A, 0); /* unlock SF */ /* Read the page to verify the write succeeded */ - ret = t4_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1); + ret = t4_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, + byte_oriented); if (ret) return ret; @@ -3692,7 +3700,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) */ memcpy(first_page, fw_data, SF_PAGE_SIZE); ((struct fw_hdr *)first_page)->fw_ver = cpu_to_be32(0xffffffff); - ret = t4_write_flash(adap, fw_start, SF_PAGE_SIZE, first_page); + ret = t4_write_flash(adap, fw_start, SF_PAGE_SIZE, first_page, true); if (ret) goto out; @@ -3700,14 +3708,14 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) { addr += SF_PAGE_SIZE; fw_data += SF_PAGE_SIZE; - ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, fw_data); + ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, fw_data, true); if (ret) goto out; } - ret = t4_write_flash(adap, - fw_start + offsetof(struct fw_hdr, fw_ver), - sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver); + ret = t4_write_flash(adap, fw_start + offsetof(struct fw_hdr, fw_ver), + sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver, + true); out: if (ret) dev_err(adap->pdev_dev, "firmware download failed, error %d\n", @@ -10208,7 +10216,7 @@ int t4_load_cfg(struct adapter *adap, const u8 *cfg_data, unsigned int size) n = size - i; else n = SF_PAGE_SIZE; - ret = t4_write_flash(adap, addr, n, cfg_data); + ret = t4_write_flash(adap, addr, n, cfg_data, true); if (ret) goto out; @@ -10677,13 +10685,14 @@ int t4_load_boot(struct adapter *adap, u8 *boot_data, for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) { addr += SF_PAGE_SIZE; boot_data += SF_PAGE_SIZE; - ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, boot_data); + ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, boot_data, + false); if (ret) goto out; } ret = t4_write_flash(adap, boot_sector, SF_PAGE_SIZE, - (const u8 *)header); + (const u8 *)header, false); out: if (ret) @@ -10758,7 +10767,7 @@ int t4_load_bootcfg(struct adapter *adap, const u8 *cfg_data, unsigned int size) for (i = 0; i < size; i += SF_PAGE_SIZE) { n = min_t(u32, size - i, SF_PAGE_SIZE); - ret = t4_write_flash(adap, addr, n, cfg_data); + ret = t4_write_flash(adap, addr, n, cfg_data, false); if (ret) goto out; @@ -10770,7 +10779,8 @@ int t4_load_bootcfg(struct adapter *adap, const u8 *cfg_data, unsigned int size) for (i = 0; i < npad; i++) { u8 data = 0; - ret = t4_write_flash(adap, cfg_addr + size + i, 1, &data); + ret = t4_write_flash(adap, cfg_addr + size + i, 1, &data, + false); if (ret) goto out; } -- cgit v1.2.3 From f046bd0ae15d8a0bbe57d4647da182420f720c3d Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Fri, 11 Jun 2021 12:17:46 +0530 Subject: cxgb4: fix sleep in atomic when flashing PHY firmware Before writing new PHY firmware to on-chip memory, driver queries firmware for current running PHY firmware version, which can result in sleep waiting for reply. So, move spinlock closer to the actual on-chip memory write operation, instead of taking it at the callers. Fixes: 5fff701c838e ("cxgb4: always sync access when flashing PHY firmware") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 2 -- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 -- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 ++ 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 61ea3ec5c3fc..bc2de01d0539 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -1337,9 +1337,7 @@ static int cxgb4_ethtool_flash_phy(struct net_device *netdev, return ret; } - spin_lock_bh(&adap->win0_lock); ret = t4_load_phy_fw(adap, MEMWIN_NIC, NULL, data, size); - spin_unlock_bh(&adap->win0_lock); if (ret) dev_err(adap->pdev_dev, "Failed to load PHY FW\n"); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 1f601de02e70..762113a04dde 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4424,10 +4424,8 @@ static int adap_init0_phy(struct adapter *adap) /* Load PHY Firmware onto adapter. */ - spin_lock_bh(&adap->win0_lock); ret = t4_load_phy_fw(adap, MEMWIN_NIC, phy_info->phy_fw_version, (u8 *)phyf->data, phyf->size); - spin_unlock_bh(&adap->win0_lock); if (ret < 0) dev_err(adap->pdev_dev, "PHY Firmware transfer error %d\n", -ret); diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 1293505025c1..a0555f4d76fc 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -3820,9 +3820,11 @@ int t4_load_phy_fw(struct adapter *adap, int win, /* Copy the supplied PHY Firmware image to the adapter memory location * allocated by the adapter firmware. */ + spin_lock_bh(&adap->win0_lock); ret = t4_memory_rw(adap, win, mtype, maddr, phy_fw_size, (__be32 *)phy_fw_data, T4_MEMORY_WRITE); + spin_unlock_bh(&adap->win0_lock); if (ret) return ret; -- cgit v1.2.3 From 6d297540f75d759489054e8b07932208fc4db2cb Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Fri, 11 Jun 2021 12:17:47 +0530 Subject: cxgb4: halt chip before flashing PHY firmware image When using firmware-assisted PHY firmware image write to flash, halt the chip before beginning the flash write operation to allow the running firmware to store the image persistently. Otherwise, the running firmware will only store the PHY image in local on-chip RAM, which will be lost after next reset. Fixes: 4ee339e1e92a ("cxgb4: add support to flash PHY image") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index bc2de01d0539..df20485b5744 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -1337,11 +1337,27 @@ static int cxgb4_ethtool_flash_phy(struct net_device *netdev, return ret; } + /* We have to RESET the chip/firmware because we need the + * chip in uninitialized state for loading new PHY image. + * Otherwise, the running firmware will only store the PHY + * image in local RAM which will be lost after next reset. + */ + ret = t4_fw_reset(adap, adap->mbox, PIORSTMODE_F | PIORST_F); + if (ret < 0) { + dev_err(adap->pdev_dev, + "Set FW to RESET for flashing PHY FW failed. ret: %d\n", + ret); + return ret; + } + ret = t4_load_phy_fw(adap, MEMWIN_NIC, NULL, data, size); - if (ret) - dev_err(adap->pdev_dev, "Failed to load PHY FW\n"); + if (ret < 0) { + dev_err(adap->pdev_dev, "Failed to load PHY FW. ret: %d\n", + ret); + return ret; + } - return ret; + return 0; } static int cxgb4_ethtool_flash_fw(struct net_device *netdev, -- cgit v1.2.3 From 1adb20f0d496b2c61e9aa1f4761b8d71f93d258e Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 11 Jun 2021 15:16:11 +0800 Subject: net: stmmac: dwmac1000: Fix extended MAC address registers definition The register starts from 0x800 is the 16th MAC address register rather than the first one. Fixes: cffb13f4d6fb ("stmmac: extend mac addr reg and fix perfect filering") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac1000.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h index b70d44ac0990..3c73453725f9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h @@ -76,10 +76,10 @@ enum power_event { #define LPI_CTRL_STATUS_TLPIEN 0x00000001 /* Transmit LPI Entry */ /* GMAC HW ADDR regs */ -#define GMAC_ADDR_HIGH(reg) (((reg > 15) ? 0x00000800 : 0x00000040) + \ - (reg * 8)) -#define GMAC_ADDR_LOW(reg) (((reg > 15) ? 0x00000804 : 0x00000044) + \ - (reg * 8)) +#define GMAC_ADDR_HIGH(reg) ((reg > 15) ? 0x00000800 + (reg - 16) * 8 : \ + 0x00000040 + (reg * 8)) +#define GMAC_ADDR_LOW(reg) ((reg > 15) ? 0x00000804 + (reg - 16) * 8 : \ + 0x00000044 + (reg * 8)) #define GMAC_MAX_PERFECT_ADDRESSES 1 #define GMAC_PCS_BASE 0x000000c0 /* PCS register base */ -- cgit v1.2.3 From ea6932d70e223e02fea3ae20a4feff05d7c1ea9a Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 11 Jun 2021 22:29:59 +0800 Subject: net: make get_net_ns return error if NET_NS is disabled There is a panic in socket ioctl cmd SIOCGSKNS when NET_NS is not enabled. The reason is that nsfs tries to access ns->ops but the proc_ns_operations is not implemented in this case. [7.670023] Unable to handle kernel NULL pointer dereference at virtual address 00000010 [7.670268] pgd = 32b54000 [7.670544] [00000010] *pgd=00000000 [7.671861] Internal error: Oops: 5 [#1] SMP ARM [7.672315] Modules linked in: [7.672918] CPU: 0 PID: 1 Comm: systemd Not tainted 5.13.0-rc3-00375-g6799d4f2da49 #16 [7.673309] Hardware name: Generic DT based system [7.673642] PC is at nsfs_evict+0x24/0x30 [7.674486] LR is at clear_inode+0x20/0x9c The same to tun SIOCGSKNS command. To fix this problem, we make get_net_ns() return -EINVAL when NET_NS is disabled. Meanwhile move it to right place net/core/net_namespace.c. Signed-off-by: Changbin Du Fixes: c62cce2caee5 ("net: add an ioctl to get a socket network namespace") Cc: Cong Wang Cc: Jakub Kicinski Cc: David Laight Cc: Christian Brauner Suggested-by: Jakub Kicinski Acked-by: Christian Brauner Signed-off-by: David S. Miller --- include/linux/socket.h | 2 -- include/net/net_namespace.h | 7 +++++++ net/core/net_namespace.c | 12 ++++++++++++ net/socket.c | 13 ------------- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index b8fc5c53ba6f..0d8e3dcb7f88 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -438,6 +438,4 @@ extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); - -extern struct ns_common *get_net_ns(struct ns_common *ns); #endif /* _LINUX_SOCKET_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fa5887143f0d..6412d7833d97 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -184,6 +184,8 @@ struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); void net_ns_barrier(void); + +struct ns_common *get_net_ns(struct ns_common *ns); #else /* CONFIG_NET_NS */ #include #include @@ -203,6 +205,11 @@ static inline void net_ns_get_ownership(const struct net *net, } static inline void net_ns_barrier(void) {} + +static inline struct ns_common *get_net_ns(struct ns_common *ns) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_NET_NS */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 43b6ac4c4439..cc8dafb25d61 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -641,6 +641,18 @@ void __put_net(struct net *net) } EXPORT_SYMBOL_GPL(__put_net); +/** + * get_net_ns - increment the refcount of the network namespace + * @ns: common namespace (net) + * + * Returns the net's common namespace. + */ +struct ns_common *get_net_ns(struct ns_common *ns) +{ + return &get_net(container_of(ns, struct net, ns))->ns; +} +EXPORT_SYMBOL_GPL(get_net_ns); + struct net *get_net_ns_by_fd(int fd) { struct file *file; diff --git a/net/socket.c b/net/socket.c index 27e3e7d53f8e..4f2c6d2795d0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1072,19 +1072,6 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * what to do with it - that's up to the protocol still. */ -/** - * get_net_ns - increment the refcount of the network namespace - * @ns: common namespace (net) - * - * Returns the net's common namespace. - */ - -struct ns_common *get_net_ns(struct ns_common *ns) -{ - return &get_net(container_of(ns, struct net, ns))->ns; -} -EXPORT_SYMBOL_GPL(get_net_ns); - static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct socket *sock; -- cgit v1.2.3 From 4f667b8e049e716a0533fc927f50310fe6e40d22 Mon Sep 17 00:00:00 2001 From: Tyson Moore Date: Sat, 12 Jun 2021 02:54:11 -0400 Subject: sch_cake: revise docs for RFC 8622 LE PHB support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b8392808eb3fc28e ("sch_cake: add RFC 8622 LE PHB support to CAKE diffserv handling") added the LE mark to the Bulk tin. Update the comments to reflect the change. Signed-off-by: Tyson Moore Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 5c15968b5155..951542843cab 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2342,7 +2342,7 @@ static int cake_config_precedence(struct Qdisc *sch) /* List of known Diffserv codepoints: * - * Least Effort (CS1) + * Least Effort (CS1, LE) * Best Effort (CS0) * Max Reliability & LLT "Lo" (TOS1) * Max Throughput (TOS2) @@ -2364,7 +2364,7 @@ static int cake_config_precedence(struct Qdisc *sch) * Total 25 codepoints. */ -/* List of traffic classes in RFC 4594: +/* List of traffic classes in RFC 4594, updated by RFC 8622: * (roughly descending order of contended priority) * (roughly ascending order of uncontended throughput) * @@ -2379,7 +2379,7 @@ static int cake_config_precedence(struct Qdisc *sch) * Ops, Admin, Management (CS2,TOS1) - eg. ssh * Standard Service (CS0 & unrecognised codepoints) * High Throughput Data (AF1x,TOS2) - eg. web traffic - * Low Priority Data (CS1) - eg. BitTorrent + * Low Priority Data (CS1,LE) - eg. BitTorrent * Total 12 traffic classes. */ @@ -2395,7 +2395,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) * Video Streaming (AF4x, AF3x, CS3) * Bog Standard (CS0 etc.) * High Throughput (AF1x, TOS2) - * Background Traffic (CS1) + * Background Traffic (CS1, LE) * * Total 8 traffic classes. */ @@ -2439,7 +2439,7 @@ static int cake_config_diffserv4(struct Qdisc *sch) * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) * Best Effort (CS0, AF1x, TOS2, and those not specified) - * Background Traffic (CS1) + * Background Traffic (CS1, LE) * * Total 4 traffic classes. */ @@ -2477,7 +2477,7 @@ static int cake_config_diffserv4(struct Qdisc *sch) static int cake_config_diffserv3(struct Qdisc *sch) { /* Simplified Diffserv structure with 3 tins. - * Low Priority (CS1) + * Low Priority (CS1, LE) * Best Effort * Latency Sensitive (TOS4, VA, EF, CS6, CS7) */ -- cgit v1.2.3 From 994c393bb6886d6d94d628475b274a8cb3fc67a4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 11 Jun 2021 13:26:00 -0500 Subject: net: qualcomm: rmnet: don't over-count statistics The purpose of the loop using u64_stats_fetch_*_irq() is to ensure statistics on a given CPU are collected atomically. If one of the statistics values gets updated within the begin/retry window, the loop will run again. Currently the statistics totals are updated inside that window. This means that if the loop ever retries, the statistics for the CPU will be counted more than once. Fix this by taking a snapshot of a CPU's statistics inside the protected window, and then updating the counters with the snapshot values after exiting the loop. (Also add a newline at the end of this file...) Fixes: 192c4b5d48f2a ("net: qualcomm: rmnet: Add support for 64 bit stats") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index 41fbd2ceeede..ab1e0fcccabb 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -126,24 +126,24 @@ static void rmnet_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct rmnet_priv *priv = netdev_priv(dev); - struct rmnet_vnd_stats total_stats; + struct rmnet_vnd_stats total_stats = { }; struct rmnet_pcpu_stats *pcpu_ptr; + struct rmnet_vnd_stats snapshot; unsigned int cpu, start; - memset(&total_stats, 0, sizeof(struct rmnet_vnd_stats)); - for_each_possible_cpu(cpu) { pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { start = u64_stats_fetch_begin_irq(&pcpu_ptr->syncp); - total_stats.rx_pkts += pcpu_ptr->stats.rx_pkts; - total_stats.rx_bytes += pcpu_ptr->stats.rx_bytes; - total_stats.tx_pkts += pcpu_ptr->stats.tx_pkts; - total_stats.tx_bytes += pcpu_ptr->stats.tx_bytes; + snapshot = pcpu_ptr->stats; /* struct assignment */ } while (u64_stats_fetch_retry_irq(&pcpu_ptr->syncp, start)); - total_stats.tx_drops += pcpu_ptr->stats.tx_drops; + total_stats.rx_pkts += snapshot.rx_pkts; + total_stats.rx_bytes += snapshot.rx_bytes; + total_stats.tx_pkts += snapshot.tx_pkts; + total_stats.tx_bytes += snapshot.tx_bytes; + total_stats.tx_drops += snapshot.tx_drops; } s->rx_packets = total_stats.rx_pkts; @@ -354,4 +354,4 @@ int rmnet_vnd_update_dev_mtu(struct rmnet_port *port, } return 0; -} \ No newline at end of file +} -- cgit v1.2.3 From e175aef902697826d344ce3a12189329848fe898 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 11 Jun 2021 18:49:48 -0700 Subject: ethtool: strset: fix message length calculation Outer nest for ETHTOOL_A_STRSET_STRINGSETS is not accounted for. This may result in ETHTOOL_MSG_STRSET_GET producing a warning like: calculated message payload length (684) not sufficient WARNING: CPU: 0 PID: 30967 at net/ethtool/netlink.c:369 ethnl_default_doit+0x87a/0xa20 and a splat. As usually with such warnings three conditions must be met for the warning to trigger: - there must be no skb size rounding up (e.g. reply_size of 684); - string set must be per-device (so that the header gets populated); - the device name must be at least 12 characters long. all in all with current user space it looks like reading priv flags is the only place this could potentially happen. Or with syzbot :) Reported-by: syzbot+59aa77b92d06cd5a54f2@syzkaller.appspotmail.com Fixes: 71921690f974 ("ethtool: provide string sets with STRSET_GET request") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/strset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index b3029fff715d..2d51b7ab4dc5 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -353,6 +353,8 @@ static int strset_reply_size(const struct ethnl_req_info *req_base, int len = 0; int ret; + len += nla_total_size(0); /* ETHTOOL_A_STRSET_STRINGSETS */ + for (i = 0; i < ETH_SS_COUNT; i++) { const struct strset_info *set_info = &data->sets[i]; -- cgit v1.2.3 From cb3376604a676e0302258b01893911bdd7aa5278 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 12 Jun 2021 14:37:46 +0200 Subject: qlcnic: Fix an error handling path in 'qlcnic_probe()' If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: 451724c821c1 ("qlcnic: aer support") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 96b947fde646..3beafc60747e 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -2690,6 +2690,7 @@ err_out_free_hw_res: kfree(ahw); err_out_free_res: + pci_disable_pcie_error_reporting(pdev); pci_release_regions(pdev); err_out_disable_pdev: -- cgit v1.2.3 From 49a10c7b176295f8fafb338911cf028e97f65f4d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 12 Jun 2021 14:53:12 +0200 Subject: netxen_nic: Fix an error handling path in 'netxen_nic_probe()' If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: e87ad5539343 ("netxen: support pci error handlers") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 7e6bac85495d..344ea1143454 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -1602,6 +1602,8 @@ err_out_free_netdev: free_netdev(netdev); err_out_free_res: + if (NX_IS_REVISION_P3(pdev->revision)) + pci_disable_pcie_error_reporting(pdev); pci_release_regions(pdev); err_out_disable_pdev: -- cgit v1.2.3 From 09427c1915f754ebe7d3d8e54e79bbee48afe916 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Sat, 12 Jun 2021 19:20:44 +0530 Subject: cxgb4: fix wrong ethtool n-tuple rule lookup The TID returned during successful filter creation is relative to the region in which the filter is created. Using it directly always returns Hi Prio/Normal filter region's entry for the first couple of entries, even though the rule is actually inserted in Hash region. Fix by analyzing in which region the filter has been inserted and save the absolute TID to be used for lookup later. Fixes: db43b30cd89c ("cxgb4: add ethtool n-tuple filter deletion") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 24 ++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index df20485b5744..83ed10ac8660 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -1624,16 +1624,14 @@ static struct filter_entry *cxgb4_get_filter_entry(struct adapter *adap, u32 ftid) { struct tid_info *t = &adap->tids; - struct filter_entry *f; - if (ftid < t->nhpftids) - f = &adap->tids.hpftid_tab[ftid]; - else if (ftid < t->nftids) - f = &adap->tids.ftid_tab[ftid - t->nhpftids]; - else - f = lookup_tid(&adap->tids, ftid); + if (ftid >= t->hpftid_base && ftid < t->hpftid_base + t->nhpftids) + return &t->hpftid_tab[ftid - t->hpftid_base]; + + if (ftid >= t->ftid_base && ftid < t->ftid_base + t->nftids) + return &t->ftid_tab[ftid - t->ftid_base]; - return f; + return lookup_tid(t, ftid); } static void cxgb4_fill_filter_rule(struct ethtool_rx_flow_spec *fs, @@ -1840,6 +1838,11 @@ static int cxgb4_ntuple_del_filter(struct net_device *dev, filter_id = filter_info->loc_array[cmd->fs.location]; f = cxgb4_get_filter_entry(adapter, filter_id); + if (f->fs.prio) + filter_id -= adapter->tids.hpftid_base; + else if (!f->fs.hash) + filter_id -= (adapter->tids.ftid_base - adapter->tids.nhpftids); + ret = cxgb4_flow_rule_destroy(dev, f->fs.tc_prio, &f->fs, filter_id); if (ret) goto err; @@ -1899,6 +1902,11 @@ static int cxgb4_ntuple_set_filter(struct net_device *netdev, filter_info = &adapter->ethtool_filters->port[pi->port_id]; + if (fs.prio) + tid += adapter->tids.hpftid_base; + else if (!fs.hash) + tid += (adapter->tids.ftid_base - adapter->tids.nhpftids); + filter_info->loc_array[cmd->fs.location] = tid; set_bit(cmd->fs.location, filter_info->bmap); filter_info->in_use++; -- cgit v1.2.3 From 58af3d3d54e87bfc1f936e16c04ade3369d34011 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sat, 12 Jun 2021 17:51:22 +0300 Subject: net: caif: fix memory leak in ldisc_open Syzbot reported memory leak in tty_init_dev(). The problem was in unputted tty in ldisc_open() static int ldisc_open(struct tty_struct *tty) { ... ser->tty = tty_kref_get(tty); ... result = register_netdevice(dev); if (result) { rtnl_unlock(); free_netdev(dev); return -ENODEV; } ... } Ser pointer is netdev private_data, so after free_netdev() this pointer goes away with unputted tty reference. So, fix it by adding tty_kref_put() before freeing netdev. Reported-and-tested-by: syzbot+f303e045423e617d2cad@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/caif/caif_serial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c index d17482395a4d..4ffbfd534f18 100644 --- a/drivers/net/caif/caif_serial.c +++ b/drivers/net/caif/caif_serial.c @@ -350,6 +350,7 @@ static int ldisc_open(struct tty_struct *tty) rtnl_lock(); result = register_netdevice(dev); if (result) { + tty_kref_put(tty); rtnl_unlock(); free_netdev(dev); return -ENODEV; -- cgit v1.2.3 From b87b04f5019e821c8c6c7761f258402e43500a1f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 12 Jun 2021 18:24:59 -0600 Subject: ipv4: Fix device used for dst_alloc with local routes Oliver reported a use case where deleting a VRF device can hang waiting for the refcnt to drop to 0. The root cause is that the dst is allocated against the VRF device but cached on the loopback device. The use case (added to the selftests) has an implicit VRF crossing due to the ordering of the FIB rules (lookup local is before the l3mdev rule, but the problem occurs even if the FIB rules are re-ordered with local after l3mdev because the VRF table does not have a default route to terminate the lookup). The end result is is that the FIB lookup returns the loopback device as the nexthop, but the ingress device is in a VRF. The mismatch causes the dst alloc against the VRF device but then cached on the loopback. The fix is to bring the trick used for IPv6 (see ip6_rt_get_dev_rcu): pick the dst alloc device based the fib lookup result but with checks that the result has a nexthop device (e.g., not an unreachable or prohibit entry). Fixes: f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev if relevant") Reported-by: Oliver Herms Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 15 ++++++++++++++- tools/testing/selftests/net/fib_tests.sh | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6787c55f6ab..6a36ac98476f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2056,6 +2056,19 @@ martian_source: return err; } +/* get device for dst_alloc with local routes */ +static struct net_device *ip_rt_get_dev(struct net *net, + const struct fib_result *res) +{ + struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; + struct net_device *dev = NULL; + + if (nhc) + dev = l3mdev_master_dev_rcu(nhc->nhc_dev); + + return dev ? : net->loopback_dev; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet @@ -2212,7 +2225,7 @@ local_input: } } - rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, + rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 76d9487fb03c..5abe92d55b69 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -1384,12 +1384,37 @@ ipv4_rt_replace() ipv4_rt_replace_mpath } +# checks that cached input route on VRF port is deleted +# when VRF is deleted +ipv4_local_rt_cache() +{ + run_cmd "ip addr add 10.0.0.1/32 dev lo" + run_cmd "ip netns add test-ns" + run_cmd "ip link add veth-outside type veth peer name veth-inside" + run_cmd "ip link add vrf-100 type vrf table 1100" + run_cmd "ip link set veth-outside master vrf-100" + run_cmd "ip link set veth-inside netns test-ns" + run_cmd "ip link set veth-outside up" + run_cmd "ip link set vrf-100 up" + run_cmd "ip route add 10.1.1.1/32 dev veth-outside table 1100" + run_cmd "ip netns exec test-ns ip link set veth-inside up" + run_cmd "ip netns exec test-ns ip addr add 10.1.1.1/32 dev veth-inside" + run_cmd "ip netns exec test-ns ip route add 10.0.0.1/32 dev veth-inside" + run_cmd "ip netns exec test-ns ip route add default via 10.0.0.1" + run_cmd "ip netns exec test-ns ping 10.0.0.1 -c 1 -i 1" + run_cmd "ip link delete vrf-100" + + # if we do not hang test is a success + log_test $? 0 "Cached route removed from VRF port device" +} + ipv4_route_test() { route_setup ipv4_rt_add ipv4_rt_replace + ipv4_local_rt_cache route_cleanup } -- cgit v1.2.3 From ad9d24c9429e2159d1e279dc3a83191ccb4daf1d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Mon, 14 Jun 2021 15:06:50 +0300 Subject: net: qrtr: fix OOB Read in qrtr_endpoint_post Syzbot reported slab-out-of-bounds Read in qrtr_endpoint_post. The problem was in wrong _size_ type: if (len != ALIGN(size, 4) + hdrlen) goto err; If size from qrtr_hdr is 4294967293 (0xfffffffd), the result of ALIGN(size, 4) will be 0. In case of len == hdrlen and size == 4294967293 in header this check won't fail and skb_put_data(skb, data + hdrlen, size); will read out of bound from data, which is hdrlen allocated block. Fixes: 194ccc88297a ("net: qrtr: Support decoding incoming v2 packets") Reported-and-tested-by: syzbot+1917d778024161609247@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Reviewed-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c0477bec09bd..f2efaa4225f9 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -436,7 +436,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; - unsigned int size; + size_t size; unsigned int ver; size_t hdrlen; -- cgit v1.2.3 From 995fca15b73ff8f92888cc2d5d95f17ffdac74ba Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 14 Jun 2021 10:46:44 -0700 Subject: Bluetooth: SMP: Fix crash when receiving new connection when debug is enabled When receiving a new connection pchan->conn won't be initialized so the code cannot use bt_dev_dbg as the pointer to hci_dev won't be accessible. Fixes: 2e1614f7d61e4 ("Bluetooth: SMP: Convert BT_ERR/BT_DBG to bt_dev_err/bt_dev_dbg") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 372e3b25aaa4..7dd51da73845 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3229,7 +3229,7 @@ static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; - bt_dev_dbg(pchan->conn->hcon->hdev, "pchan %p", pchan); + BT_DBG("pchan %p", pchan); chan = l2cap_chan_create(); if (!chan) @@ -3250,7 +3250,7 @@ static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) */ atomic_set(&chan->nesting, L2CAP_NESTING_SMP); - bt_dev_dbg(pchan->conn->hcon->hdev, "created chan %p", chan); + BT_DBG("created chan %p", chan); return chan; } @@ -3354,7 +3354,7 @@ static void smp_del_chan(struct l2cap_chan *chan) { struct smp_dev *smp; - bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan); + BT_DBG("chan %p", chan); smp = chan->data; if (smp) { -- cgit v1.2.3 From d203b0fd863a2261e5d00b97f3d060c4c2a6db71 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 13:03:30 +0000 Subject: bpf: Inherit expanded/patched seen count from old aux data Instead of relying on current env->pass_cnt, use the seen count from the old aux data in adjust_insn_aux_data(), and expand it to the new range of patched instructions. This change is valid given we always expand 1:n with n>=1, so what applies to the old/original instruction needs to apply for the replacement as well. Not relying on env->pass_cnt is a prerequisite for a later change where we want to avoid marking an instruction seen when verified under speculative execution path. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94ba5163d4c5..f93c7befb5dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11366,6 +11366,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; + u32 old_seen = old_data[off].seen; u32 prog_len; int i; @@ -11386,7 +11387,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = env->pass_cnt; + /* Expand insni[off]'s seen count to the patched range. */ + new_data[i].seen = old_seen; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; -- cgit v1.2.3 From fe9a5ca7e370e613a9a75a13008a3845ea759d6e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 13:47:27 +0000 Subject: bpf: Do not mark insn as seen under speculative path verification ... in such circumstances, we do not want to mark the instruction as seen given the goal is still to jmp-1 rewrite/sanitize dead code, if it is not reachable from the non-speculative path verification. We do however want to verify it for safety regardless. With the patch as-is all the insns that have been marked as seen before the patch will also be marked as seen after the patch (just with a potentially different non-zero count). An upcoming patch will also verify paths that are unreachable in the non-speculative domain, hence this extension is needed. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f93c7befb5dc..af88d9b9c014 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6572,6 +6572,19 @@ do_sim: return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. + */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -10630,7 +10643,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -10857,7 +10870,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -12712,6 +12725,9 @@ static void free_states(struct bpf_verifier_env *env) * insn_aux_data was touched. These variables are compared to clear temporary * data from failed pass. For testing and experiments do_check_common() can be * run multiple times even when prior attempt to verify is unsuccessful. + * + * Note that special handling is needed on !env->bypass_spec_v1 if this is + * ever called outside of error path with subsequent program rejection. */ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 9183671af6dbf60a1219371d4ed73e23f43b49db Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 May 2021 15:47:32 +0000 Subject: bpf: Fix leakage under speculation on mispredicted branches The verifier only enumerates valid control-flow paths and skips paths that are unreachable in the non-speculative domain. And so it can miss issues under speculative execution on mispredicted branches. For example, a type confusion has been demonstrated with the following crafted program: // r0 = pointer to a map array entry // r6 = pointer to readable stack slot // r9 = scalar controlled by attacker 1: r0 = *(u64 *)(r0) // cache miss 2: if r0 != 0x0 goto line 4 3: r6 = r9 4: if r0 != 0x1 goto line 6 5: r9 = *(u8 *)(r6) 6: // leak r9 Since line 3 runs iff r0 == 0 and line 5 runs iff r0 == 1, the verifier concludes that the pointer dereference on line 5 is safe. But: if the attacker trains both the branches to fall-through, such that the following is speculatively executed ... r6 = r9 r9 = *(u8 *)(r6) // leak r9 ... then the program will dereference an attacker-controlled value and could leak its content under speculative execution via side-channel. This requires to mistrain the branch predictor, which can be rather tricky, because the branches are mutually exclusive. However such training can be done at congruent addresses in user space using different branches that are not mutually exclusive. That is, by training branches in user space ... A: if r0 != 0x0 goto line C B: ... C: if r0 != 0x0 goto line D D: ... ... such that addresses A and C collide to the same CPU branch prediction entries in the PHT (pattern history table) as those of the BPF program's lines 2 and 4, respectively. A non-privileged attacker could simply brute force such collisions in the PHT until observing the attack succeeding. Alternative methods to mistrain the branch predictor are also possible that avoid brute forcing the collisions in the PHT. A reliable attack has been demonstrated, for example, using the following crafted program: // r0 = pointer to a [control] map array entry // r7 = *(u64 *)(r0 + 0), training/attack phase // r8 = *(u64 *)(r0 + 8), oob address // [...] // r0 = pointer to a [data] map array entry 1: if r7 == 0x3 goto line 3 2: r8 = r0 // crafted sequence of conditional jumps to separate the conditional // branch in line 193 from the current execution flow 3: if r0 != 0x0 goto line 5 4: if r0 == 0x0 goto exit 5: if r0 != 0x0 goto line 7 6: if r0 == 0x0 goto exit [...] 187: if r0 != 0x0 goto line 189 188: if r0 == 0x0 goto exit // load any slowly-loaded value (due to cache miss in phase 3) ... 189: r3 = *(u64 *)(r0 + 0x1200) // ... and turn it into known zero for verifier, while preserving slowly- // loaded dependency when executing: 190: r3 &= 1 191: r3 &= 2 // speculatively bypassed phase dependency 192: r7 += r3 193: if r7 == 0x3 goto exit 194: r4 = *(u8 *)(r8 + 0) // leak r4 As can be seen, in training phase (phase != 0x3), the condition in line 1 turns into false and therefore r8 with the oob address is overridden with the valid map value address, which in line 194 we can read out without issues. However, in attack phase, line 2 is skipped, and due to the cache miss in line 189 where the map value is (zeroed and later) added to the phase register, the condition in line 193 takes the fall-through path due to prior branch predictor training, where under speculation, it'll load the byte at oob address r8 (unknown scalar type at that point) which could then be leaked via side-channel. One way to mitigate these is to 'branch off' an unreachable path, meaning, the current verification path keeps following the is_branch_taken() path and we push the other branch to the verification stack. Given this is unreachable from the non-speculative domain, this branch's vstate is explicitly marked as speculative. This is needed for two reasons: i) if this path is solely seen from speculative execution, then we later on still want the dead code elimination to kick in in order to sanitize these instructions with jmp-1s, and ii) to ensure that paths walked in the non-speculative domain are not pruned from earlier walks of paths walked in the speculative domain. Additionally, for robustness, we mark the registers which have been part of the conditional as unknown in the speculative path given there should be no assumptions made on their content. The fix in here mitigates type confusion attacks described earlier due to i) all code paths in the BPF program being explored and ii) existing verifier logic already ensuring that given memory access instruction references one specific data structure. An alternative to this fix that has also been looked at in this scope was to mark aux->alu_state at the jump instruction with a BPF_JMP_TAKEN state as well as direction encoding (always-goto, always-fallthrough, unknown), such that mixing of different always-* directions themselves as well as mixing of always-* with unknown directions would cause a program rejection by the verifier, e.g. programs with constructs like 'if ([...]) { x = 0; } else { x = 1; }' with subsequent 'if (x == 1) { [...] }'. For unprivileged, this would result in only single direction always-* taken paths, and unknown taken paths being allowed, such that the former could be patched from a conditional jump to an unconditional jump (ja). Compared to this approach here, it would have two downsides: i) valid programs that otherwise are not performing any pointer arithmetic, etc, would potentially be rejected/broken, and ii) we are required to turn off path pruning for unprivileged, where both can be avoided in this work through pushing the invalid branch to the verification stack. The issue was originally discovered by Adam and Ofek, and later independently discovered and reported as a result of Benedict and Piotr's research work. Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Adam Morrison Reported-by: Ofek Kirzner Reported-by: Benedict Schlueter Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af88d9b9c014..c6a27574242d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6483,6 +6483,27 @@ struct bpf_sanitize_info { bool mask_to_left; }; +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -6566,7 +6587,8 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; @@ -8763,14 +8785,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. + */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } -- cgit v1.2.3 From 973377ffe8148180b2651825b92ae91988141b05 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 May 2021 12:34:24 +0000 Subject: bpf, selftests: Adjust few selftest outcomes wrt unreachable code In almost all cases from test_verifier that have been changed in here, we've had an unreachable path with a load from a register which has an invalid address on purpose. This was basically to make sure that we never walk this path and to have the verifier complain if it would otherwise. Change it to match on the right error for unprivileged given we now test these paths under speculative execution. There's one case where we match on exact # of insns_processed. Due to the extra path, this will of course mismatch on unprivileged. Thus, restrict the test->insn_processed check to privileged-only. In one other case, we result in a 'pointer comparison prohibited' error. This is similarly due to verifying an 'invalid' branch where we end up with a value pointer on one side of the comparison. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 2 +- tools/testing/selftests/bpf/verifier/and.c | 2 ++ tools/testing/selftests/bpf/verifier/bounds.c | 14 ++++++++++++++ tools/testing/selftests/bpf/verifier/dead_code.c | 2 ++ tools/testing/selftests/bpf/verifier/jmp32.c | 22 ++++++++++++++++++++++ tools/testing/selftests/bpf/verifier/jset.c | 10 ++++++---- tools/testing/selftests/bpf/verifier/unpriv.c | 2 ++ .../selftests/bpf/verifier/value_ptr_arith.c | 7 ++++--- 8 files changed, 53 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1512092e1e68..3a9e332c5e36 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1147,7 +1147,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } } - if (test->insn_processed) { + if (!unpriv && test->insn_processed) { uint32_t insn_processed; char *proc; diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index ca8fdb1b3f01..7d7ebee5cc7a 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -61,6 +61,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 0 }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 8a1caf46ffbc..e061e8799ce2 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -508,6 +508,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT }, { @@ -528,6 +530,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT }, { @@ -569,6 +573,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -589,6 +595,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -609,6 +617,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -674,6 +684,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -695,6 +707,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 17fe33a75034..2c8935b3e65d 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -8,6 +8,8 @@ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, }, diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index bd5cae4a7f73..1c857b2fbdf0 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -87,6 +87,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -150,6 +152,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -213,6 +217,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -280,6 +286,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -348,6 +356,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -416,6 +426,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -484,6 +496,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -552,6 +566,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -620,6 +636,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -688,6 +706,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -756,6 +776,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c index 8dcd4e0383d5..11fc68da735e 100644 --- a/tools/testing/selftests/bpf/verifier/jset.c +++ b/tools/testing/selftests/bpf/verifier/jset.c @@ -82,8 +82,8 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .retval_unpriv = 1, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .retval = 1, .result = ACCEPT, }, @@ -141,7 +141,8 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -162,6 +163,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index bd436df5cc32..111801aea5e3 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -420,6 +420,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R7 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 0, }, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index 7ae2859d495c..a3e593ddfafc 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -120,7 +120,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", + .errstr_unpriv = "R2 pointer comparison prohibited", .retval = 0, }, { @@ -159,7 +159,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), // fake-dead code; targeted from branch A to - // prevent dead code sanitization + // prevent dead code sanitization, rejected + // via branch B however BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -167,7 +168,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", + .errstr_unpriv = "R0 invalid mem access 'inv'", .retval = 0, }, { -- cgit v1.2.3 From 2214fb53006e6cfa6371b706070cb99794c68c3b Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Mon, 14 Jun 2021 15:03:25 -0600 Subject: net: mhi_net: Update the transmit handler prototype Update the function prototype of mhi_ndo_xmit to match ndo_start_xmit. This otherwise leads to run time failures when CFI is enabled in kernel. Fixes: 3ffec6a14f24 ("net: Add mhi-net driver") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/mhi/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c index 0d8293a47a56..b806f2f8f859 100644 --- a/drivers/net/mhi/net.c +++ b/drivers/net/mhi/net.c @@ -49,7 +49,7 @@ static int mhi_ndo_stop(struct net_device *ndev) return 0; } -static int mhi_ndo_xmit(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t mhi_ndo_xmit(struct sk_buff *skb, struct net_device *ndev) { struct mhi_net_dev *mhi_netdev = netdev_priv(ndev); const struct mhi_net_proto *proto = mhi_netdev->proto; -- cgit v1.2.3 From 475b92f932168a78da8109acd10bfb7578b8f2bb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Jun 2021 15:24:05 -0700 Subject: ptp: improve max_adj check against unreasonable values Scaled PPM conversion to PPB may (on 64bit systems) result in a value larger than s32 can hold (freq/scaled_ppm is a long). This means the kernel will not correctly reject unreasonably high ->freq values (e.g. > 4294967295ppb, 281474976645 scaled PPM). The conversion is equivalent to a division by ~66 (65.536), so the value of ppb is always smaller than ppm, but not small enough to assume narrowing the type from long -> s32 is okay. Note that reasonable user space (e.g. ptp4l) will not use such high values, anyway, 4289046510ppb ~= 4.3x, so the fix is somewhat pedantic. Fixes: d39a743511cd ("ptp: validate the requested frequency adjustment.") Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.") Signed-off-by: Jakub Kicinski Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 6 +++--- include/linux/ptp_clock_kernel.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 03a246e60fd9..21c4c34c52d8 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -63,7 +63,7 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue, spin_unlock_irqrestore(&queue->lock, flags); } -s32 scaled_ppm_to_ppb(long ppm) +long scaled_ppm_to_ppb(long ppm) { /* * The 'freq' field in the 'struct timex' is in parts per @@ -80,7 +80,7 @@ s32 scaled_ppm_to_ppb(long ppm) s64 ppb = 1 + ppm; ppb *= 125; ppb >>= 13; - return (s32) ppb; + return (long) ppb; } EXPORT_SYMBOL(scaled_ppm_to_ppb); @@ -138,7 +138,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx) delta = ktime_to_ns(kt); err = ops->adjtime(ops, delta); } else if (tx->modes & ADJ_FREQUENCY) { - s32 ppb = scaled_ppm_to_ppb(tx->freq); + long ppb = scaled_ppm_to_ppb(tx->freq); if (ppb > ops->max_adj || ppb < -ops->max_adj) return -ERANGE; if (ops->adjfine) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d47fd33b228..51d7f1b8b32a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -235,7 +235,7 @@ extern int ptp_clock_index(struct ptp_clock *ptp); * @ppm: Parts per million, but with a 16 bit binary fractional field */ -extern s32 scaled_ppm_to_ppb(long ppm); +extern long scaled_ppm_to_ppb(long ppm); /** * ptp_find_pin() - obtain the pin index of a given auxiliary function -- cgit v1.2.3 From e34492dea68d4f09e9989e518fc76cd41909d707 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 15 Jun 2021 07:52:43 +0800 Subject: net: inline function get_net_ns_by_fd if NET_NS is disabled The function get_net_ns_by_fd() could be inlined when NET_NS is not enabled. Signed-off-by: Changbin Du Signed-off-by: David S. Miller --- include/net/net_namespace.h | 7 ++++++- net/core/net_namespace.c | 8 +------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 6412d7833d97..bdc0459a595e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -186,6 +186,7 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); +struct net *get_net_ns_by_fd(int fd); #else /* CONFIG_NET_NS */ #include #include @@ -210,13 +211,17 @@ static inline struct ns_common *get_net_ns(struct ns_common *ns) { return ERR_PTR(-EINVAL); } + +static inline struct net *get_net_ns_by_fd(int fd) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_NET_NS */ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); -struct net *get_net_ns_by_fd(int fd); #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cc8dafb25d61..9b5a767eddd5 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -672,14 +672,8 @@ struct net *get_net_ns_by_fd(int fd) fput(file); return net; } - -#else -struct net *get_net_ns_by_fd(int fd) -{ - return ERR_PTR(-EINVAL); -} -#endif EXPORT_SYMBOL_GPL(get_net_ns_by_fd); +#endif struct net *get_net_ns_by_pid(pid_t pid) { -- cgit v1.2.3 From c1a3d4067309451e68c33dbd356032549cc0bd8e Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 15 Jun 2021 01:05:49 -0700 Subject: net: cdc_ncm: switch to eth%d interface naming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is meant to make the host side cdc_ncm interface consistently named just like the older CDC protocols: cdc_ether & cdc_ecm (and even rndis_host), which all use 'FLAG_ETHER | FLAG_POINTTOPOINT'. include/linux/usb/usbnet.h: #define FLAG_ETHER 0x0020 /* maybe use "eth%d" names */ #define FLAG_WLAN 0x0080 /* use "wlan%d" names */ #define FLAG_WWAN 0x0400 /* use "wwan%d" names */ #define FLAG_POINTTOPOINT 0x1000 /* possibly use "usb%d" names */ drivers/net/usb/usbnet.c @ line 1711: strcpy (net->name, "usb%d"); ... // heuristic: "usb%d" for links we know are two-host, // else "eth%d" when there's reasonable doubt. userspace // can rename the link if it knows better. if ((dev->driver_info->flags & FLAG_ETHER) != 0 && ((dev->driver_info->flags & FLAG_POINTTOPOINT) == 0 || (net->dev_addr [0] & 0x02) == 0)) strcpy (net->name, "eth%d"); /* WLAN devices should always be named "wlan%d" */ if ((dev->driver_info->flags & FLAG_WLAN) != 0) strcpy(net->name, "wlan%d"); /* WWAN devices should always be named "wwan%d" */ if ((dev->driver_info->flags & FLAG_WWAN) != 0) strcpy(net->name, "wwan%d"); So by using ETHER | POINTTOPOINT the interface naming is either usb%d or eth%d based on the global uniqueness of the mac address of the device. Without this 2.5gbps ethernet dongles which all seem to use the cdc_ncm driver end up being called usb%d instead of eth%d even though they're definitely not two-host. (All 1gbps & 5gbps ethernet usb dongles I've tested don't hit this problem due to use of different drivers, primarily r8152 and aqc111) Fixes tag is based purely on git blame, and is really just here to make sure this hits LTS branches newer than v4.5. Cc: Lorenzo Colitti Fixes: 4d06dd537f95 ("cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind") Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index b04055fd1b79..df0d1837e4ed 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1880,7 +1880,7 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_LINK_INTR, + | FLAG_LINK_INTR | FLAG_ETHER, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, -- cgit v1.2.3 From 057d49334c02a79af81c30a8d240e641bd6f1741 Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Tue, 15 Jun 2021 12:01:51 +0200 Subject: qmi_wwan: Do not call netif_rx from rx_fixup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the QMI_WWAN_FLAG_PASS_THROUGH is set, netif_rx() is called from qmi_wwan_rx_fixup(). When the call to netif_rx() is successful (which is most of the time), usbnet_skb_return() is called (from rx_process()). usbnet_skb_return() will then call netif_rx() a second time for the same skb. Simplify the code and avoid the redundant netif_rx() call by changing qmi_wwan_rx_fixup() to always return 1 when QMI_WWAN_FLAG_PASS_THROUGH is set. We then leave it up to the existing infrastructure to call netif_rx(). Suggested-by: Bjørn Mork Signed-off-by: Kristian Evensen Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 6700f1970b24..bc55ec739af9 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -575,7 +575,7 @@ static int qmi_wwan_rx_fixup(struct usbnet *dev, struct sk_buff *skb) if (info->flags & QMI_WWAN_FLAG_PASS_THROUGH) { skb->protocol = htons(ETH_P_MAP); - return (netif_rx(skb) == NET_RX_SUCCESS); + return 1; } switch (skb->data[0] & 0xf0) { -- cgit v1.2.3 From 7ea6cd16f1599c1eac6018751eadbc5fc736b99a Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Tue, 15 Jun 2021 22:42:57 +0200 Subject: lantiq: net: fix duplicated skb in rx descriptor ring The previous commit didn't fix the bug properly. By mistake, it replaces the pointer of the next skb in the descriptor ring instead of the current one. As a result, the two descriptors are assigned the same SKB. The error is seen during the iperf test when skb_put tries to insert a second packet and exceeds the available buffer. Fixes: c7718ee96dbc ("net: lantiq: fix memory corruption in RX ring ") Signed-off-by: Aleksander Jan Bajkowski Signed-off-by: David S. Miller --- drivers/net/ethernet/lantiq_xrx200.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/lantiq_xrx200.c b/drivers/net/ethernet/lantiq_xrx200.c index 0e10d8aeffe1..21ef2f128070 100644 --- a/drivers/net/ethernet/lantiq_xrx200.c +++ b/drivers/net/ethernet/lantiq_xrx200.c @@ -154,6 +154,7 @@ static int xrx200_close(struct net_device *net_dev) static int xrx200_alloc_skb(struct xrx200_chan *ch) { + struct sk_buff *skb = ch->skb[ch->dma.desc]; dma_addr_t mapping; int ret = 0; @@ -168,6 +169,7 @@ static int xrx200_alloc_skb(struct xrx200_chan *ch) XRX200_DMA_DATA_LEN, DMA_FROM_DEVICE); if (unlikely(dma_mapping_error(ch->priv->dev, mapping))) { dev_kfree_skb_any(ch->skb[ch->dma.desc]); + ch->skb[ch->dma.desc] = skb; ret = -ENOMEM; goto skip; } @@ -198,7 +200,6 @@ static int xrx200_hw_receive(struct xrx200_chan *ch) ch->dma.desc %= LTQ_DESC_NUM; if (ret) { - ch->skb[ch->dma.desc] = skb; net_dev->stats.rx_dropped++; netdev_err(net_dev, "failed to allocate new rx buffer\n"); return ret; -- cgit v1.2.3 From 2030043e616cab40f510299f09b636285e0a3678 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 21 May 2021 13:57:20 +0200 Subject: can: j1939: fix Use-after-Free, hold skb ref while in use This patch fixes a Use-after-Free found by the syzbot. The problem is that a skb is taken from the per-session skb queue, without incrementing the ref count. This leads to a Use-after-Free if the skb is taken concurrently from the session queue due to a CTS. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/r/20210521115720.7533-1-o.rempel@pengutronix.de Cc: Hillf Danton Cc: linux-stable Reported-by: syzbot+220c1a29987a9a490903@syzkaller.appspotmail.com Reported-by: syzbot+45199c1b73b4013525cf@syzkaller.appspotmail.com Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 54 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index e09d087ba240..c3946c355882 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -330,6 +330,9 @@ static void j1939_session_skb_drop_old(struct j1939_session *session) if ((do_skcb->offset + do_skb->len) < offset_start) { __skb_unlink(do_skb, &session->skb_queue); + /* drop ref taken in j1939_session_skb_queue() */ + skb_unref(do_skb); + kfree_skb(do_skb); } spin_unlock_irqrestore(&session->skb_queue.lock, flags); @@ -349,12 +352,13 @@ void j1939_session_skb_queue(struct j1939_session *session, skcb->flags |= J1939_ECU_LOCAL_SRC; + skb_get(skb); skb_queue_tail(&session->skb_queue, skb); } static struct -sk_buff *j1939_session_skb_find_by_offset(struct j1939_session *session, - unsigned int offset_start) +sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session, + unsigned int offset_start) { struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *do_skcb; @@ -371,6 +375,10 @@ sk_buff *j1939_session_skb_find_by_offset(struct j1939_session *session, skb = do_skb; } } + + if (skb) + skb_get(skb); + spin_unlock_irqrestore(&session->skb_queue.lock, flags); if (!skb) @@ -381,12 +389,12 @@ sk_buff *j1939_session_skb_find_by_offset(struct j1939_session *session, return skb; } -static struct sk_buff *j1939_session_skb_find(struct j1939_session *session) +static struct sk_buff *j1939_session_skb_get(struct j1939_session *session) { unsigned int offset_start; offset_start = session->pkt.dpo * 7; - return j1939_session_skb_find_by_offset(session, offset_start); + return j1939_session_skb_get_by_offset(session, offset_start); } /* see if we are receiver @@ -776,7 +784,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) int ret = 0; u8 dat[8]; - se_skb = j1939_session_skb_find_by_offset(session, session->pkt.tx * 7); + se_skb = j1939_session_skb_get_by_offset(session, session->pkt.tx * 7); if (!se_skb) return -ENOBUFS; @@ -801,7 +809,8 @@ static int j1939_session_tx_dat(struct j1939_session *session) netdev_err_once(priv->ndev, "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n", __func__, session, skcb->offset, se_skb->len , session->pkt.tx); - return -EOVERFLOW; + ret = -EOVERFLOW; + goto out_free; } if (!len) { @@ -835,6 +844,12 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (pkt_done) j1939_tp_set_rxtimeout(session, 250); + out_free: + if (ret) + kfree_skb(se_skb); + else + consume_skb(se_skb); + return ret; } @@ -1007,7 +1022,7 @@ static int j1939_xtp_txnext_receiver(struct j1939_session *session) static int j1939_simple_txnext(struct j1939_session *session) { struct j1939_priv *priv = session->priv; - struct sk_buff *se_skb = j1939_session_skb_find(session); + struct sk_buff *se_skb = j1939_session_skb_get(session); struct sk_buff *skb; int ret; @@ -1015,8 +1030,10 @@ static int j1939_simple_txnext(struct j1939_session *session) return 0; skb = skb_clone(se_skb, GFP_ATOMIC); - if (!skb) - return -ENOMEM; + if (!skb) { + ret = -ENOMEM; + goto out_free; + } can_skb_set_owner(skb, se_skb->sk); @@ -1024,12 +1041,18 @@ static int j1939_simple_txnext(struct j1939_session *session) ret = j1939_send_one(priv, skb); if (ret) - return ret; + goto out_free; j1939_sk_errqueue(session, J1939_ERRQUEUE_SCHED); j1939_sk_queue_activate_next(session); - return 0; + out_free: + if (ret) + kfree_skb(se_skb); + else + consume_skb(se_skb); + + return ret; } static bool j1939_session_deactivate_locked(struct j1939_session *session) @@ -1170,9 +1193,10 @@ static void j1939_session_completed(struct j1939_session *session) struct sk_buff *skb; if (!session->transmission) { - skb = j1939_session_skb_find(session); + skb = j1939_session_skb_get(session); /* distribute among j1939 receivers */ j1939_sk_recv(session->priv, skb); + consume_skb(skb); } j1939_session_deactivate_activate_next(session); @@ -1744,7 +1768,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, { struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *skcb; - struct sk_buff *se_skb; + struct sk_buff *se_skb = NULL; const u8 *dat; u8 *tpdat; int offset; @@ -1786,7 +1810,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, goto out_session_cancel; } - se_skb = j1939_session_skb_find_by_offset(session, packet * 7); + se_skb = j1939_session_skb_get_by_offset(session, packet * 7); if (!se_skb) { netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__, session); @@ -1848,11 +1872,13 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, j1939_tp_set_rxtimeout(session, 250); } session->last_cmd = 0xff; + consume_skb(se_skb); j1939_session_put(session); return; out_session_cancel: + kfree_skb(se_skb); j1939_session_timers_cancel(session); j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); j1939_session_put(session); -- cgit v1.2.3 From 8d0caedb759683041d9db82069937525999ada53 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 5 Jun 2021 19:26:35 +0900 Subject: can: bcm/raw/isotp: use per module netdevice notifier syzbot is reporting hung task at register_netdevice_notifier() [1] and unregister_netdevice_notifier() [2], for cleanup_net() might perform time consuming operations while CAN driver's raw/bcm/isotp modules are calling {register,unregister}_netdevice_notifier() on each socket. Change raw/bcm/isotp modules to call register_netdevice_notifier() from module's __init function and call unregister_netdevice_notifier() from module's __exit function, as with gw/j1939 modules are doing. Link: https://syzkaller.appspot.com/bug?id=391b9498827788b3cc6830226d4ff5be87107c30 [1] Link: https://syzkaller.appspot.com/bug?id=1724d278c83ca6e6df100a2e320c10d991cf2bce [2] Link: https://lore.kernel.org/r/54a5f451-05ed-f977-8534-79e7aa2bcc8f@i-love.sakura.ne.jp Cc: linux-stable Reported-by: syzbot Reported-by: syzbot Reviewed-by: Kirill Tkhai Tested-by: syzbot Tested-by: Oliver Hartkopp Signed-off-by: Tetsuo Handa Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 59 ++++++++++++++++++++++++++++++++++++++++++------------ net/can/isotp.c | 61 ++++++++++++++++++++++++++++++++++++++++++++------------ net/can/raw.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 142 insertions(+), 40 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index 909b9e684e04..f00176b2a6c3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -125,7 +125,7 @@ struct bcm_sock { struct sock sk; int bound; int ifindex; - struct notifier_block notifier; + struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; @@ -133,6 +133,10 @@ struct bcm_sock { char procname [32]; /* inode number in decimal with \0 */ }; +static LIST_HEAD(bcm_notifier_list); +static DEFINE_SPINLOCK(bcm_notifier_lock); +static struct bcm_sock *bcm_busy_notifier; + static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; @@ -1378,20 +1382,15 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* * notification handler for netdevice status changes */ -static int bcm_notifier(struct notifier_block *nb, unsigned long msg, - void *ptr) +static void bcm_notify(struct bcm_sock *bo, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; switch (msg) { @@ -1426,7 +1425,28 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg, sk->sk_error_report(sk); } } +} +static int bcm_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + + spin_lock(&bcm_notifier_lock); + list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { + spin_unlock(&bcm_notifier_lock); + bcm_notify(bcm_busy_notifier, msg, dev); + spin_lock(&bcm_notifier_lock); + } + bcm_busy_notifier = NULL; + spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } @@ -1446,9 +1466,9 @@ static int bcm_init(struct sock *sk) INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ - bo->notifier.notifier_call = bcm_notifier; - - register_netdevice_notifier(&bo->notifier); + spin_lock(&bcm_notifier_lock); + list_add_tail(&bo->notifier, &bcm_notifier_list); + spin_unlock(&bcm_notifier_lock); return 0; } @@ -1471,7 +1491,14 @@ static int bcm_release(struct socket *sock) /* remove bcm_ops, timer, rx_unregister(), etc. */ - unregister_netdevice_notifier(&bo->notifier); + spin_lock(&bcm_notifier_lock); + while (bcm_busy_notifier == bo) { + spin_unlock(&bcm_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&bcm_notifier_lock); + } + list_del(&bo->notifier); + spin_unlock(&bcm_notifier_lock); lock_sock(sk); @@ -1692,6 +1719,10 @@ static struct pernet_operations canbcm_pernet_ops __read_mostly = { .exit = canbcm_pernet_exit, }; +static struct notifier_block canbcm_notifier = { + .notifier_call = bcm_notifier +}; + static int __init bcm_module_init(void) { int err; @@ -1705,12 +1736,14 @@ static int __init bcm_module_init(void) } register_pernet_subsys(&canbcm_pernet_ops); + register_netdevice_notifier(&canbcm_notifier); return 0; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); + unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } diff --git a/net/can/isotp.c b/net/can/isotp.c index 253b24417c8e..be6183f8ca11 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -143,10 +143,14 @@ struct isotp_sock { u32 force_tx_stmin; u32 force_rx_stmin; struct tpcon rx, tx; - struct notifier_block notifier; + struct list_head notifier; wait_queue_head_t wait; }; +static LIST_HEAD(isotp_notifier_list); +static DEFINE_SPINLOCK(isotp_notifier_lock); +static struct isotp_sock *isotp_busy_notifier; + static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; @@ -1013,7 +1017,14 @@ static int isotp_release(struct socket *sock) /* wait for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); - unregister_netdevice_notifier(&so->notifier); + spin_lock(&isotp_notifier_lock); + while (isotp_busy_notifier == so) { + spin_unlock(&isotp_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&isotp_notifier_lock); + } + list_del(&so->notifier); + spin_unlock(&isotp_notifier_lock); lock_sock(sk); @@ -1317,21 +1328,16 @@ static int isotp_getsockopt(struct socket *sock, int level, int optname, return 0; } -static int isotp_notifier(struct notifier_block *nb, unsigned long msg, - void *ptr) +static void isotp_notify(struct isotp_sock *so, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct isotp_sock *so = container_of(nb, struct isotp_sock, notifier); struct sock *sk = &so->sk; if (!net_eq(dev_net(dev), sock_net(sk))) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; if (so->ifindex != dev->ifindex) - return NOTIFY_DONE; + return; switch (msg) { case NETDEV_UNREGISTER: @@ -1357,7 +1363,28 @@ static int isotp_notifier(struct notifier_block *nb, unsigned long msg, sk->sk_error_report(sk); break; } +} +static int isotp_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + + spin_lock(&isotp_notifier_lock); + list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) { + spin_unlock(&isotp_notifier_lock); + isotp_notify(isotp_busy_notifier, msg, dev); + spin_lock(&isotp_notifier_lock); + } + isotp_busy_notifier = NULL; + spin_unlock(&isotp_notifier_lock); return NOTIFY_DONE; } @@ -1394,8 +1421,9 @@ static int isotp_init(struct sock *sk) init_waitqueue_head(&so->wait); - so->notifier.notifier_call = isotp_notifier; - register_netdevice_notifier(&so->notifier); + spin_lock(&isotp_notifier_lock); + list_add_tail(&so->notifier, &isotp_notifier_list); + spin_unlock(&isotp_notifier_lock); return 0; } @@ -1442,6 +1470,10 @@ static const struct can_proto isotp_can_proto = { .prot = &isotp_proto, }; +static struct notifier_block canisotp_notifier = { + .notifier_call = isotp_notifier +}; + static __init int isotp_module_init(void) { int err; @@ -1451,6 +1483,8 @@ static __init int isotp_module_init(void) err = can_proto_register(&isotp_can_proto); if (err < 0) pr_err("can: registration of isotp protocol failed\n"); + else + register_netdevice_notifier(&canisotp_notifier); return err; } @@ -1458,6 +1492,7 @@ static __init int isotp_module_init(void) static __exit void isotp_module_exit(void) { can_proto_unregister(&isotp_can_proto); + unregister_netdevice_notifier(&canisotp_notifier); } module_init(isotp_module_init); diff --git a/net/can/raw.c b/net/can/raw.c index 139d9471ddcf..ac96fc210025 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -83,7 +83,7 @@ struct raw_sock { struct sock sk; int bound; int ifindex; - struct notifier_block notifier; + struct list_head notifier; int loopback; int recv_own_msgs; int fd_frames; @@ -95,6 +95,10 @@ struct raw_sock { struct uniqframe __percpu *uniq; }; +static LIST_HEAD(raw_notifier_list); +static DEFINE_SPINLOCK(raw_notifier_lock); +static struct raw_sock *raw_busy_notifier; + /* Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. @@ -263,21 +267,16 @@ static int raw_enable_allfilters(struct net *net, struct net_device *dev, return err; } -static int raw_notifier(struct notifier_block *nb, - unsigned long msg, void *ptr) +static void raw_notify(struct raw_sock *ro, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; if (!net_eq(dev_net(dev), sock_net(sk))) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; if (ro->ifindex != dev->ifindex) - return NOTIFY_DONE; + return; switch (msg) { case NETDEV_UNREGISTER: @@ -305,7 +304,28 @@ static int raw_notifier(struct notifier_block *nb, sk->sk_error_report(sk); break; } +} + +static int raw_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + spin_lock(&raw_notifier_lock); + list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) { + spin_unlock(&raw_notifier_lock); + raw_notify(raw_busy_notifier, msg, dev); + spin_lock(&raw_notifier_lock); + } + raw_busy_notifier = NULL; + spin_unlock(&raw_notifier_lock); return NOTIFY_DONE; } @@ -334,9 +354,9 @@ static int raw_init(struct sock *sk) return -ENOMEM; /* set notifier */ - ro->notifier.notifier_call = raw_notifier; - - register_netdevice_notifier(&ro->notifier); + spin_lock(&raw_notifier_lock); + list_add_tail(&ro->notifier, &raw_notifier_list); + spin_unlock(&raw_notifier_lock); return 0; } @@ -351,7 +371,14 @@ static int raw_release(struct socket *sock) ro = raw_sk(sk); - unregister_netdevice_notifier(&ro->notifier); + spin_lock(&raw_notifier_lock); + while (raw_busy_notifier == ro) { + spin_unlock(&raw_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&raw_notifier_lock); + } + list_del(&ro->notifier); + spin_unlock(&raw_notifier_lock); lock_sock(sk); @@ -889,6 +916,10 @@ static const struct can_proto raw_can_proto = { .prot = &raw_proto, }; +static struct notifier_block canraw_notifier = { + .notifier_call = raw_notifier +}; + static __init int raw_module_init(void) { int err; @@ -898,6 +929,8 @@ static __init int raw_module_init(void) err = can_proto_register(&raw_can_proto); if (err < 0) pr_err("can: registration of raw protocol failed\n"); + else + register_netdevice_notifier(&canraw_notifier); return err; } @@ -905,6 +938,7 @@ static __init int raw_module_init(void) static __exit void raw_module_exit(void) { can_proto_unregister(&raw_can_proto); + unregister_netdevice_notifier(&canraw_notifier); } module_init(raw_module_init); -- cgit v1.2.3 From 5e87ddbe3942e27e939bdc02deb8579b0cbd8ecc Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Sat, 12 Jun 2021 22:18:54 +0200 Subject: can: bcm: fix infoleak in struct bcm_msg_head On 64-bit systems, struct bcm_msg_head has an added padding of 4 bytes between struct members count and ival1. Even though all struct members are initialized, the 4-byte hole will contain data from the kernel stack. This patch zeroes out struct bcm_msg_head before usage, preventing infoleaks to userspace. Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Link: https://lore.kernel.org/r/trinity-7c1b2e82-e34f-4885-8060-2cd7a13769ce-1623532166177@3c-app-gmx-bs52 Cc: linux-stable Signed-off-by: Norbert Slusarek Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/can/bcm.c b/net/can/bcm.c index f00176b2a6c3..f3e4d9528fa3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -406,6 +406,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ + memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; @@ -443,6 +444,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) /* this element is not throttled anymore */ data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); + memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; @@ -564,6 +566,7 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) } /* create notification to user */ + memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; -- cgit v1.2.3 From 91c02557174be7f72e46ed7311e3bea1939840b0 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 10 Jun 2021 00:58:33 +0300 Subject: can: mcba_usb: fix memory leak in mcba_usb Syzbot reported memory leak in SocketCAN driver for Microchip CAN BUS Analyzer Tool. The problem was in unfreed usb_coherent. In mcba_usb_start() 20 coherent buffers are allocated and there is nothing, that frees them: 1) In callback function the urb is resubmitted and that's all 2) In disconnect function urbs are simply killed, but URB_FREE_BUFFER is not set (see mcba_usb_start) and this flag cannot be used with coherent buffers. Fail log: | [ 1354.053291][ T8413] mcba_usb 1-1:0.0 can0: device disconnected | [ 1367.059384][ T8420] kmemleak: 20 new suspected memory leaks (see /sys/kernel/debug/kmem) So, all allocated buffers should be freed with usb_free_coherent() explicitly NOTE: The same pattern for allocating and freeing coherent buffers is used in drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Link: https://lore.kernel.org/r/20210609215833.30393-1-paskripkin@gmail.com Cc: linux-stable Reported-and-tested-by: syzbot+57281c762a3922e14dfe@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/mcba_usb.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 029e77dfa773..a45865bd7254 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -82,6 +82,8 @@ struct mcba_priv { bool can_ka_first_pass; bool can_speed_check; atomic_t free_ctx_cnt; + void *rxbuf[MCBA_MAX_RX_URBS]; + dma_addr_t rxbuf_dma[MCBA_MAX_RX_URBS]; }; /* CAN frame */ @@ -633,6 +635,7 @@ static int mcba_usb_start(struct mcba_priv *priv) for (i = 0; i < MCBA_MAX_RX_URBS; i++) { struct urb *urb = NULL; u8 *buf; + dma_addr_t buf_dma; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_KERNEL); @@ -642,7 +645,7 @@ static int mcba_usb_start(struct mcba_priv *priv) } buf = usb_alloc_coherent(priv->udev, MCBA_USB_RX_BUFF_SIZE, - GFP_KERNEL, &urb->transfer_dma); + GFP_KERNEL, &buf_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); usb_free_urb(urb); @@ -661,11 +664,14 @@ static int mcba_usb_start(struct mcba_priv *priv) if (err) { usb_unanchor_urb(urb); usb_free_coherent(priv->udev, MCBA_USB_RX_BUFF_SIZE, - buf, urb->transfer_dma); + buf, buf_dma); usb_free_urb(urb); break; } + priv->rxbuf[i] = buf; + priv->rxbuf_dma[i] = buf_dma; + /* Drop reference, USB core will take care of freeing it */ usb_free_urb(urb); } @@ -708,7 +714,14 @@ static int mcba_usb_open(struct net_device *netdev) static void mcba_urb_unlink(struct mcba_priv *priv) { + int i; + usb_kill_anchored_urbs(&priv->rx_submitted); + + for (i = 0; i < MCBA_MAX_RX_URBS; ++i) + usb_free_coherent(priv->udev, MCBA_USB_RX_BUFF_SIZE, + priv->rxbuf[i], priv->rxbuf_dma[i]); + usb_kill_anchored_urbs(&priv->tx_submitted); } -- cgit v1.2.3 From 8f269102baf788aecfcbbc6313b6bceb54c9b990 Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Wed, 16 Jun 2021 17:10:24 +0800 Subject: net: stmmac: disable clocks in stmmac_remove_config_dt() Platform drivers may call stmmac_probe_config_dt() to parse dt, could call stmmac_remove_config_dt() in error handing after dt parsed, so need disable clocks in stmmac_remove_config_dt(). Go through all platforms drivers which use stmmac_probe_config_dt(), none of them disable clocks manually, so it's safe to disable them in stmmac_remove_config_dt(). Fixes: commit d2ed0a7755fe ("net: ethernet: stmmac: fix of-node and fixed-link-phydev leaks") Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 1e17a23d9118..a696ada013eb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -622,6 +622,8 @@ error_pclk_get: void stmmac_remove_config_dt(struct platform_device *pdev, struct plat_stmmacenet_data *plat) { + clk_disable_unprepare(plat->stmmac_clk); + clk_disable_unprepare(plat->pclk); of_node_put(plat->phy_node); of_node_put(plat->mdio_node); } -- cgit v1.2.3 From 56b786d86694e079d8aad9b314e015cd4ac02a3d Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 16 Jun 2021 10:48:33 +0800 Subject: net: usb: fix possible use-after-free in smsc75xx_bind The commit 46a8b29c6306 ("net: usb: fix memory leak in smsc75xx_bind") fails to clean up the work scheduled in smsc75xx_reset-> smsc75xx_set_multicast, which leads to use-after-free if the work is scheduled to start after the deallocation. In addition, this patch also removes a dangling pointer - dev->data[0]. This patch calls cancel_work_sync to cancel the scheduled work and set the dangling pointer to NULL. Fixes: 46a8b29c6306 ("net: usb: fix memory leak in smsc75xx_bind") Signed-off-by: Dongliang Mu Signed-off-by: David S. Miller --- drivers/net/usb/smsc75xx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index b286993da67c..13141dbfa3a8 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1483,7 +1483,7 @@ static int smsc75xx_bind(struct usbnet *dev, struct usb_interface *intf) ret = smsc75xx_wait_ready(dev, 0); if (ret < 0) { netdev_warn(dev->net, "device not ready in smsc75xx_bind\n"); - goto err; + goto free_pdata; } smsc75xx_init_mac_address(dev); @@ -1492,7 +1492,7 @@ static int smsc75xx_bind(struct usbnet *dev, struct usb_interface *intf) ret = smsc75xx_reset(dev); if (ret < 0) { netdev_warn(dev->net, "smsc75xx_reset error %d\n", ret); - goto err; + goto cancel_work; } dev->net->netdev_ops = &smsc75xx_netdev_ops; @@ -1503,8 +1503,11 @@ static int smsc75xx_bind(struct usbnet *dev, struct usb_interface *intf) dev->net->max_mtu = MAX_SINGLE_PACKET_SIZE; return 0; -err: +cancel_work: + cancel_work_sync(&pdata->set_multicast); +free_pdata: kfree(pdata); + dev->data[0] = 0; return ret; } @@ -1515,7 +1518,6 @@ static void smsc75xx_unbind(struct usbnet *dev, struct usb_interface *intf) cancel_work_sync(&pdata->set_multicast); netif_dbg(dev, ifdown, dev->net, "free pdata\n"); kfree(pdata); - pdata = NULL; dev->data[0] = 0; } } -- cgit v1.2.3 From cb3cefe3f3f8af27c6076ef7d1f00350f502055d Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Wed, 16 Jun 2021 17:14:25 +0800 Subject: net: fec_ptp: add clock rate zero check Add clock rate zero check to fix coverity issue of "divide by 0". Fixes: commit 85bd1798b24a ("net: fec: fix spin_lock dead lock") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_ptp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 1753807cbf97..7326a0612823 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -604,6 +604,10 @@ void fec_ptp_init(struct platform_device *pdev, int irq_idx) fep->ptp_caps.enable = fec_ptp_enable; fep->cycle_speed = clk_get_rate(fep->clk_ptp); + if (!fep->cycle_speed) { + fep->cycle_speed = NSEC_PER_SEC; + dev_err(&fep->pdev->dev, "clk_ptp clock rate is zero\n"); + } fep->ptp_inc = NSEC_PER_SEC / fep->cycle_speed; spin_lock_init(&fep->tmreg_lock); -- cgit v1.2.3 From d23765646e71b43ed2b809930411ba5c0aadee7b Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Wed, 16 Jun 2021 17:14:26 +0800 Subject: net: fec_ptp: fix issue caused by refactor the fec_devtype Commit da722186f654 ("net: fec: set GPR bit on suspend by DT configuration.") refactor the fec_devtype, need adjust ptp driver accordingly. Fixes: da722186f654 ("net: fec: set GPR bit on suspend by DT configuration.") Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_ptp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 7326a0612823..d71eac7e1924 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -215,15 +215,13 @@ static u64 fec_ptp_read(const struct cyclecounter *cc) { struct fec_enet_private *fep = container_of(cc, struct fec_enet_private, cc); - const struct platform_device_id *id_entry = - platform_get_device_id(fep->pdev); u32 tempval; tempval = readl(fep->hwp + FEC_ATIME_CTRL); tempval |= FEC_T_CTRL_CAPTURE; writel(tempval, fep->hwp + FEC_ATIME_CTRL); - if (id_entry->driver_data & FEC_QUIRK_BUG_CAPTURE) + if (fep->quirks & FEC_QUIRK_BUG_CAPTURE) udelay(1); return readl(fep->hwp + FEC_ATIME); -- cgit v1.2.3 From d8e2973029b8b2ce477b564824431f3385c77083 Mon Sep 17 00:00:00 2001 From: Chengyang Fan Date: Wed, 16 Jun 2021 17:59:25 +0800 Subject: net: ipv4: fix memory leak in ip_mc_add1_src BUG: memory leak unreferenced object 0xffff888101bc4c00 (size 32): comm "syz-executor527", pid 360, jiffies 4294807421 (age 19.329s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 01 00 00 00 00 00 00 00 ac 14 14 bb 00 00 02 00 ................ backtrace: [<00000000f17c5244>] kmalloc include/linux/slab.h:558 [inline] [<00000000f17c5244>] kzalloc include/linux/slab.h:688 [inline] [<00000000f17c5244>] ip_mc_add1_src net/ipv4/igmp.c:1971 [inline] [<00000000f17c5244>] ip_mc_add_src+0x95f/0xdb0 net/ipv4/igmp.c:2095 [<000000001cb99709>] ip_mc_source+0x84c/0xea0 net/ipv4/igmp.c:2416 [<0000000052cf19ed>] do_ip_setsockopt net/ipv4/ip_sockglue.c:1294 [inline] [<0000000052cf19ed>] ip_setsockopt+0x114b/0x30c0 net/ipv4/ip_sockglue.c:1423 [<00000000477edfbc>] raw_setsockopt+0x13d/0x170 net/ipv4/raw.c:857 [<00000000e75ca9bb>] __sys_setsockopt+0x158/0x270 net/socket.c:2117 [<00000000bdb993a8>] __do_sys_setsockopt net/socket.c:2128 [inline] [<00000000bdb993a8>] __se_sys_setsockopt net/socket.c:2125 [inline] [<00000000bdb993a8>] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2125 [<000000006a1ffdbd>] do_syscall_64+0x40/0x80 arch/x86/entry/common.c:47 [<00000000b11467c4>] entry_SYSCALL_64_after_hwframe+0x44/0xae In commit 24803f38a5c0 ("igmp: do not remove igmp souce list info when set link down"), the ip_mc_clear_src() in ip_mc_destroy_dev() was removed, because it was also called in igmpv3_clear_delrec(). Rough callgraph: inetdev_destroy -> ip_mc_destroy_dev -> igmpv3_clear_delrec -> ip_mc_clear_src -> RCU_INIT_POINTER(dev->ip_ptr, NULL) However, ip_mc_clear_src() called in igmpv3_clear_delrec() doesn't release in_dev->mc_list->sources. And RCU_INIT_POINTER() assigns the NULL to dev->ip_ptr. As a result, in_dev cannot be obtained through inetdev_by_index() and then in_dev->mc_list->sources cannot be released by ip_mc_del1_src() in the sock_close. Rough call sequence goes like: sock_close -> __sock_release -> inet_release -> ip_mc_drop_socket -> inetdev_by_index -> ip_mc_leave_src -> ip_mc_del_src -> ip_mc_del1_src So we still need to call ip_mc_clear_src() in ip_mc_destroy_dev() to free in_dev->mc_list->sources. Fixes: 24803f38a5c0 ("igmp: do not remove igmp souce list info ...") Reported-by: Hulk Robot Signed-off-by: Chengyang Fan Acked-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7b272bbed2b4..6b3c558a4f23 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1801,6 +1801,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; + ip_mc_clear_src(i); ip_ma_put(i); } } -- cgit v1.2.3 From c7d2ef5dd4b03ed0ee1d13bc0c55f9cf62d49bd6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2021 06:42:01 -0700 Subject: net/packet: annotate accesses to po->bind tpacket_snd(), packet_snd(), packet_getname() and packet_seq_show() can read po->num without holding a lock. This means other threads can change po->num at the same time. KCSAN complained about this known fact [1] Add READ_ONCE()/WRITE_ONCE() to address the issue. [1] BUG: KCSAN: data-race in packet_do_bind / packet_sendmsg write to 0xffff888131a0dcc0 of 2 bytes by task 24714 on cpu 0: packet_do_bind+0x3ab/0x7e0 net/packet/af_packet.c:3181 packet_bind+0xc3/0xd0 net/packet/af_packet.c:3255 __sys_bind+0x200/0x290 net/socket.c:1637 __do_sys_bind net/socket.c:1648 [inline] __se_sys_bind net/socket.c:1646 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1646 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888131a0dcc0 of 2 bytes by task 24719 on cpu 1: packet_snd net/packet/af_packet.c:2899 [inline] packet_sendmsg+0x317/0x3570 net/packet/af_packet.c:3040 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmsg+0x1ed/0x270 net/socket.c:2433 __do_sys_sendmsg net/socket.c:2442 [inline] __se_sys_sendmsg net/socket.c:2440 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2440 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000 -> 0x1200 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24719 Comm: syz-executor.5 Not tainted 5.13.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/packet/af_packet.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 74e6e45a8e84..e91a36bdd1ab 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2683,7 +2683,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); - proto = po->num; + proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2896,7 +2896,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); - proto = po->num; + proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -3171,7 +3171,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, /* prevents packet_notifier() from calling * register_prot_hook() */ - po->num = 0; + WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); dev_curr = po->prot_hook.dev; @@ -3181,7 +3181,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, } BUG_ON(po->running); - po->num = proto; + WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; if (unlikely(unlisted)) { @@ -3526,7 +3526,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; - sll->sll_protocol = po->num; + sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); @@ -4414,7 +4414,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, was_running = po->running; num = po->num; if (was_running) { - po->num = 0; + WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); @@ -4449,7 +4449,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, spin_lock(&po->bind_lock); if (was_running) { - po->num = num; + WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); @@ -4619,7 +4619,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) s, refcount_read(&s->sk_refcnt), s->sk_type, - ntohs(po->num), + ntohs(READ_ONCE(po->num)), po->ifindex, po->running, atomic_read(&s->sk_rmem_alloc), -- cgit v1.2.3 From e032f7c9c7cefffcfb79b9fc16c53011d2d9d11f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2021 06:42:02 -0700 Subject: net/packet: annotate accesses to po->ifindex Like prior patch, we need to annotate lockless accesses to po->ifindex For instance, packet_getname() is reading po->ifindex (twice) while another thread is able to change po->ifindex. KCSAN reported: BUG: KCSAN: data-race in packet_do_bind / packet_getname write to 0xffff888143ce3cbc of 4 bytes by task 25573 on cpu 1: packet_do_bind+0x420/0x7e0 net/packet/af_packet.c:3191 packet_bind+0xc3/0xd0 net/packet/af_packet.c:3255 __sys_bind+0x200/0x290 net/socket.c:1637 __do_sys_bind net/socket.c:1648 [inline] __se_sys_bind net/socket.c:1646 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1646 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888143ce3cbc of 4 bytes by task 25578 on cpu 0: packet_getname+0x5b/0x1a0 net/packet/af_packet.c:3525 __sys_getsockname+0x10e/0x1a0 net/socket.c:1887 __do_sys_getsockname net/socket.c:1902 [inline] __se_sys_getsockname net/socket.c:1899 [inline] __x64_sys_getsockname+0x3e/0x50 net/socket.c:1899 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25578 Comm: syz-executor.5 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/packet/af_packet.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e91a36bdd1ab..330ba68828e7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3187,11 +3187,11 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, if (unlikely(unlisted)) { dev_put(dev); po->prot_hook.dev = NULL; - po->ifindex = -1; + WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { po->prot_hook.dev = dev; - po->ifindex = dev ? dev->ifindex : 0; + WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } } @@ -3505,7 +3505,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); + dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); @@ -3520,16 +3520,18 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); + int ifindex; if (peer) return -EOPNOTSUPP; + ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; - sll->sll_ifindex = po->ifindex; + sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; @@ -4108,7 +4110,7 @@ static int packet_notifier(struct notifier_block *this, } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); - po->ifindex = -1; + WRITE_ONCE(po->ifindex, -1); if (po->prot_hook.dev) dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; @@ -4620,7 +4622,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), - po->ifindex, + READ_ONCE(po->ifindex), po->running, atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), -- cgit v1.2.3 From 0fd158b89b50b3a31c97a639ff496e1c59686e97 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 16 Jun 2021 16:03:21 +0200 Subject: selftests: net: veth: make test compatible with dash veth.sh is a shell script that uses /bin/sh; some distro (Ubuntu for example) use dash as /bin/sh and in this case the test reports the following error: # ./veth.sh: 21: local: -r: bad variable name # ./veth.sh: 21: local: -r: bad variable name This happens because dash doesn't support the option "-r" with local. Moreover, in case of missing bpf object, the script is exiting -1, that is an illegal number for dash: exit: Illegal number: -1 Change the script to be compatible both with bash and dash and prevent the errors above. Signed-off-by: Andrea Righi Signed-off-by: David S. Miller --- tools/testing/selftests/net/veth.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 2fedc0781ce8..11d7cdb898c0 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -18,7 +18,8 @@ ret=0 cleanup() { local ns - local -r jobs="$(jobs -p)" + local jobs + readonly jobs="$(jobs -p)" [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null rm -f $STATS @@ -108,7 +109,7 @@ chk_gro() { if [ ! -f ../bpf/xdp_dummy.o ]; then echo "Missing xdp_dummy helper. Build bpf selftest first" - exit -1 + exit 1 fi create_ns -- cgit v1.2.3 From a494bd642d9120648b06bb7d28ce6d05f55a7819 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2021 07:47:15 -0700 Subject: net/af_unix: fix a data-race in unix_dgram_sendmsg / unix_release_sock While unix_may_send(sk, osk) is called while osk is locked, it appears unix_release_sock() can overwrite unix_peer() after this lock has been released, making KCSAN unhappy. Changing unix_release_sock() to access/change unix_peer() before lock is released should fix this issue. BUG: KCSAN: data-race in unix_dgram_sendmsg / unix_release_sock write to 0xffff88810465a338 of 8 bytes by task 20852 on cpu 1: unix_release_sock+0x4ed/0x6e0 net/unix/af_unix.c:558 unix_release+0x2f/0x50 net/unix/af_unix.c:859 __sock_release net/socket.c:599 [inline] sock_close+0x6c/0x150 net/socket.c:1258 __fput+0x25b/0x4e0 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0xae/0x130 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:175 [inline] exit_to_user_mode_prepare+0x156/0x190 kernel/entry/common.c:209 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:302 do_syscall_64+0x56/0x90 arch/x86/entry/common.c:57 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88810465a338 of 8 bytes by task 20888 on cpu 0: unix_may_send net/unix/af_unix.c:189 [inline] unix_dgram_sendmsg+0x923/0x1610 net/unix/af_unix.c:1712 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffff888167905400 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 20888 Comm: syz-executor.0 Not tainted 5.13.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5a31307ceb76..5d1192ceb139 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -535,12 +535,14 @@ static void unix_release_sock(struct sock *sk, int embrion) u->path.mnt = NULL; state = sk->sk_state; sk->sk_state = TCP_CLOSE; + + skpair = unix_peer(sk); + unix_peer(sk) = NULL; + unix_state_unlock(sk); wake_up_interruptible_all(&u->peer_wait); - skpair = unix_peer(sk); - if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); @@ -555,7 +557,6 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ - unix_peer(sk) = NULL; } /* Try to flush out this socket. Throw out buffers at least */ -- cgit v1.2.3 From 1b29df0e2e802cb15a5196c936f494161ec97502 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 16 Jun 2021 16:57:27 +0200 Subject: selftests: net: use bash to run udpgro_fwd test case udpgro_fwd.sh contains many bash specific operators ("[[", "local -r"), but it's using /bin/sh; in some distro /bin/sh is mapped to /bin/dash, that doesn't support such operators. Force the test to use /bin/bash explicitly and prevent false positive test failures. Signed-off-by: Andrea Righi Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro_fwd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index a8fa64136282..7f26591f236b 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 readonly BASE="ns-$(mktemp -u XXXXXX)" -- cgit v1.2.3 From 99718abdc00e86e4f286dd836408e2834886c16e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Jun 2021 12:53:03 -0700 Subject: r8152: Avoid memcpy() over-reading of ETH_SS_STATS In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally reading across neighboring array fields. The memcpy() is copying the entire structure, not just the first array. Adjust the source argument so the compiler can do appropriate bounds checking. Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index f6abb2fbf972..e25bfb7021ed 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -8678,7 +8678,7 @@ static void rtl8152_get_strings(struct net_device *dev, u32 stringset, u8 *data) { switch (stringset) { case ETH_SS_STATS: - memcpy(data, *rtl8152_gstrings, sizeof(rtl8152_gstrings)); + memcpy(data, rtl8152_gstrings, sizeof(rtl8152_gstrings)); break; } } -- cgit v1.2.3 From 224004fbb033600715dbd626bceec10bfd9c58bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Jun 2021 12:53:33 -0700 Subject: sh_eth: Avoid memcpy() over-reading of ETH_SS_STATS In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally reading across neighboring array fields. The memcpy() is copying the entire structure, not just the first array. Adjust the source argument so the compiler can do appropriate bounds checking. Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index c5b154868c1f..713d3629b4c1 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2287,7 +2287,7 @@ static void sh_eth_get_strings(struct net_device *ndev, u32 stringset, u8 *data) { switch (stringset) { case ETH_SS_STATS: - memcpy(data, *sh_eth_gstrings_stats, + memcpy(data, sh_eth_gstrings_stats, sizeof(sh_eth_gstrings_stats)); break; } -- cgit v1.2.3 From da5ac772cfe2a03058b0accfac03fad60c46c24d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Jun 2021 12:53:59 -0700 Subject: r8169: Avoid memcpy() over-reading of ETH_SS_STATS In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally reading across neighboring array fields. The memcpy() is copying the entire structure, not just the first array. Adjust the source argument so the compiler can do appropriate bounds checking. Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 2c89cde7da1e..2ee72dc431cd 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1671,7 +1671,7 @@ static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data) { switch(stringset) { case ETH_SS_STATS: - memcpy(data, *rtl8169_gstrings, sizeof(rtl8169_gstrings)); + memcpy(data, rtl8169_gstrings, sizeof(rtl8169_gstrings)); break; } } -- cgit v1.2.3 From 94a4b8414d3e91104873007b659252f855ee344a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 8 Mar 2021 15:41:55 +0200 Subject: net/mlx5: Fix error path for set HCA defaults In the case of the failure to execute mlx5_core_set_hca_defaults(), we used wrong goto label to execute error unwind flow. Fixes: 5bef709d76a2 ("net/mlx5: Enable host PF HCA after eswitch is initialized") Reviewed-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a1d67bd7fb43..0d0f63a27aba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1161,7 +1161,7 @@ static int mlx5_load(struct mlx5_core_dev *dev) err = mlx5_core_set_hca_defaults(dev); if (err) { mlx5_core_err(dev, "Failed to set hca defaults\n"); - goto err_sriov; + goto err_set_hca; } mlx5_vhca_event_start(dev); @@ -1194,6 +1194,7 @@ err_ec: mlx5_sf_hw_table_destroy(dev); err_vhca: mlx5_vhca_event_stop(dev); +err_set_hca: mlx5_cleanup_fs(dev); err_fs: mlx5_accel_tls_cleanup(dev); -- cgit v1.2.3 From 2058cc9c8041fde9c0bdd8e868c72b137cff8563 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 21 Mar 2021 19:57:14 +0200 Subject: net/mlx5: Check that driver was probed prior attaching the device The device can be requested to be attached despite being not probed. This situation is possible if devlink reload races with module removal, and the following kernel panic is an outcome of such race. mlx5_core 0000:00:09.0: firmware version: 4.7.9999 mlx5_core 0000:00:09.0: 0.000 Gb/s available PCIe bandwidth (8.0 GT/s PCIe x255 link) BUG: unable to handle page fault for address: fffffffffffffff0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 3218067 P4D 3218067 PUD 321a067 PMD 0 Oops: 0000 [#1] SMP KASAN NOPTI CPU: 7 PID: 250 Comm: devlink Not tainted 5.12.0-rc2+ #2836 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5_attach_device+0x80/0x280 [mlx5_core] Code: f8 48 c1 e8 03 42 80 3c 38 00 0f 85 80 01 00 00 48 8b 45 68 48 8d 78 f0 48 89 fe 48 c1 ee 03 42 80 3c 3e 00 0f 85 70 01 00 00 <48> 8b 40 f0 48 85 c0 74 0d 48 89 ef ff d0 85 c0 0f 85 84 05 0e 00 RSP: 0018:ffff8880129675f0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffffffff827407f1 RDX: 1ffff110011336cf RSI: 1ffffffffffffffe RDI: fffffffffffffff0 RBP: ffff888008e0c000 R08: 0000000000000008 R09: ffffffffa0662ee7 R10: fffffbfff40cc5dc R11: 0000000000000000 R12: ffff88800ea002e0 R13: ffffed1001d459f7 R14: ffffffffa05ef4f8 R15: dffffc0000000000 FS: 00007f51dfeaf740(0000) GS:ffff88806d5c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffffff0 CR3: 000000000bc82006 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_load_one+0x117/0x1d0 [mlx5_core] devlink_reload+0x2d5/0x520 ? devlink_remote_reload_actions_performed+0x30/0x30 ? mutex_trylock+0x24b/0x2d0 ? devlink_nl_cmd_reload+0x62b/0x1070 devlink_nl_cmd_reload+0x66d/0x1070 ? devlink_reload+0x520/0x520 ? devlink_nl_pre_doit+0x64/0x4d0 genl_family_rcv_msg_doit+0x1e9/0x2f0 ? mutex_lock_io_nested+0x1130/0x1130 ? genl_family_rcv_msg_attrs_parse.constprop.0+0x240/0x240 ? security_capable+0x51/0x90 genl_rcv_msg+0x27f/0x4a0 ? genl_get_cmd+0x3c0/0x3c0 ? lock_acquire+0x1a9/0x6d0 ? devlink_reload+0x520/0x520 ? lock_release+0x6c0/0x6c0 netlink_rcv_skb+0x11d/0x340 ? genl_get_cmd+0x3c0/0x3c0 ? netlink_ack+0x9f0/0x9f0 ? lock_release+0x1f9/0x6c0 genl_rcv+0x24/0x40 netlink_unicast+0x433/0x700 ? netlink_attachskb+0x730/0x730 ? _copy_from_iter_full+0x178/0x650 ? __alloc_skb+0x113/0x2b0 netlink_sendmsg+0x6f1/0xbd0 ? netlink_unicast+0x700/0x700 ? netlink_unicast+0x700/0x700 sock_sendmsg+0xb0/0xe0 __sys_sendto+0x193/0x240 ? __x64_sys_getpeername+0xb0/0xb0 ? copy_page_range+0x2300/0x2300 ? __up_read+0x1a1/0x7b0 ? do_user_addr_fault+0x219/0xdc0 __x64_sys_sendto+0xdd/0x1b0 ? syscall_enter_from_user_mode+0x1d/0x50 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f51dffb514a Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 76 c3 0f 1f 44 00 00 55 48 83 ec 30 44 89 4c RSP: 002b:00007ffcaef22e78 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f51dffb514a RDX: 0000000000000030 RSI: 000055750daf2440 RDI: 0000000000000003 RBP: 000055750daf2410 R08: 00007f51e0081200 R09: 000000000000000c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: mlx5_core(-) ptp pps_core ib_ipoib rdma_ucm rdma_cm iw_cm ib_cm ib_umad ib_uverbs ib_core [last unloaded: mlx5_ib] CR2: fffffffffffffff0 ---[ end trace 7789831bfe74fa42 ]--- Fixes: a925b5e309c9 ("net/mlx5: Register mlx5 devices to auxiliary virtual bus") Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index a9166cd85013..8de118adfb54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -320,6 +320,16 @@ int mlx5_attach_device(struct mlx5_core_dev *dev) } } else { adev = &priv->adev[i]->adev; + + /* Pay attention that this is not PCI driver that + * mlx5_core_dev is connected, but auxiliary driver. + * + * Here we can race of module unload with devlink + * reload, but we don't need to take extra lock because + * we are holding global mlx5_intf_mutex. + */ + if (!adev->dev.driver) + continue; adrv = to_auxiliary_drv(adev->dev.driver); if (adrv->resume) @@ -350,6 +360,10 @@ void mlx5_detach_device(struct mlx5_core_dev *dev) continue; adev = &priv->adev[i]->adev; + /* Auxiliary driver was unbind manually through sysfs */ + if (!adev->dev.driver) + goto skip_suspend; + adrv = to_auxiliary_drv(adev->dev.driver); if (adrv->suspend) { @@ -357,6 +371,7 @@ void mlx5_detach_device(struct mlx5_core_dev *dev) continue; } +skip_suspend: del_adev(&priv->adev[i]->adev); priv->adev[i] = NULL; } -- cgit v1.2.3 From bbc8222dc49db8d49add0f27bcac33f4b92193dc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 8 Jun 2021 19:14:08 +0300 Subject: net/mlx5: E-Switch, Read PF mac address External controller PF's MAC address is not read from the device during vport setup. Fail to read this results in showing all zeros to user while the factory programmed MAC is a valid value. $ devlink port show eth1 -jp { "port": { "pci/0000:03:00.0/196608": { "type": "eth", "netdev": "eth1", "flavour": "pcipf", "controller": 1, "pfnum": 0, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Hence, read it when enabling a vport. After the fix, $ devlink port show eth1 -jp { "port": { "pci/0000:03:00.0/196608": { "type": "eth", "netdev": "eth1", "flavour": "pcipf", "controller": 1, "pfnum": 0, "splittable": false, "function": { "hw_addr": "98:03:9b:a0:60:11" } } } } Fixes: f099fde16db3 ("net/mlx5: E-switch, Support querying port function mac address") Signed-off-by: Bodong Wang Signed-off-by: Parav Pandit Reviewed-by: Alaa Hleihel Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index b88705a3a1a8..97e6cb6f13c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1054,6 +1054,12 @@ int mlx5_esw_vport_enable(struct mlx5_eswitch *esw, u16 vport_num, goto err_vhca_mapping; } + /* External controller host PF has factory programmed MAC. + * Read it from the device. + */ + if (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) + mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, vport->info.mac); + esw_vport_change_handle_locked(vport); esw->enabled_vports++; -- cgit v1.2.3 From ca36fc4d77b35b8d142cf1ed0eae5ec2e071dc3c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 8 Jun 2021 19:03:24 +0300 Subject: net/mlx5: E-Switch, Allow setting GUID for host PF vport E-switch should be able to set the GUID of host PF vport. Currently it returns an error. This results in below error when user attempts to configure MAC address of the PF of an external controller. $ devlink port function set pci/0000:03:00.0/196608 \ hw_addr 00:00:00:11:22:33 mlx5_core 0000:03:00.0: mlx5_esw_set_vport_mac_locked:1876:(pid 6715):\ "Failed to set vport 0 node guid, err = -22. RDMA_CM will not function properly for this VF." Check for zero vport is no longer needed. Fixes: 330077d14de1 ("net/mlx5: E-switch, Supporting setting devlink port function mac address") Signed-off-by: Yuval Avnery Signed-off-by: Parav Pandit Reviewed-by: Bodong Wang Reviewed-by: Alaa Hleihel Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 457ad42eaa2a..4c1440a95ad7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -465,8 +465,6 @@ int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, void *in; int err; - if (!vport) - return -EINVAL; if (!MLX5_CAP_GEN(mdev, vport_group_manager)) return -EACCES; -- cgit v1.2.3 From c7d6c19b3bde66d7aebbe93e0f9e6d9ff57fc3fa Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 10 Jun 2021 18:39:53 +0300 Subject: net/mlx5: SF_DEV, remove SF device on invalid state When auxiliary bus autoprobe is disabled and SF is in ACTIVE state, on SF port deletion it transitions from ACTIVE->ALLOCATED->INVALID. When VHCA event handler queries the state, it is already transition to INVALID state. In this scenario, event handler missed to delete the SF device. Fix it by deleting the SF when SF state is INVALID. Fixes: 90d010b8634b ("net/mlx5: SF, Add auxiliary device support") Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index 6a0c6f965ad1..fa0288afc0dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -163,6 +163,7 @@ mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_ sf_index = event->function_id - base_id; sf_dev = xa_load(&table->devices, sf_index); switch (event->new_vhca_state) { + case MLX5_VHCA_STATE_INVALID: case MLX5_VHCA_STATE_ALLOCATED: if (sf_dev) mlx5_sf_dev_del(table->dev, sf_dev, sf_index); -- cgit v1.2.3 From 65fb7d109abe3a1a9f1c2d3ba7e1249bc978d5f0 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Tue, 1 Jun 2021 18:10:06 +0300 Subject: net/mlx5: DR, Fix STEv1 incorrect L3 decapsulation padding Decapsulation L3 on small inner packets which are less than 64 Bytes was done incorrectly. In small packets there is an extra padding added in L2 which should not be included in L3 length. The issue was that after decapL3 the extra L2 padding caused an update on the L3 length. To avoid this issue the new header is pushed to the beginning of the packet (offset 0) which should not cause a HW reparse and update the L3 length. Fixes: c349b4137cfd ("net/mlx5: DR, Add STEv1 modify header logic") Reviewed-by: Erez Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_ste_v1.c | 26 +++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c index 054c2e2b6554..7466f016375c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c @@ -694,7 +694,11 @@ static int dr_ste_v1_set_action_decap_l3_list(void *data, if (hw_action_sz / DR_STE_ACTION_DOUBLE_SZ < DR_STE_DECAP_L3_ACTION_NUM) return -EINVAL; - memcpy(padded_data, data, data_sz); + inline_data_sz = + MLX5_FLD_SZ_BYTES(ste_double_action_insert_with_inline_v1, inline_data); + + /* Add an alignment padding */ + memcpy(padded_data + data_sz % inline_data_sz, data, data_sz); /* Remove L2L3 outer headers */ MLX5_SET(ste_single_action_remove_header_v1, hw_action, action_id, @@ -706,32 +710,34 @@ static int dr_ste_v1_set_action_decap_l3_list(void *data, hw_action += DR_STE_ACTION_DOUBLE_SZ; used_actions++; /* Remove and NOP are a single double action */ - inline_data_sz = - MLX5_FLD_SZ_BYTES(ste_double_action_insert_with_inline_v1, inline_data); + /* Point to the last dword of the header */ + data_ptr += (data_sz / inline_data_sz) * inline_data_sz; - /* Add the new header inline + 2 extra bytes */ + /* Add the new header using inline action 4Byte at a time, the header + * is added in reversed order to the beginning of the packet to avoid + * incorrect parsing by the HW. Since header is 14B or 18B an extra + * two bytes are padded and later removed. + */ for (i = 0; i < data_sz / inline_data_sz + 1; i++) { void *addr_inline; MLX5_SET(ste_double_action_insert_with_inline_v1, hw_action, action_id, DR_STE_V1_ACTION_ID_INSERT_INLINE); /* The hardware expects here offset to words (2 bytes) */ - MLX5_SET(ste_double_action_insert_with_inline_v1, hw_action, start_offset, - i * 2); + MLX5_SET(ste_double_action_insert_with_inline_v1, hw_action, start_offset, 0); /* Copy bytes one by one to avoid endianness problem */ addr_inline = MLX5_ADDR_OF(ste_double_action_insert_with_inline_v1, hw_action, inline_data); - memcpy(addr_inline, data_ptr, inline_data_sz); + memcpy(addr_inline, data_ptr - i * inline_data_sz, inline_data_sz); hw_action += DR_STE_ACTION_DOUBLE_SZ; - data_ptr += inline_data_sz; used_actions++; } - /* Remove 2 extra bytes */ + /* Remove first 2 extra bytes */ MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, action_id, DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); - MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, start_offset, data_sz / 2); + MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, start_offset, 0); /* The hardware expects here size in words (2 bytes) */ MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, remove_size, 1); used_actions++; -- cgit v1.2.3 From a5ae8fc9058e37437c8c1f82b3d412b4abd1b9e6 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Fri, 14 May 2021 11:14:19 +0300 Subject: net/mlx5e: Don't create devices during unload flow Running devlink reload command for port in switchdev mode cause resources to corrupt: driver can't release allocated EQ and reclaim memory pages, because "rdma" auxiliary device had add CQs which blocks EQ from deletion. Erroneous sequence happens during reload-down phase, and is following: 1. detach device - suspends auxiliary devices which support it, destroys others. During this step "eth-rep" and "rdma-rep" are destroyed, "eth" - suspended. 2. disable SRIOV - moves device to legacy mode; as part of disablement - rescans drivers. This step adds "rdma" auxiliary device. 3. destroy EQ table - . Driver shouldn't create any device during unload flows. To handle that implement MLX5_PRIV_FLAGS_DETACH flag, set it on device detach and unset on device attach. If flag is set do no-op on drivers rescan. Fixes: a925b5e309c9 ("net/mlx5: Register mlx5 devices to auxiliary virtual bus") Signed-off-by: Dmytro Linkin Reviewed-by: Leon Romanovsky Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 4 ++++ include/linux/mlx5/driver.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 8de118adfb54..ceebfc20f65e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -303,6 +303,7 @@ int mlx5_attach_device(struct mlx5_core_dev *dev) int ret = 0, i; mutex_lock(&mlx5_intf_mutex); + priv->flags &= ~MLX5_PRIV_FLAGS_DETACH; for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) { if (!priv->adev[i]) { bool is_supported = false; @@ -375,6 +376,7 @@ skip_suspend: del_adev(&priv->adev[i]->adev); priv->adev[i] = NULL; } + priv->flags |= MLX5_PRIV_FLAGS_DETACH; mutex_unlock(&mlx5_intf_mutex); } @@ -463,6 +465,8 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) struct mlx5_priv *priv = &dev->priv; lockdep_assert_held(&mlx5_intf_mutex); + if (priv->flags & MLX5_PRIV_FLAGS_DETACH) + return 0; delete_drivers(dev); if (priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 020a8f7fdbdd..f8902bcd91e2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -542,6 +542,10 @@ struct mlx5_core_roce { enum { MLX5_PRIV_FLAGS_DISABLE_IB_ADEV = 1 << 0, MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV = 1 << 1, + /* Set during device detach to block any further devices + * creation/deletion on drivers rescan. Unset during device attach. + */ + MLX5_PRIV_FLAGS_DETACH = 1 << 2, }; struct mlx5_adev { -- cgit v1.2.3 From 0232fc2ddcf4ffe01069fd1aa07922652120f44a Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 10 Jun 2021 14:20:28 +0300 Subject: net/mlx5: Reset mkey index on creation Reset only the index part of the mkey and keep the variant part. On devlink reload, driver recreates mkeys, so the mkey index may change. Trying to preserve the variant part of the mkey, driver mistakenly merged the mkey index with current value. In case of a devlink reload, current value of index part is dirty, so the index may be corrupted. Fixes: 54c62e13ad76 ("{IB,net}/mlx5: Setup mkey variant before mr create command invocation") Signed-off-by: Aya Levin Signed-off-by: Amir Tzin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 50af84e76fb6..174f71ed5280 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -54,7 +54,7 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); mkey->iova = MLX5_GET64(mkc, mkc, start_addr); mkey->size = MLX5_GET64(mkc, mkc, len); - mkey->key |= mlx5_idx_to_mkey(mkey_index); + mkey->key = (u32)mlx5_mkey_variant(mkey->key) | mlx5_idx_to_mkey(mkey_index); mkey->pd = MLX5_GET(mkc, mkc, pd); init_waitqueue_head(&mkey->wait); -- cgit v1.2.3 From c19c8c0e666f9259e2fc4d2fa4b9ff8e3b40ee5d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 16 Jun 2021 20:43:37 +0200 Subject: be2net: Fix an error handling path in 'be_probe()' If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: d6b6d9877878 ("be2net: use PCIe AER capability") Signed-off-by: Christophe JAILLET Acked-by: Somnath Kotur Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index b6eba29d8e99..7968568bbe21 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5897,6 +5897,7 @@ drv_cleanup: unmap_bars: be_unmap_pci_bars(adapter); free_netdev: + pci_disable_pcie_error_reporting(pdev); free_netdev(netdev); rel_reg: pci_release_regions(pdev); -- cgit v1.2.3 From 7edcc682301492380fbdd604b4516af5ae667a13 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Wed, 16 Jun 2021 22:09:06 +0300 Subject: net: hamradio: fix memory leak in mkiss_close My local syzbot instance hit memory leak in mkiss_open()[1]. The problem was in missing free_netdev() in mkiss_close(). In mkiss_open() netdevice is allocated and then registered, but in mkiss_close() netdevice was only unregistered, but not freed. Fail log: BUG: memory leak unreferenced object 0xffff8880281ba000 (size 4096): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): 61 78 30 00 00 00 00 00 00 00 00 00 00 00 00 00 ax0............. 00 27 fa 2a 80 88 ff ff 00 00 00 00 00 00 00 00 .'.*............ backtrace: [] kvmalloc_node+0x61/0xf0 [] alloc_netdev_mqs+0x98/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff8880141a9a00 (size 96): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): e8 a2 1b 28 80 88 ff ff e8 a2 1b 28 80 88 ff ff ...(.......(.... 98 92 9c aa b0 40 02 00 00 00 00 00 00 00 00 00 .....@.......... backtrace: [] __hw_addr_create_ex+0x5b/0x310 [] __hw_addr_add_ex+0x1f8/0x2b0 [] dev_addr_init+0x10b/0x1f0 [] alloc_netdev_mqs+0x13b/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff8880219bfc00 (size 512): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): 00 a0 1b 28 80 88 ff ff 80 8f b1 8d ff ff ff ff ...(............ 80 8f b1 8d ff ff ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [] kvmalloc_node+0x61/0xf0 [] alloc_netdev_mqs+0x777/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff888029b2b200 (size 256): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kvmalloc_node+0x61/0xf0 [] alloc_netdev_mqs+0x912/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 815f62bf7427 ("[PATCH] SMP rewrite of mkiss") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/hamradio/mkiss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 65154224d5b8..7685a1721597 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -799,6 +799,7 @@ static void mkiss_close(struct tty_struct *tty) ax->tty = NULL; unregister_netdev(ax->dev); + free_netdev(ax->dev); } /* Perform I/O control on an active ax25 channel. */ -- cgit v1.2.3 From c3b26fdf1b32f91c7a3bc743384b4a298ab53ad7 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Thu, 17 Jun 2021 07:32:32 +0800 Subject: net: cdc_eem: fix tx fixup skb leak when usbnet transmit a skb, eem fixup it in eem_tx_fixup(), if skb_copy_expand() failed, it return NULL, usbnet_start_xmit() will have no chance to free original skb. fix it by free orginal skb in eem_tx_fixup() first, then check skb clone status, if failed, return NULL to usbnet. Fixes: 9f722c0978b0 ("usbnet: CDC EEM support (v5)") Signed-off-by: Linyu Yuan Reviewed-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/net/usb/cdc_eem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_eem.c b/drivers/net/usb/cdc_eem.c index 2e60bc1b9a6b..359ea0d10e59 100644 --- a/drivers/net/usb/cdc_eem.c +++ b/drivers/net/usb/cdc_eem.c @@ -123,10 +123,10 @@ static struct sk_buff *eem_tx_fixup(struct usbnet *dev, struct sk_buff *skb, } skb2 = skb_copy_expand(skb, EEM_HEAD, ETH_FCS_LEN + padlen, flags); + dev_kfree_skb_any(skb); if (!skb2) return NULL; - dev_kfree_skb_any(skb); skb = skb2; done: -- cgit v1.2.3 From 1c200f832e14420fa770193f9871f4ce2df00d07 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jun 2021 10:09:53 -0700 Subject: net: qed: Fix memcpy() overflow of qed_dcbx_params() The source (&dcbx_info->operational.params) and dest (&p_hwfn->p_dcbx_info->set.config.params) are both struct qed_dcbx_params (560 bytes), not struct qed_dcbx_admin_params (564 bytes), which is used as the memcpy() size. However it seems that struct qed_dcbx_operational_params (dcbx_info->operational)'s layout matches struct qed_dcbx_admin_params (p_hwfn->p_dcbx_info->set.config)'s 4 byte difference (3 padding, 1 byte for "valid"). On the assumption that the size is wrong (rather than the source structure type), adjust the memcpy() size argument to be 4 bytes smaller and add a BUILD_BUG_ON() to validate any changes to the structure sizes. Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index 17d5b649eb36..e81dd34a3cac 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -1266,9 +1266,11 @@ int qed_dcbx_get_config_params(struct qed_hwfn *p_hwfn, p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_STATIC; p_hwfn->p_dcbx_info->set.enabled = dcbx_info->operational.enabled; + BUILD_BUG_ON(sizeof(dcbx_info->operational.params) != + sizeof(p_hwfn->p_dcbx_info->set.config.params)); memcpy(&p_hwfn->p_dcbx_info->set.config.params, &dcbx_info->operational.params, - sizeof(struct qed_dcbx_admin_params)); + sizeof(p_hwfn->p_dcbx_info->set.config.params)); p_hwfn->p_dcbx_info->set.config.valid = true; memcpy(params, &p_hwfn->p_dcbx_info->set, sizeof(struct qed_dcbx_set)); -- cgit v1.2.3 From 1236af327af476731aa548dfcbbefb1a3ec6726a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 17 Jun 2021 12:38:54 +0200 Subject: mac80211: minstrel_ht: fix sample time check We need to skip sampling if the next sample time is after jiffies, not before. This patch fixes an issue where in some cases only very little sampling (or none at all) is performed, leading to really bad data rates Fixes: 80d55154b2f8 ("mac80211: minstrel_ht: significantly redesign the rate probing strategy") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20210617103854.61875-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 6487b05da6fa..a6f3fb4a9197 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1514,7 +1514,7 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) return; - if (time_is_before_jiffies(mi->sample_time)) + if (time_is_after_jiffies(mi->sample_time)) return; mi->sample_time = jiffies + MINSTREL_SAMPLE_INTERVAL; -- cgit v1.2.3 From b5642479b0f7168fe16d156913533fe65ab4f8d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:29 +0300 Subject: cfg80211: make certificate generation more robust If all net/wireless/certs/*.hex files are deleted, the build will hang at this point since the 'cat' command will have no arguments. Do "echo | cat - ..." so that even if the "..." part is empty, the whole thing won't hang. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.c989056c3664.Ic3b77531d00b30b26dcd69c64e55ae2f60c3f31e@changeid Signed-off-by: Johannes Berg --- net/wireless/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 2eee93985ab0..af590ae606b6 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -28,7 +28,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) @$(kecho) " GEN $@" @(echo '#include "reg.h"'; \ echo 'const u8 shipped_regdb_certs[] = {'; \ - cat $^ ; \ + echo | cat - $^ ; \ echo '};'; \ echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ ) > $@ -- cgit v1.2.3 From 0288e5e16a2e18f0b7e61a2b70d9037fc6e4abeb Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Jun 2021 13:41:31 +0300 Subject: cfg80211: avoid double free of PMSR request If cfg80211_pmsr_process_abort() moves all the PMSR requests that need to be freed into a local list before aborting and freeing them. As a result, it is possible that cfg80211_pmsr_complete() will run in parallel and free the same PMSR request. Fix it by freeing the request in cfg80211_pmsr_complete() only if it is still in the original pmsr list. Cc: stable@vger.kernel.org Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.1fbef57e269a.I00294bebdb0680b892f8d1d5c871fd9dbe785a5e@changeid Signed-off-by: Johannes Berg --- net/wireless/pmsr.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 6bdd96408022..d245968b74cb 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -334,6 +334,7 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_pmsr_request *tmp, *prev, *to_free = NULL; struct sk_buff *msg; void *hdr; @@ -364,9 +365,20 @@ free_msg: nlmsg_free(msg); free_request: spin_lock_bh(&wdev->pmsr_lock); - list_del(&req->list); + /* + * cfg80211_pmsr_process_abort() may have already moved this request + * to the free list, and will free it later. In this case, don't free + * it here. + */ + list_for_each_entry_safe(tmp, prev, &wdev->pmsr_list, list) { + if (tmp == req) { + list_del(&req->list); + to_free = req; + break; + } + } spin_unlock_bh(&wdev->pmsr_lock); - kfree(req); + kfree(to_free); } EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete); -- cgit v1.2.3 From bbc6f03ff26e7b71d6135a7b78ce40e7dee3d86a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:49 +0300 Subject: mac80211: reset profile_periodicity/ema_ap Apparently we never clear these values, so they'll remain set since the setting of them is conditional. Clear the values in the relevant other cases. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.316e32d136a9.I2a12e51814258e1e1b526103894f4b9f19a91c8d@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2480bd0577bb..3f2aad2e7436 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4062,10 +4062,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (elems.mbssid_config_ie) bss_conf->profile_periodicity = elems.mbssid_config_ie->profile_periodicity; + else + bss_conf->profile_periodicity = 0; if (elems.ext_capab_len >= 11 && (elems.ext_capab[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) bss_conf->ema_ap = true; + else + bss_conf->ema_ap = false; /* continue assoc process */ ifmgd->assoc_data->timeout = jiffies; @@ -5802,12 +5806,16 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, beacon_ies->data, beacon_ies->len); if (elem && elem->datalen >= 3) sdata->vif.bss_conf.profile_periodicity = elem->data[2]; + else + sdata->vif.bss_conf.profile_periodicity = 0; elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, beacon_ies->data, beacon_ies->len); if (elem && elem->datalen >= 11 && (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) sdata->vif.bss_conf.ema_ap = true; + else + sdata->vif.bss_conf.ema_ap = false; } else { assoc_data->timeout = jiffies; assoc_data->timeout_started = true; -- cgit v1.2.3 From 652e8363bbc7d149fa194a5cbf30b1001c0274b0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Jun 2021 13:41:45 +0300 Subject: mac80211: handle various extensible elements correctly Various elements are parsed with a requirement to have an exact size, when really we should only check that they have the minimum size that we need. Check only that and therefore ignore any additional data that they might carry. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210618133832.cd101f8040a4.Iadf0e9b37b100c6c6e79c7b298cc657c2be9151a@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 93d96a4f9c3e..060059ef9668 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -947,7 +947,7 @@ static void ieee80211_parse_extension_element(u32 *crc, switch (elem->data[0]) { case WLAN_EID_EXT_HE_MU_EDCA: - if (len == sizeof(*elems->mu_edca_param_set)) { + if (len >= sizeof(*elems->mu_edca_param_set)) { elems->mu_edca_param_set = data; if (crc) *crc = crc32_be(*crc, (void *)elem, @@ -968,7 +968,7 @@ static void ieee80211_parse_extension_element(u32 *crc, } break; case WLAN_EID_EXT_UORA: - if (len == 1) + if (len >= 1) elems->uora_element = data; break; case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME: @@ -976,7 +976,7 @@ static void ieee80211_parse_extension_element(u32 *crc, elems->max_channel_switch_time = data; break; case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION: - if (len == sizeof(*elems->mbssid_config_ie)) + if (len >= sizeof(*elems->mbssid_config_ie)) elems->mbssid_config_ie = data; break; case WLAN_EID_EXT_HE_SPR: @@ -985,7 +985,7 @@ static void ieee80211_parse_extension_element(u32 *crc, elems->he_spr = data; break; case WLAN_EID_EXT_HE_6GHZ_CAPA: - if (len == sizeof(*elems->he_6ghz_capa)) + if (len >= sizeof(*elems->he_6ghz_capa)) elems->he_6ghz_capa = data; break; } @@ -1074,14 +1074,14 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, switch (id) { case WLAN_EID_LINK_ID: - if (elen + 2 != sizeof(struct ieee80211_tdls_lnkie)) { + if (elen + 2 < sizeof(struct ieee80211_tdls_lnkie)) { elem_parse_failed = true; break; } elems->lnk_id = (void *)(pos - 2); break; case WLAN_EID_CHAN_SWITCH_TIMING: - if (elen != sizeof(struct ieee80211_ch_switch_timing)) { + if (elen < sizeof(struct ieee80211_ch_switch_timing)) { elem_parse_failed = true; break; } @@ -1244,7 +1244,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elems->sec_chan_offs = (void *)pos; break; case WLAN_EID_CHAN_SWITCH_PARAM: - if (elen != + if (elen < sizeof(*elems->mesh_chansw_params_ie)) { elem_parse_failed = true; break; @@ -1253,7 +1253,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, break; case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: if (!action || - elen != sizeof(*elems->wide_bw_chansw_ie)) { + elen < sizeof(*elems->wide_bw_chansw_ie)) { elem_parse_failed = true; break; } @@ -1272,7 +1272,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, ie = cfg80211_find_ie(WLAN_EID_WIDE_BW_CHANNEL_SWITCH, pos, elen); if (ie) { - if (ie[1] == sizeof(*elems->wide_bw_chansw_ie)) + if (ie[1] >= sizeof(*elems->wide_bw_chansw_ie)) elems->wide_bw_chansw_ie = (void *)(ie + 2); else @@ -1316,7 +1316,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elems->cisco_dtpc_elem = pos; break; case WLAN_EID_ADDBA_EXT: - if (elen != sizeof(struct ieee80211_addba_ext_ie)) { + if (elen < sizeof(struct ieee80211_addba_ext_ie)) { elem_parse_failed = true; break; } @@ -1342,7 +1342,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elem, elems); break; case WLAN_EID_S1G_CAPABILITIES: - if (elen == sizeof(*elems->s1g_capab)) + if (elen >= sizeof(*elems->s1g_capab)) elems->s1g_capab = (void *)pos; else elem_parse_failed = true; -- cgit v1.2.3 From 39eb028183bc7378bb6187067e20bf6d8c836407 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 18 Jun 2021 11:29:48 +0200 Subject: cxgb4: fix wrong shift. While fixing coverity warning, commit dd2c79677375 introduced typo in shift value. Fix that. Signed-off-by: Pavel Machek (CIP) Fixes: dd2c79677375 ("cxgb4: Fix unintentional sign extension issues") Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 22c9ac922eba..6260b3bebd2b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -198,7 +198,7 @@ static void set_nat_params(struct adapter *adap, struct filter_entry *f, WORD_MASK, f->fs.nat_lip[3] | f->fs.nat_lip[2] << 8 | f->fs.nat_lip[1] << 16 | - (u64)f->fs.nat_lip[0] << 25, 1); + (u64)f->fs.nat_lip[0] << 24, 1); } } -- cgit v1.2.3 From 0afd6a4e8028cc487c240b6cfe04094e45a306e4 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 18 Jun 2021 02:07:25 -0400 Subject: bnxt_en: Rediscover PHY capabilities after firmware reset There is a missing bnxt_probe_phy() call in bnxt_fw_init_one() to rediscover the PHY capabilities after a firmware reset. This can cause some PHY related functionalities to fail after a firmware reset. For example, in multi-host, the ability for any host to configure the PHY settings may be lost after a firmware reset. Fixes: ec5d31e3c15d ("bnxt_en: Handle firmware reset status during IF_UP.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fcc729d52b17..3685db6dc93d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11750,6 +11750,8 @@ static void bnxt_fw_init_one_p3(struct bnxt *bp) bnxt_hwrm_coal_params_qcaps(bp); } +static int bnxt_probe_phy(struct bnxt *bp, bool fw_dflt); + static int bnxt_fw_init_one(struct bnxt *bp) { int rc; @@ -11764,6 +11766,9 @@ static int bnxt_fw_init_one(struct bnxt *bp) netdev_err(bp->dev, "Firmware init phase 2 failed\n"); return rc; } + rc = bnxt_probe_phy(bp, false); + if (rc) + return rc; rc = bnxt_approve_mac(bp, bp->dev->dev_addr, false); if (rc) return rc; -- cgit v1.2.3 From c12e1643d2738bcd4e26252ce531878841dd3f38 Mon Sep 17 00:00:00 2001 From: Rukhsana Ansari Date: Fri, 18 Jun 2021 02:07:26 -0400 Subject: bnxt_en: Fix TQM fastpath ring backing store computation TQM fastpath ring needs to be sized to store both the requester and responder side of RoCE QPs in TQM for supporting bi-directional tests. Fix bnxt_alloc_ctx_mem() to multiply the RoCE QPs by a factor of 2 when computing the number of entries for TQM fastpath ring. This fixes an RX pipeline stall issue when running bi-directional max RoCE QP tests. Fixes: c7dd7ab4b204 ("bnxt_en: Improve TQM ring context memory sizing formulas.") Signed-off-by: Rukhsana Ansari Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3685db6dc93d..c913cb1f2a72 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7308,7 +7308,7 @@ skip_rdma: entries_sp = ctx->vnic_max_vnic_entries + ctx->qp_max_l2_entries + 2 * (extra_qps + ctx->qp_min_qp1_entries) + min; entries_sp = roundup(entries_sp, ctx->tqm_entries_multiple); - entries = ctx->qp_max_l2_entries + extra_qps + ctx->qp_min_qp1_entries; + entries = ctx->qp_max_l2_entries + 2 * (extra_qps + ctx->qp_min_qp1_entries); entries = roundup(entries, ctx->tqm_entries_multiple); entries = clamp_t(u32, entries, min, ctx->tqm_max_entries_per_ring); for (i = 0; i < ctx->tqm_fp_rings_count + 1; i++) { -- cgit v1.2.3 From 03400aaa69f916a376e11526cf591901a96a3a5c Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Fri, 18 Jun 2021 02:07:27 -0400 Subject: bnxt_en: Call bnxt_ethtool_free() in bnxt_init_one() error path bnxt_ethtool_init() may have allocated some memory and we need to call bnxt_ethtool_free() to properly unwind if bnxt_init_one() fails. Fixes: 7c3809181468 ("bnxt_en: Refactor bnxt_init_one() and turn on TPA support on 57500 chips.") Signed-off-by: Somnath Kotur Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c913cb1f2a72..aef3fccc27a9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13160,6 +13160,7 @@ init_err_pci_clean: bnxt_hwrm_func_drv_unrgtr(bp); bnxt_free_hwrm_short_cmd_req(bp); bnxt_free_hwrm_resources(bp); + bnxt_ethtool_free(bp); kfree(bp->fw_health); bp->fw_health = NULL; bnxt_cleanup_pci(bp); -- cgit v1.2.3 From 35036d69b9bd6f06201f8e2f6b9cadb21ad1e093 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 18 Jun 2021 09:00:30 +0200 Subject: MAINTAINERS: add Guvenc as SMC maintainer Add Guvenc as maintainer for Shared Memory Communications (SMC) Sockets. Cc: Julian Wiedmann Acked-by: Guvenc Gulce Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index bfb3d0931cba..1634bb68972f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16554,6 +16554,7 @@ F: drivers/misc/sgi-xp/ SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS M: Karsten Graul +M: Guvenc Gulce L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ -- cgit v1.2.3 From 6aa32217a9a446275440ee8724b1ecaf1838df47 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 18 Jun 2021 12:52:23 +0200 Subject: net: ll_temac: Make sure to free skb when it is completely used With the skb pointer piggy-backed on the TX BD, we have a simple and efficient way to free the skb buffer when the frame has been transmitted. But in order to avoid freeing the skb while there are still fragments from the skb in use, we need to piggy-back on the TX BD of the skb, not the first. Without this, we are doing use-after-free on the DMA side, when the first BD of a multi TX BD packet is seen as completed in xmit_done, and the remaining BDs are still being processed. Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index a1f5f07f4ca9..e82f162cd80c 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -876,7 +876,6 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev) return NETDEV_TX_OK; } cur_p->phys = cpu_to_be32(skb_dma_addr); - ptr_to_txbd((void *)skb, cur_p); for (ii = 0; ii < num_frag; ii++) { if (++lp->tx_bd_tail >= lp->tx_bd_num) @@ -915,6 +914,11 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev) } cur_p->app0 |= cpu_to_be32(STS_CTRL_APP0_EOP); + /* Mark last fragment with skb address, so it can be consumed + * in temac_start_xmit_done() + */ + ptr_to_txbd((void *)skb, cur_p); + tail_p = lp->tx_bd_p + sizeof(*lp->tx_bd_v) * lp->tx_bd_tail; lp->tx_bd_tail++; if (lp->tx_bd_tail >= lp->tx_bd_num) -- cgit v1.2.3 From 28d9fab458b16bcd83f9dd07ede3d585c3e1a69e Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 18 Jun 2021 12:52:28 +0200 Subject: net: ll_temac: Add memory-barriers for TX BD access Add a couple of memory-barriers to ensure correct ordering of read/write access to TX BDs. In xmit_done, we should ensure that reading the additional BD fields are only done after STS_CTRL_APP0_CMPLT bit is set. When xmit_done marks the BD as free by setting APP0=0, we need to ensure that the other BD fields are reset first, so we avoid racing with the xmit path, which writes to the same fields. Finally, making sure to read APP0 of next BD after the current BD, ensures that we see all available buffers. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index e82f162cd80c..9797aa3221d1 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -774,12 +774,15 @@ static void temac_start_xmit_done(struct net_device *ndev) stat = be32_to_cpu(cur_p->app0); while (stat & STS_CTRL_APP0_CMPLT) { + /* Make sure that the other fields are read after bd is + * released by dma + */ + rmb(); dma_unmap_single(ndev->dev.parent, be32_to_cpu(cur_p->phys), be32_to_cpu(cur_p->len), DMA_TO_DEVICE); skb = (struct sk_buff *)ptr_from_txbd(cur_p); if (skb) dev_consume_skb_irq(skb); - cur_p->app0 = 0; cur_p->app1 = 0; cur_p->app2 = 0; cur_p->app3 = 0; @@ -788,6 +791,12 @@ static void temac_start_xmit_done(struct net_device *ndev) ndev->stats.tx_packets++; ndev->stats.tx_bytes += be32_to_cpu(cur_p->len); + /* app0 must be visible last, as it is used to flag + * availability of the bd + */ + smp_mb(); + cur_p->app0 = 0; + lp->tx_bd_ci++; if (lp->tx_bd_ci >= lp->tx_bd_num) lp->tx_bd_ci = 0; @@ -814,6 +823,9 @@ static inline int temac_check_tx_bd_space(struct temac_local *lp, int num_frag) if (cur_p->app0) return NETDEV_TX_BUSY; + /* Make sure to read next bd app0 after this one */ + rmb(); + tail++; if (tail >= lp->tx_bd_num) tail = 0; -- cgit v1.2.3 From c364df2489b8ef2f5e3159b1dff1ff1fdb16040d Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 18 Jun 2021 12:52:33 +0200 Subject: net: ll_temac: Fix TX BD buffer overwrite Just as the initial check, we need to ensure num_frag+1 buffers available, as that is the number of buffers we are going to use. This fixes a buffer overflow, which might be seen during heavy network load. Complete lockup of TEMAC was reproducible within about 10 minutes of a particular load. Fixes: 84823ff80f74 ("net: ll_temac: Fix race condition causing TX hang") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 9797aa3221d1..cc482ee36501 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -861,7 +861,7 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev) smp_mb(); /* Space might have just been freed - check again */ - if (temac_check_tx_bd_space(lp, num_frag)) + if (temac_check_tx_bd_space(lp, num_frag + 1)) return NETDEV_TX_BUSY; netif_wake_queue(ndev); -- cgit v1.2.3 From f6396341194234e9b01cd7538bc2c6ac4501ab14 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 18 Jun 2021 12:52:38 +0200 Subject: net: ll_temac: Avoid ndo_start_xmit returning NETDEV_TX_BUSY As documented in Documentation/networking/driver.rst, the ndo_start_xmit method must not return NETDEV_TX_BUSY under any normal circumstances, and as recommended, we simply stop the tx queue in advance, when there is a risk that the next xmit would cause a NETDEV_TX_BUSY return. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index cc482ee36501..9a13953ea70f 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -942,6 +942,11 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev) wmb(); lp->dma_out(lp, TX_TAILDESC_PTR, tail_p); /* DMA start */ + if (temac_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) { + netdev_info(ndev, "%s -> netif_stop_queue\n", __func__); + netif_stop_queue(ndev); + } + return NETDEV_TX_OK; } -- cgit v1.2.3 From 321827477360934dc040e9d3c626bf1de6c3ab3c Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 18 Jun 2021 13:04:35 +0200 Subject: icmp: don't send out ICMP messages with a source address of 0.0.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When constructing ICMP response messages, the kernel will try to pick a suitable source address for the outgoing packet. However, if no IPv4 addresses are configured on the system at all, this will fail and we end up producing an ICMP message with a source address of 0.0.0.0. This can happen on a box routing IPv4 traffic via v6 nexthops, for instance. Since 0.0.0.0 is not generally routable on the internet, there's a good chance that such ICMP messages will never make it back to the sender of the original packet that the ICMP message was sent in response to. This, in turn, can create connectivity and PMTUd problems for senders. Fortunately, RFC7600 reserves a dummy address to be used as a source for ICMP messages (192.0.0.8/32), so let's teach the kernel to substitute that address as a last resort if the regular source address selection procedure fails. Below is a quick example reproducing this issue with network namespaces: ip netns add ns0 ip l add type veth peer netns ns0 ip l set dev veth0 up ip a add 10.0.0.1/24 dev veth0 ip a add fc00:dead:cafe:42::1/64 dev veth0 ip r add 10.1.0.0/24 via inet6 fc00:dead:cafe:42::2 ip -n ns0 l set dev veth0 up ip -n ns0 a add fc00:dead:cafe:42::2/64 dev veth0 ip -n ns0 r add 10.0.0.0/24 via inet6 fc00:dead:cafe:42::1 ip netns exec ns0 sysctl -w net.ipv4.icmp_ratelimit=0 ip netns exec ns0 sysctl -w net.ipv4.ip_forward=1 tcpdump -tpni veth0 -c 2 icmp & ping -w 1 10.1.0.1 > /dev/null tcpdump: verbose output suppressed, use -v[v]... for full protocol decode listening on veth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 29, seq 1, length 64 IP 0.0.0.0 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92 2 packets captured 2 packets received by filter 0 packets dropped by kernel With this patch the above capture changes to: IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 31127, seq 1, length 64 IP 192.0.0.8 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Juliusz Chroboczek Reviewed-by: David Ahern Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 3 +++ net/ipv4/icmp.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 7d6687618d80..d1b327036ae4 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -289,6 +289,9 @@ struct sockaddr_in { /* Address indicating an error return. */ #define INADDR_NONE ((unsigned long int) 0xffffffff) +/* Dummy address for src of ICMP replies if no real address is set (RFC7600). */ +#define INADDR_DUMMY ((unsigned long int) 0xc0000008) + /* Network number for local host loopback. */ #define IN_LOOPBACKNET 127 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7b6931a4d775..752e392083e6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -759,6 +759,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); + /* if we don't have a source address at this point, fall back to the + * dummy address instead of sending out a packet with a source address + * of 0.0.0.0 + */ + if (!fl4.saddr) + fl4.saddr = htonl(INADDR_DUMMY); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); -- cgit v1.2.3 From 7e9838b7915e29ae0dfe4a3e5f007c9dc6ab9b45 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 18 Jun 2021 13:04:36 +0200 Subject: selftests/net: Add icmp.sh for testing ICMP dummy address responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a new icmp.sh selftest for testing that the kernel will respond correctly with an ICMP unreachable message with the dummy (192.0.0.8) source address when there are no IPv4 addresses configured to use as source addresses. Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/icmp.sh | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100755 tools/testing/selftests/net/icmp.sh diff --git a/tools/testing/selftests/net/icmp.sh b/tools/testing/selftests/net/icmp.sh new file mode 100755 index 000000000000..e4b04cd1644a --- /dev/null +++ b/tools/testing/selftests/net/icmp.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test for checking ICMP response with dummy address instead of 0.0.0.0. +# Sets up two namespaces like: +# +----------------------+ +--------------------+ +# | ns1 | v4-via-v6 routes: | ns2 | +# | | ' | | +# | +--------+ -> 172.16.1.0/24 -> +--------+ | +# | | veth0 +--------------------------+ veth0 | | +# | +--------+ <- 172.16.0.0/24 <- +--------+ | +# | 172.16.0.1 | | 2001:db8:1::2/64 | +# | 2001:db8:1::2/64 | | | +# +----------------------+ +--------------------+ +# +# And then tries to ping 172.16.1.1 from ns1. This results in a "net +# unreachable" message being sent from ns2, but there is no IPv4 address set in +# that address space, so the kernel should substitute the dummy address +# 192.0.0.8 defined in RFC7600. + +NS1=ns1 +NS2=ns2 +H1_IP=172.16.0.1/32 +H1_IP6=2001:db8:1::1 +RT1=172.16.1.0/24 +PINGADDR=172.16.1.1 +RT2=172.16.0.0/24 +H2_IP6=2001:db8:1::2 + +TMPFILE=$(mktemp) + +cleanup() +{ + rm -f "$TMPFILE" + ip netns del $NS1 + ip netns del $NS2 +} + +trap cleanup EXIT + +# Namespaces +ip netns add $NS1 +ip netns add $NS2 + +# Connectivity +ip -netns $NS1 link add veth0 type veth peer name veth0 netns $NS2 +ip -netns $NS1 link set dev veth0 up +ip -netns $NS2 link set dev veth0 up +ip -netns $NS1 addr add $H1_IP dev veth0 +ip -netns $NS1 addr add $H1_IP6/64 dev veth0 nodad +ip -netns $NS2 addr add $H2_IP6/64 dev veth0 nodad +ip -netns $NS1 route add $RT1 via inet6 $H2_IP6 +ip -netns $NS2 route add $RT2 via inet6 $H1_IP6 + +# Make sure ns2 will respond with ICMP unreachable +ip netns exec $NS2 sysctl -qw net.ipv4.icmp_ratelimit=0 net.ipv4.ip_forward=1 + +# Run the test - a ping runs in the background, and we capture ICMP responses +# with tcpdump; -c 1 means it should exit on the first ping, but add a timeout +# in case something goes wrong +ip netns exec $NS1 ping -w 3 -i 0.5 $PINGADDR >/dev/null & +ip netns exec $NS1 timeout 10 tcpdump -tpni veth0 -c 1 'icmp and icmp[icmptype] != icmp-echo' > $TMPFILE 2>/dev/null + +# Parse response and check for dummy address +# tcpdump output looks like: +# IP 192.0.0.8 > 172.16.0.1: ICMP net 172.16.1.1 unreachable, length 92 +RESP_IP=$(awk '{print $2}' < $TMPFILE) +if [[ "$RESP_IP" != "192.0.0.8" ]]; then + echo "FAIL - got ICMP response from $RESP_IP, should be 192.0.0.8" + exit 1 +else + echo "OK" + exit 0 +fi -- cgit v1.2.3 From 9cca0c2d70149160407bda9a9446ce0c29b6e6c6 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 18 Jun 2021 16:49:02 +0300 Subject: net: ethernet: fix potential use-after-free in ec_bhf_remove static void ec_bhf_remove(struct pci_dev *dev) { ... struct ec_bhf_priv *priv = netdev_priv(net_dev); unregister_netdev(net_dev); free_netdev(net_dev); pci_iounmap(dev, priv->dma_io); pci_iounmap(dev, priv->io); ... } priv is netdev private data, but it is used after free_netdev(). It can cause use-after-free when accessing priv pointer. So, fix it by moving free_netdev() after pci_iounmap() calls. Fixes: 6af55ff52b02 ("Driver for Beckhoff CX5020 EtherCAT master module.") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/ethernet/ec_bhf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c index 46b0dbab8aad..7c992172933b 100644 --- a/drivers/net/ethernet/ec_bhf.c +++ b/drivers/net/ethernet/ec_bhf.c @@ -576,10 +576,12 @@ static void ec_bhf_remove(struct pci_dev *dev) struct ec_bhf_priv *priv = netdev_priv(net_dev); unregister_netdev(net_dev); - free_netdev(net_dev); pci_iounmap(dev, priv->dma_io); pci_iounmap(dev, priv->io); + + free_netdev(net_dev); + pci_release_regions(dev); pci_clear_master(dev); pci_disable_device(dev); -- cgit v1.2.3