From 07bf7908950a8b14e81aa1807e3c667eab39287a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 1 Aug 2018 13:45:11 +0200 Subject: xfrm: Validate address prefix lengths in the xfrm selector. We don't validate the address prefix lengths in the xfrm selector we got from userspace. This can lead to undefined behaviour in the address matching functions if the prefix is too big for the given address family. Fix this by checking the prefixes and refuse SA/policy insertation when a prefix is invalid. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Air Icy Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 33878e6e0d0a..5151b3ebf068 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -151,10 +151,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, err = -EINVAL; switch (p->family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + goto out; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + goto out; + break; #else err = -EAFNOSUPPORT; @@ -1359,10 +1365,16 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) switch (p->sel.family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + return -EINVAL; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + return -EINVAL; + break; #else return -EAFNOSUPPORT; -- cgit v1.2.3 From 215ab0f021c9fea3c18b75e7d522400ee6a49990 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 31 Aug 2018 08:38:49 -0300 Subject: xfrm6: call kfree_skb when skb is toobig After commit d6990976af7c5d8f55903bfb4289b6fb030bf754 ("vti6: fix PMTU caching and reporting on xmit"), some too big skbs might be potentially passed down to __xfrm6_output, causing it to fail to transmit but not free the skb, causing a leak of skb, and consequentially a leak of dst references. After running pmtu.sh, that shows as failure to unregister devices in a namespace: [ 311.397671] unregister_netdevice: waiting for veth_b to become free. Usage count = 1 The fix is to call kfree_skb in case of transmit failures. Fixes: dd767856a36e ("xfrm6: Don't call icmpv6_send on local error") Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5959ce9620eb..6a74080005cf 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -170,9 +170,11 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } -- cgit v1.2.3 From bfc0698bebcb16d19ecfc89574ad4d696955e5d3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 3 Sep 2018 04:36:52 -0700 Subject: xfrm: reset transport header back to network header after all input transforms ahave been applied A policy may have been set up with multiple transforms (e.g., ESP and ipcomp). In this situation, the ingress IPsec processing iterates in xfrm_input() and applies each transform in turn, processing the nexthdr to find any additional xfrm that may apply. This patch resets the transport header back to network header only after the last transformation so that subsequent xfrms can find the correct transport header. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Suggested-by: Steffen Klassert Signed-off-by: Sowmini Varadhan Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 1 + net/ipv4/xfrm4_mode_transport.c | 4 +--- net/ipv6/xfrm6_input.c | 1 + net/ipv6/xfrm6_mode_transport.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index bcfc00e88756..f8de2482a529 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 3d36644890bb..1ad2c2c4e250 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841f4a07438e..9ef490dddcea 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -59,6 +59,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return -1; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 9ad07a91708e..3c29da5defe6 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -51,7 +51,6 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -60,8 +59,7 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } -- cgit v1.2.3 From 782710e333a526780d65918d669cb96646983ba2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 3 Sep 2018 04:36:53 -0700 Subject: xfrm: reset crypto_done when iterating over multiple input xfrms We only support one offloaded xfrm (we do not have devices that can handle more than one offload), so reset crypto_done in xfrm_input() when iterating over multiple transforms in xfrm_input, so that we can invoke the appropriate x->type->input for the non-offloaded transforms Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Sowmini Varadhan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 352abca2605f..86f5afbd0a0c 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -453,6 +453,7 @@ resume: XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); -- cgit v1.2.3 From 8682250b3c1b75a45feb7452bc413d004cfe3778 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:13 +0300 Subject: mac80211: Always report TX status If a frame is dropped for any reason, mac80211 wouldn't report the TX status back to user space. As the user space may rely on the TX_STATUS to kick its state machines, resends etc, it's better to just report this frame as not acked instead. Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/status.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 9a6d7208bf4f..001a869c059c 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -479,11 +479,6 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (!skb) return; - if (dropped) { - dev_kfree_skb_any(skb); - return; - } - if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; @@ -506,6 +501,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, } rcu_read_unlock(); + dev_kfree_skb_any(skb); + } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ -- cgit v1.2.3 From 94a5b3acd0aef83c0e38b5117eda7b2abf4a05a4 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:14 +0300 Subject: mac80211: Don't wake up from PS for offchannel TX Otherwise the offchannel frame might be queued due to IEEE80211_QUEUE_STOP_REASON_PS and later dropped (in ieee80211_tx_frags()). Anyway, it doesn't make much sense to wake up the device during ROC. Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f353d9db54bc..131542513c8f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -214,6 +214,7 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) { struct ieee80211_local *local = tx->local; struct ieee80211_if_managed *ifmgd; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); /* driver doesn't support power save */ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) @@ -242,6 +243,9 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) return TX_CONTINUE; + if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) + return TX_CONTINUE; + ifmgd = &tx->sdata->u.mgd; /* -- cgit v1.2.3 From 24f33e64fcd0d50a4b1a8e5b41bd0257aa66b0e8 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:12 +0300 Subject: cfg80211: reg: Init wiphy_idx in regulatory_hint_core() Core regulatory hints didn't set wiphy_idx to WIPHY_IDX_INVALID. Since the regulatory request is zeroed, wiphy_idx was always implicitly set to 0. This resulted in updating only phy #0. Fix that. Fixes: 806a9e39670b ("cfg80211: make regulatory_request use wiphy_idx instead of wiphy") Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho [add fixes tag] Signed-off-by: Johannes Berg --- net/wireless/reg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2f702adf2912..765dedb12361 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2867,6 +2867,7 @@ static int regulatory_hint_core(const char *alpha2) request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; + request->wiphy_idx = WIPHY_IDX_INVALID; queue_regulatory_request(request); -- cgit v1.2.3 From 6eae4a6c2be387fec41b0d2782c4fffb57159498 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Wed, 5 Sep 2018 06:22:59 -0400 Subject: mac80211: fix pending queue hang due to TX_DROP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our environment running lots of mesh nodes, we are seeing the pending queue hang periodically, with the debugfs queues file showing lines such as: 00: 0x00000000/348 i.e. there are a large number of frames but no stop reason set. One way this could happen is if queue processing from the pending tasklet exited early without processing all frames, and without having some future event (incoming frame, stop reason flag, ...) to reschedule it. Exactly this can occur today if ieee80211_tx() returns false due to packet drops or power-save buffering in the tx handlers. In the past, this function would return true in such cases, and the change to false doesn't seem to be intentional. Fix this case by reverting to the previous behavior. Fixes: bb42f2d13ffc ("mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue") Signed-off-by: Bob Copeland Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 131542513c8f..25ba24bef8f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1894,7 +1894,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) - return false; + return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; -- cgit v1.2.3 From 119f94a6fefcc76d47075b83d2b73d04c895df78 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 5 Sep 2018 18:52:22 +0300 Subject: cfg80211: Address some corner cases in scan result channel updating cfg80211_get_bss_channel() is used to update the RX channel based on the available frame payload information (channel number from DSSS Parameter Set element or HT Operation element). This is needed on 2.4 GHz channels where frames may be received on neighboring channels due to overlapping frequency range. This might of some use on the 5 GHz band in some corner cases, but things are more complex there since there is no n:1 or 1:n mapping between channel numbers and frequencies due to multiple different starting frequencies in different operating classes. This could result in ieee80211_channel_to_frequency() returning incorrect frequency and ieee80211_get_channel() returning incorrect channel information (or indication of no match). In the previous implementation, this could result in some scan results being dropped completely, e.g., for the 4.9 GHz channels. That prevented connection to such BSSs. Fix this by using the driver-provided channel pointer if ieee80211_get_channel() does not find matching channel data for the channel number in the frame payload and if the scan is done with 5 MHz or 10 MHz channel bandwidth. While doing this, also add comments describing what the function is trying to achieve to make it easier to understand what happens here and why. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/scan.c | 58 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d36c3eb7b931..d0e7472dd9fd 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1058,13 +1058,23 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, return NULL; } +/* + * Update RX channel information based on the available frame payload + * information. This is mainly for the 2.4 GHz band where frames can be received + * from neighboring channels and the Beacon frames use the DSSS Parameter Set + * element to indicate the current (transmitting) channel, but this might also + * be needed on other bands if RX frequency does not match with the actual + * operating channel of a BSS. + */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, - struct ieee80211_channel *channel) + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width) { const u8 *tmp; u32 freq; int channel_number = -1; + struct ieee80211_channel *alt_channel; tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp[1] == 1) { @@ -1078,16 +1088,45 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, } } - if (channel_number < 0) + if (channel_number < 0) { + /* No channel information in frame payload */ return channel; + } freq = ieee80211_channel_to_frequency(channel_number, channel->band); - channel = ieee80211_get_channel(wiphy, freq); - if (!channel) - return NULL; - if (channel->flags & IEEE80211_CHAN_DISABLED) + alt_channel = ieee80211_get_channel(wiphy, freq); + if (!alt_channel) { + if (channel->band == NL80211_BAND_2GHZ) { + /* + * Better not allow unexpected channels when that could + * be going beyond the 1-11 range (e.g., discovering + * BSS on channel 12 when radio is configured for + * channel 11. + */ + return NULL; + } + + /* No match for the payload channel number - ignore it */ + return channel; + } + + if (scan_width == NL80211_BSS_CHAN_WIDTH_10 || + scan_width == NL80211_BSS_CHAN_WIDTH_5) { + /* + * Ignore channel number in 5 and 10 MHz channels where there + * may not be an n:1 or 1:n mapping between frequencies and + * channel numbers. + */ + return channel; + } + + /* + * Use the channel determined through the payload channel number + * instead of the RX channel reported by the driver. + */ + if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; - return channel; + return alt_channel; } /* Returned bss is reference counted and must be cleaned up appropriately. */ @@ -1112,7 +1151,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, (data->signal < 0 || data->signal > 100))) return NULL; - channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan); + channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan, + data->scan_width); if (!channel) return NULL; @@ -1210,7 +1250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, return NULL; channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable, - ielen, data->chan); + ielen, data->chan, data->scan_width); if (!channel) return NULL; -- cgit v1.2.3 From cb59bc14e830028d2244861216df038165d7625d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Sep 2018 13:34:02 +0200 Subject: mac80211: TDLS: fix skb queue/priority assignment If the TDLS setup happens over a connection to an AP that doesn't have QoS, we nevertheless assign a non-zero TID (skb->priority) and queue mapping, which may confuse us or drivers later. Fix it by just assigning the special skb->priority and then using ieee80211_select_queue() just like other data frames would go through. Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 5cd5e6e5834e..6c647f425e05 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -16,6 +16,7 @@ #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" +#include "wme.h" /* give usermode some time for retries in setting up the TDLS session */ #define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) @@ -1010,14 +1011,13 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: case WLAN_TDLS_SETUP_RESPONSE: - skb_set_queue_mapping(skb, IEEE80211_AC_BK); - skb->priority = 2; + skb->priority = 256 + 2; break; default: - skb_set_queue_mapping(skb, IEEE80211_AC_VI); - skb->priority = 5; + skb->priority = 256 + 5; break; } + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb)); /* * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress. -- cgit v1.2.3 From c42055105785580563535e6d3143cad95c7ac7ee Mon Sep 17 00:00:00 2001 From: Yuan-Chi Pang Date: Thu, 6 Sep 2018 16:57:48 +0800 Subject: mac80211: fix TX status reporting for ieee80211s TX status reporting to ieee80211s is through ieee80211s_update_metric. There are two problems about ieee80211s_update_metric: 1. The purpose is to estimate the fail probability to a specific link. No need to restrict to data frame. 2. Current implementation does not work if wireless driver does not pass tx_status with skb. Fix this by removing ieee80211_is_data condition, passing ieee80211_tx_status directly to ieee80211s_update_metric, and putting it in both __ieee80211_tx_status and ieee80211_tx_status_ext. Signed-off-by: Yuan-Chi Pang Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 3 ++- net/mac80211/mesh_hwmp.c | 9 +++------ net/mac80211/status.c | 4 +++- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index ee56f18cad3f..21526630bf65 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -217,7 +217,8 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, + struct ieee80211_tx_status *st); void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index daf9db3c8f24..6950cd0bf594 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -295,15 +295,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, } void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, + struct ieee80211_tx_status *st) { - struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *txinfo = st->info; int failed; - if (!ieee80211_is_data(hdr->frame_control)) - return; - failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); /* moving average, scaled to 100. diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 001a869c059c..91d7c0cd1882 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -808,7 +808,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) - ieee80211s_update_metric(local, sta, skb); + ieee80211s_update_metric(local, sta, status); if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); @@ -969,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, } rate_control_tx_status(local, sband, status); + if (ieee80211_vif_is_mesh(&sta->sdata->vif)) + ieee80211s_update_metric(local, sta, status); } if (acked || noack_success) { -- cgit v1.2.3 From 9e1437937807b0122e8da1ca8765be2adca9aee6 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 11 Sep 2018 10:31:15 +0200 Subject: xfrm: Fix NULL pointer dereference when skb_dst_force clears the dst_entry. Since commit 222d7dbd258d ("net: prevent dst uses after free") skb_dst_force() might clear the dst_entry attached to the skb. The xfrm code don't expect this to happen, so we crash with a NULL pointer dereference in this case. Fix it by checking skb_dst(skb) for NULL after skb_dst_force() and drop the packet in cast the dst_entry was cleared. Fixes: 222d7dbd258d ("net: prevent dst uses after free") Reported-by: Tobias Hommel Reported-by: Kristian Evensen Reported-by: Wolfgang Walter Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 4 ++++ net/xfrm/xfrm_policy.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 89b178a78dc7..36d15a38ce5e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -101,6 +101,10 @@ static int xfrm_output_one(struct sk_buff *skb, int err) spin_unlock_bh(&x->lock); skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + goto error_nolock; + } if (xfrm_offload(skb)) { x->type_offload->encap(x, skb); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7c5e8978aeaa..626e0f4d1749 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2548,6 +2548,10 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); + return 0; + } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { -- cgit v1.2.3 From 32bf94fb5c2ec4ec842152d0e5937cd4bb6738fa Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Wed, 19 Sep 2018 13:54:56 -0600 Subject: xfrm: validate template mode XFRM mode parameters passed as part of the user templates in the IP_XFRM_POLICY are never properly validated. Passing values other than valid XFRM modes can cause stack-out-of-bounds reads to occur later in the XFRM processing: [ 140.535608] ================================================================ [ 140.543058] BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x17e4/0x1cc4 [ 140.550306] Read of size 4 at addr ffffffc0238a7a58 by task repro/5148 [ 140.557369] [ 140.558927] Call trace: [ 140.558936] dump_backtrace+0x0/0x388 [ 140.558940] show_stack+0x24/0x30 [ 140.558946] __dump_stack+0x24/0x2c [ 140.558949] dump_stack+0x8c/0xd0 [ 140.558956] print_address_description+0x74/0x234 [ 140.558960] kasan_report+0x240/0x264 [ 140.558963] __asan_report_load4_noabort+0x2c/0x38 [ 140.558967] xfrm_state_find+0x17e4/0x1cc4 [ 140.558971] xfrm_resolve_and_create_bundle+0x40c/0x1fb8 [ 140.558975] xfrm_lookup+0x238/0x1444 [ 140.558977] xfrm_lookup_route+0x48/0x11c [ 140.558984] ip_route_output_flow+0x88/0xc4 [ 140.558991] raw_sendmsg+0xa74/0x266c [ 140.558996] inet_sendmsg+0x258/0x3b0 [ 140.559002] sock_sendmsg+0xbc/0xec [ 140.559005] SyS_sendto+0x3a8/0x5a8 [ 140.559008] el0_svc_naked+0x34/0x38 [ 140.559009] [ 140.592245] page dumped because: kasan: bad access detected [ 140.597981] page_owner info is not active (free page?) [ 140.603267] [ 140.653503] ================================================================ Signed-off-by: Sean Tranchetti Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5151b3ebf068..d0672c400c2f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1455,6 +1455,9 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) (ut[i].family != prev_family)) return -EINVAL; + if (ut[i].mode >= XFRM_MODE_MAX) + return -EINVAL; + prev_family = ut[i].family; switch (ut[i].family) { -- cgit v1.2.3 From a173f066c7cfc031acb8f541708041e009fc9812 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 17 Sep 2018 08:20:36 -0700 Subject: netfilter: bridge: Don't sabotage nf_hook calls from an l3mdev For starters, the bridge netfilter code registers operations that are invoked any time nh_hook is called. Specifically, ip_sabotage_in watches for nested calls for NF_INET_PRE_ROUTING when a bridge is in the stack. Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish, then resets in_prerouting flag to 0 and the packet continues up the stack. The packet eventually makes it to the VRF driver and it invokes nf_hook for NF_INET_PRE_ROUTING in case any rules have been added against the vrf device. Because of the registered operations the call to nf_hook causes ip_sabotage_in to be invoked. That function sees the nf_bridge on the skb and that in_prerouting is not set. Thinking it is an invalid nested call it steals (drops) the packet. Update ip_sabotage_in to recognize that the bridge or one of its upper devices (e.g., vlan) can be enslaved to a VRF (L3 master device) and allow the packet to go through the nf_hook a second time. Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device") Reported-by: D'Souza, Nelson Signed-off-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..37278dc280eb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; } -- cgit v1.2.3 From bab4344975fe2c719eda32de59298d6de26fe126 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 17 Sep 2018 22:21:36 -0700 Subject: netfilter: nft_osf: use enum nft_data_types for nft_validate_register_store The function nft_validate_register_store requires a struct of type struct nft_data_types. NFTA_DATA_VALUE is of type enum nft_verdict_attributes. Pass the correct enum type. This fixes a warning seen with Clang: net/netfilter/nft_osf.c:52:8: warning: implicit conversion from enumeration type 'enum nft_data_attributes' to different enumeration type 'enum nft_data_types' [-Wenum-conversion] NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); ^~~~~~~~~~~~~~~ Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Link: https://github.com/ClangBuiltLinux/linux/issues/103 Signed-off-by: Stefan Agner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 5af74b37f423..a35fb59ace73 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -49,7 +49,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); + NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); if (err < 0) return err; -- cgit v1.2.3 From 346fa83d10934cf206e2fd0f514bf8ce186f08fe Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 19 Sep 2018 20:21:11 +0800 Subject: netfilter: conntrack: get rid of double sizeof sizeof(sizeof()) is quite strange and does not seem to be what is wanted here. The issue is detected with the help of Coccinelle. Fixes: 39215846740a ("netfilter: conntrack: remove nlattr_size pointer from l4proto trackers") Signed-off-by: zhong jiang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b4bdf9eda7b7..247b89784a6f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1213,8 +1213,8 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags)))) + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { -- cgit v1.2.3 From 92ef12b32feab8f277b69e9fb89ede2796777f4d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 18:21:58 +0200 Subject: tipc: fix flow control accounting for implicit connect In the case of implicit connect message with data > 1K, the flow control accounting is incorrect. At this state, the socket does not know the peer nodes capability and falls back to legacy flow control by return 1, however the receiver of this message will perform the new block accounting. This leads to a slack and eventually traffic disturbance. In this commit, we perform tipc_node_get_capabilities() at implicit connect and perform accounting based on the peer's capability. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3f03ddd0e35b..b6f99b021d09 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1419,8 +1419,10 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) /* Handle implicit connection setup */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dlen); - if (dlen && (dlen == rc)) + if (dlen && dlen == rc) { + tsk->peer_caps = tipc_node_get_capabilities(net, dnode); tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); + } return rc; } -- cgit v1.2.3 From 94b6ddce71780575fbbf9d2c36afc8440e61a281 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 21:56:57 +0200 Subject: tipc: reset bearer if device carrier not ok If we detect that under lying carrier detects errors and goes down, we reset the bearer. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 418f03d0be90..645c16052052 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -609,16 +609,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, switch (evt) { case NETDEV_CHANGE: - if (netif_carrier_ok(dev)) + if (netif_carrier_ok(dev) && netif_oper_up(dev)) { + test_and_set_bit_lock(0, &b->up); break; - /* else: fall through */ - case NETDEV_UP: - test_and_set_bit_lock(0, &b->up); - break; + } + /* fall through */ case NETDEV_GOING_DOWN: clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; + case NETDEV_UP: + test_and_set_bit_lock(0, &b->up); + break; case NETDEV_CHANGEMTU: if (tipc_mtu_bad(dev, 0)) { bearer_disable(net, b); -- cgit v1.2.3 From 3f32d0be6c16b902b687453c962d17eea5b8ea19 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 22:09:10 +0200 Subject: tipc: lock wakeup & inputq at tipc_link_reset() In tipc_link_reset() we copy the wakeup queue to input queue using skb_queue_splice_init(link->wakeupq, link->inputq). This is performed without holding any locks. The lists might be simultaneously be accessed by other cpu threads in tipc_sk_rcv(), something leading to to random missing packets. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index b1f0bee54eac..26cc033ee167 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -841,9 +841,14 @@ void tipc_link_reset(struct tipc_link *l) l->in_session = false; l->session++; l->mtu = l->advertised_mtu; + spin_lock_bh(&l->wakeupq.lock); + spin_lock_bh(&l->inputq->lock); + skb_queue_splice_init(&l->wakeupq, l->inputq); + spin_unlock_bh(&l->inputq->lock); + spin_unlock_bh(&l->wakeupq.lock); + __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); - skb_queue_splice_init(&l->wakeupq, l->inputq); __skb_queue_purge(&l->backlogq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; -- cgit v1.2.3 From 8105f9b8a8879bff7f1d43d0720c993a99c9d135 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 22 Sep 2018 18:35:31 +0200 Subject: mac80211: allocate TXQs for active monitor interfaces Monitor mode interfaces with the active flag are passed down to the driver. Drivers using TXQ expect that all interfaces have allocated TXQs before they get added. Fixes: 79af1f866193d ("mac80211: avoid allocating TXQs that won't be used") Cc: stable@vger.kernel.org Reported-by: Catrinel Catrinescu Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5e6cf2cee965..5836ddeac9e3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1756,7 +1756,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (local->ops->wake_tx_queue && type != NL80211_IFTYPE_AP_VLAN && - type != NL80211_IFTYPE_MONITOR) + (type != NL80211_IFTYPE_MONITOR || + (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; -- cgit v1.2.3 From 30fe6d50eb088783c8729c7d930f65296b2b3fa7 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 25 Sep 2018 11:15:00 +0900 Subject: nl80211: Fix possible Spectre-v1 for NL80211_TXRATE_HT Use array_index_nospec() to sanitize ridx with respect to speculation. Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4b8ec659e797..bd26230de63e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3756,6 +3756,7 @@ static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, return false; /* check availability */ + ridx = array_index_nospec(ridx, IEEE80211_HT_MCS_MASK_LEN); if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) mcs[ridx] |= rbit; else -- cgit v1.2.3 From cb28c306b93b71f2741ce1a5a66289db26715f4d Mon Sep 17 00:00:00 2001 From: Matias Karhumaa Date: Wed, 26 Sep 2018 09:13:46 +0300 Subject: Bluetooth: SMP: fix crash in unpairing In case unpair_device() was called through mgmt interface at the same time when pairing was in progress, Bluetooth kernel module crash was seen. [ 600.351225] general protection fault: 0000 [#1] SMP PTI [ 600.351235] CPU: 1 PID: 11096 Comm: btmgmt Tainted: G OE 4.19.0-rc1+ #1 [ 600.351238] Hardware name: Dell Inc. Latitude E5440/08RCYC, BIOS A18 05/14/2017 [ 600.351272] RIP: 0010:smp_chan_destroy.isra.10+0xce/0x2c0 [bluetooth] [ 600.351276] Code: c0 0f 84 b4 01 00 00 80 78 28 04 0f 84 53 01 00 00 4d 85 ed 0f 85 ab 00 00 00 48 8b 08 48 8b 50 08 be 10 00 00 00 48 89 51 08 <48> 89 0a 48 b9 00 02 00 00 00 00 ad de 48 89 48 08 48 8b 83 00 01 [ 600.351279] RSP: 0018:ffffa9be839b3b50 EFLAGS: 00010246 [ 600.351282] RAX: ffff9c999ac565a0 RBX: ffff9c9996e98c00 RCX: ffff9c999aa28b60 [ 600.351285] RDX: dead000000000200 RSI: 0000000000000010 RDI: ffff9c999e403500 [ 600.351287] RBP: ffffa9be839b3b70 R08: 0000000000000000 R09: ffffffff92a25c00 [ 600.351290] R10: ffffa9be839b3ae8 R11: 0000000000000001 R12: ffff9c995375b800 [ 600.351292] R13: 0000000000000000 R14: ffff9c99619a5000 R15: ffff9c9962a01c00 [ 600.351295] FS: 00007fb2be27c700(0000) GS:ffff9c999e880000(0000) knlGS:0000000000000000 [ 600.351298] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.351300] CR2: 00007fb2bdadbad0 CR3: 000000041c328001 CR4: 00000000001606e0 [ 600.351302] Call Trace: [ 600.351325] smp_failure+0x4f/0x70 [bluetooth] [ 600.351345] smp_cancel_pairing+0x74/0x80 [bluetooth] [ 600.351370] unpair_device+0x1c1/0x330 [bluetooth] [ 600.351399] hci_sock_sendmsg+0x960/0x9f0 [bluetooth] [ 600.351409] ? apparmor_socket_sendmsg+0x1e/0x20 [ 600.351417] sock_sendmsg+0x3e/0x50 [ 600.351422] sock_write_iter+0x85/0xf0 [ 600.351429] do_iter_readv_writev+0x12b/0x1b0 [ 600.351434] do_iter_write+0x87/0x1a0 [ 600.351439] vfs_writev+0x98/0x110 [ 600.351443] ? ep_poll+0x16d/0x3d0 [ 600.351447] ? ep_modify+0x73/0x170 [ 600.351451] do_writev+0x61/0xf0 [ 600.351455] ? do_writev+0x61/0xf0 [ 600.351460] __x64_sys_writev+0x1c/0x20 [ 600.351465] do_syscall_64+0x5a/0x110 [ 600.351471] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 600.351474] RIP: 0033:0x7fb2bdb62fe0 [ 600.351477] Code: 73 01 c3 48 8b 0d b8 6e 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 69 c7 2c 00 00 75 10 b8 14 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 de 80 01 00 48 89 04 24 [ 600.351479] RSP: 002b:00007ffe062cb8f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 [ 600.351484] RAX: ffffffffffffffda RBX: 000000000255b3d0 RCX: 00007fb2bdb62fe0 [ 600.351487] RDX: 0000000000000001 RSI: 00007ffe062cb920 RDI: 0000000000000004 [ 600.351490] RBP: 00007ffe062cb920 R08: 000000000255bd80 R09: 0000000000000000 [ 600.351494] R10: 0000000000000353 R11: 0000000000000246 R12: 0000000000000001 [ 600.351497] R13: 00007ffe062cbbe0 R14: 0000000000000000 R15: 0000000000000000 [ 600.351501] Modules linked in: algif_hash algif_skcipher af_alg cmac ipt_MASQUERADE nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_nat nf_nat_ipv4 xt_addrtype iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c br_netfilter bridge stp llc overlay arc4 nls_iso8859_1 dm_crypt intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp dell_laptop kvm_intel crct10dif_pclmul dell_smm_hwmon crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper intel_cstate intel_rapl_perf uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_common videodev media hid_multitouch input_leds joydev serio_raw dell_wmi snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic dell_smbios dcdbas sparse_keymap [ 600.351569] snd_hda_intel btusb snd_hda_codec btrtl btbcm btintel snd_hda_core bluetooth(OE) snd_hwdep snd_pcm iwlmvm ecdh_generic wmi_bmof dell_wmi_descriptor snd_seq_midi mac80211 snd_seq_midi_event lpc_ich iwlwifi snd_rawmidi snd_seq snd_seq_device snd_timer cfg80211 snd soundcore mei_me mei dell_rbtn dell_smo8800 mac_hid parport_pc ppdev lp parport autofs4 hid_generic usbhid hid i915 nouveau kvmgt vfio_mdev mdev vfio_iommu_type1 vfio kvm irqbypass i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect sysimgblt mxm_wmi psmouse ahci sdhci_pci cqhci libahci fb_sys_fops sdhci drm e1000e video wmi [ 600.351637] ---[ end trace e49e9f1df09c94fb ]--- [ 600.351664] RIP: 0010:smp_chan_destroy.isra.10+0xce/0x2c0 [bluetooth] [ 600.351666] Code: c0 0f 84 b4 01 00 00 80 78 28 04 0f 84 53 01 00 00 4d 85 ed 0f 85 ab 00 00 00 48 8b 08 48 8b 50 08 be 10 00 00 00 48 89 51 08 <48> 89 0a 48 b9 00 02 00 00 00 00 ad de 48 89 48 08 48 8b 83 00 01 [ 600.351669] RSP: 0018:ffffa9be839b3b50 EFLAGS: 00010246 [ 600.351672] RAX: ffff9c999ac565a0 RBX: ffff9c9996e98c00 RCX: ffff9c999aa28b60 [ 600.351674] RDX: dead000000000200 RSI: 0000000000000010 RDI: ffff9c999e403500 [ 600.351676] RBP: ffffa9be839b3b70 R08: 0000000000000000 R09: ffffffff92a25c00 [ 600.351679] R10: ffffa9be839b3ae8 R11: 0000000000000001 R12: ffff9c995375b800 [ 600.351681] R13: 0000000000000000 R14: ffff9c99619a5000 R15: ffff9c9962a01c00 [ 600.351684] FS: 00007fb2be27c700(0000) GS:ffff9c999e880000(0000) knlGS:0000000000000000 [ 600.351686] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.351689] CR2: 00007fb2bdadbad0 CR3: 000000041c328001 CR4: 00000000001606e0 Crash happened because list_del_rcu() was called twice for smp->ltk. This was possible if unpair_device was called right after ltk was generated but before keys were distributed. In this commit smp_cancel_pairing was refactored to cancel pairing if it is in progress and otherwise just removes keys. Once keys are removed from rcu list, pointers to smp context's keys are set to NULL to make sure removed list items are not accessed later. This commit also adjusts the functionality of mgmt unpair_device() little bit. Previously pairing was canceled only if pairing was in state that keys were already generated. With this commit unpair_device() cancels pairing already in earlier states. Bug was found by fuzzing kernel SMP implementation using Synopsys Defensics. Reported-by: Pekka Oikarainen Signed-off-by: Matias Karhumaa Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 7 ++----- net/bluetooth/smp.c | 29 +++++++++++++++++++++++++---- net/bluetooth/smp.h | 3 ++- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3bdc8f3ca259..ccce954f8146 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2434,9 +2434,8 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, /* LE address type */ addr_type = le_addr_type(cp->addr.type); - hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); - - err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); + /* Abort any ongoing SMP pairing. Removes ltk and irk if they exist. */ + err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, MGMT_STATUS_NOT_PAIRED, &rp, @@ -2450,8 +2449,6 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, goto done; } - /* Abort any ongoing SMP pairing */ - smp_cancel_pairing(conn); /* Defer clearing up the connection parameters until closing to * give a chance of keeping them if a repairing happens. diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3a7b0773536b..73f7211d0431 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2422,30 +2422,51 @@ unlock: return ret; } -void smp_cancel_pairing(struct hci_conn *hcon) +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct hci_conn *hcon; + struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; + int err; + + err = hci_remove_ltk(hdev, bdaddr, addr_type); + hci_remove_irk(hdev, bdaddr, addr_type); + + hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); + if (!hcon) + goto done; + conn = hcon->l2cap_data; if (!conn) - return; + goto done; chan = conn->smp; if (!chan) - return; + goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { + /* Set keys to NULL to make sure smp_failure() does not try to + * remove and free already invalidated rcu list entries. */ + smp->ltk = NULL; + smp->slave_ltk = NULL; + smp->remote_irk = NULL; + if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); + err = 0; } l2cap_chan_unlock(chan); + +done: + return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 0ff6247eaa6c..121edadd5f8d 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -181,7 +181,8 @@ enum smp_key_pref { }; /* SMP Commands */ -void smp_cancel_pairing(struct hci_conn *hcon); +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type); bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -- cgit v1.2.3 From 36f19d5b4f99fa9fa8263877e5f8e669d7fddc14 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 26 Sep 2018 17:35:14 -0700 Subject: net/ipv6: Remove extra call to ip6_convert_metrics for multipath case The change to move metrics from the dst to rt6_info moved the call to ip6_convert_metrics from ip6_route_add to ip6_route_info_create. In doing so it makes the call in ip6_route_info_append redundant and actually leaks the metrics installed as part of the ip6_route_info_create. Remove the now unnecessary call. Fixes: d4ead6b34b67f ("net/ipv6: move metrics from dst to rt6_info") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 826b14de7dbb..a366c05a239d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4321,11 +4321,6 @@ static int ip6_route_info_append(struct net *net, if (!nh) return -ENOMEM; nh->fib6_info = rt; - err = ip6_convert_metrics(net, rt, r_cfg); - if (err) { - kfree(nh); - return err; - } memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->next, rt6_nh_list); -- cgit v1.2.3 From 6194114324139dc16f3251c67ed853bd6d4ae056 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 24 Sep 2018 21:58:59 +0200 Subject: net: core: add member wol_enabled to struct net_device Add flag wol_enabled to struct net_device indicating whether Wake-on-LAN is enabled. As first user phy_suspend() will use it to decide whether PHY can be suspended or not. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Fixes: e8cfd9d6c772 ("net: phy: call state machine synchronously in phy_stop") Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/ethtool.c | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca5ab98053c8..c7861e4b402c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1730,6 +1730,8 @@ enum netdev_priv_flags { * switch driver and used to set the phys state of the * switch port. * + * @wol_enabled: Wake-on-LAN is enabled + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2014,6 +2016,7 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; + unsigned wol_enabled:1; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 234a0ec2e932..0762aaf8e964 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1483,6 +1483,7 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; + int ret; if (!dev->ethtool_ops->set_wol) return -EOPNOTSUPP; @@ -1490,7 +1491,13 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; - return dev->ethtool_ops->set_wol(dev, &wol); + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + return ret; + + dev->wol_enabled = !!wol.wolopts; + + return 0; } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) -- cgit v1.2.3 From d4ce58082f206bf6e7d697380c7bc5480a8b0264 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 25 Sep 2018 21:59:28 -0700 Subject: net-tcp: /proc/sys/net/ipv4/tcp_probe_interval is a u32 not int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (fix documentation and sysctl access to treat it as such) Tested: # zcat /proc/config.gz | egrep ^CONFIG_HZ CONFIG_HZ_1000=y CONFIG_HZ=1000 # echo $[(1<<32)/1000 + 1] | tee /proc/sys/net/ipv4/tcp_probe_interval 4294968 tee: /proc/sys/net/ipv4/tcp_probe_interval: Invalid argument # echo $[(1<<32)/1000] | tee /proc/sys/net/ipv4/tcp_probe_interval 4294967 # echo 0 | tee /proc/sys/net/ipv4/tcp_probe_interval # echo -1 | tee /proc/sys/net/ipv4/tcp_probe_interval -1 tee: /proc/sys/net/ipv4/tcp_probe_interval: Invalid argument Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 2 +- net/ipv4/sysctl_net_ipv4.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8313a636dd53..960de8fe3f40 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -425,7 +425,7 @@ tcp_mtu_probing - INTEGER 1 - Disabled by default, enabled when an ICMP black hole detected 2 - Always enabled, use initial MSS of tcp_base_mss. -tcp_probe_interval - INTEGER +tcp_probe_interval - UNSIGNED INTEGER Controls how often to start TCP Packetization-Layer Path MTU Discovery reprobe. The default is reprobing every 10 minutes as per RFC4821. diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..891ed2f91467 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; +static u32 u32_max_div_HZ = UINT_MAX / HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -745,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, - .maxlen = sizeof(int), + .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", -- cgit v1.2.3 From 1222a16014888ed9733c11e221730d4a8196222b Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 25 Sep 2018 11:15:01 +0900 Subject: nl80211: Fix possible Spectre-v1 for CQM RSSI thresholds Use array_index_nospec() to sanitize i with respect to speculation. Note that the user doesn't control i directly, but can make it out of bounds by not finding a threshold in the array. Signed-off-by: Masashi Honma [add note about user control, as explained by Masashi] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bd26230de63e..176edfefcbaa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10231,7 +10231,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = dev->ieee80211_ptr; s32 last, low, high; u32 hyst; - int i, n; + int i, n, low_index; int err; /* RSSI reporting disabled? */ @@ -10268,10 +10268,19 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (last < wdev->cqm_config->rssi_thresholds[i]) break; - low = i > 0 ? - (wdev->cqm_config->rssi_thresholds[i - 1] - hyst) : S32_MIN; - high = i < n ? - (wdev->cqm_config->rssi_thresholds[i] + hyst - 1) : S32_MAX; + low_index = i - 1; + if (low_index >= 0) { + low_index = array_index_nospec(low_index, n); + low = wdev->cqm_config->rssi_thresholds[low_index] - hyst; + } else { + low = S32_MIN; + } + if (i < n) { + i = array_index_nospec(i, n); + high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1; + } else { + high = S32_MAX; + } return rdev_set_cqm_rssi_range_config(rdev, dev, low, high); } -- cgit v1.2.3 From 092ffc51fb3f9b8369e737c9320bf0bffb2c898f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:07 +0100 Subject: rxrpc: Remove dup code from rxrpc_find_connection_rcu() rxrpc_find_connection_rcu() initialises variable k twice with the same information. Remove one of the initialisations. Signed-off-by: David Howells --- net/rxrpc/conn_object.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 77440a356b14..1746b48cb165 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -85,9 +85,6 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) goto not_found; - k.epoch = sp->hdr.epoch; - k.cid = sp->hdr.cid & RXRPC_CIDMASK; - /* We may have to handle mixing IPv4 and IPv6 */ if (srx.transport.family != local->srx.transport.family) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", -- cgit v1.2.3 From dc71db34e4f3c06b8277c8f3c2ff014610607a8c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Fix checks as to whether we should set up a new call There's a check in rxrpc_data_ready() that's checking the CLIENT_INITIATED flag in the packet type field rather than in the packet flags field. Fix this by creating a pair of helper functions to check whether the packet is going to the client or to the server and use them generally. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 10 ++++++++++ net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 12 ++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c97558710421..9fcb3e197b14 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -463,6 +463,16 @@ struct rxrpc_connection { u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; +static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) +{ + return sp->hdr.flags & RXRPC_CLIENT_INITIATED; +} + +static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) +{ + return !rxrpc_to_server(sp); +} + /* * Flags in call->flags. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1746b48cb165..390ba50cfab4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -96,7 +96,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { + if (rxrpc_to_server(sp)) { /* We need to look up service connections by the full protocol * parameter set. We look up the peer first as an intermediate * step and then the connection from the peer's tree. diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cfdc199c6351..ec299c627f77 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1177,10 +1177,6 @@ void rxrpc_data_ready(struct sock *udp_sk) trace_rxrpc_rx_packet(sp); - _net("Rx RxRPC %s ep=%x call=%x:%x", - sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", - sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { _proto("Rx Bad Packet Type %u", sp->hdr.type); @@ -1189,13 +1185,13 @@ void rxrpc_data_ready(struct sock *udp_sk) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) + if (rxrpc_to_client(sp)) goto discard; rxrpc_post_packet_to_local(local, skb); goto out; case RXRPC_PACKET_TYPE_BUSY: - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + if (rxrpc_to_server(sp)) goto discard; /* Fall through */ @@ -1280,7 +1276,7 @@ void rxrpc_data_ready(struct sock *udp_sk) call = rcu_dereference(chan->call); if (sp->hdr.callNumber > chan->call_id) { - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) { + if (rxrpc_to_client(sp)) { rcu_read_unlock(); goto reject_packet; } @@ -1303,7 +1299,7 @@ void rxrpc_data_ready(struct sock *udp_sk) } if (!call || atomic_read(&call->usage) == 0) { - if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + if (rxrpc_to_client(sp) || sp->hdr.callNumber == 0 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message_unlock; -- cgit v1.2.3 From b604dd9883f783a94020d772e4fe03160f455372 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Fix RTT gathering Fix RTT information gathering in AF_RXRPC by the following means: (1) Enable Rx timestamping on the transport socket with SO_TIMESTAMPNS. (2) If the sk_buff doesn't have a timestamp set when rxrpc_data_ready() collects it, set it at that point. (3) Allow ACKs to be requested on the last packet of a client call, but not a service call. We need to be careful lest we undo: bf7d620abf22c321208a4da4f435e7af52551a21 Author: David Howells Date: Thu Oct 6 08:11:51 2016 +0100 rxrpc: Don't request an ACK on the last DATA packet of a call's Tx phase but that only really applies to service calls that we're handling, since the client side gets to send the final ACK (or not). (4) When about to transmit an ACK or DATA packet, record the Tx timestamp before only; don't update the timestamp afterwards. (5) Switch the ordering between recording the serial and recording the timestamp to always set the serial number first. The serial number shouldn't be seen referenced by an ACK packet until we've transmitted the packet bearing it - so in the Rx path, we don't need the timestamp until we've checked the serial number. Fixes: cf1a6474f807 ("rxrpc: Add per-peer RTT tracker") Signed-off-by: David Howells --- net/rxrpc/input.c | 8 ++++++-- net/rxrpc/local_object.c | 9 +++++++++ net/rxrpc/output.c | 31 ++++++++++++++++++------------- 3 files changed, 33 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ec299c627f77..7f9ed3a60b9a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -622,13 +622,14 @@ static void rxrpc_input_requested_ack(struct rxrpc_call *call, if (!skb) continue; + sent_at = skb->tstamp; + smp_rmb(); /* Read timestamp before serial. */ sp = rxrpc_skb(skb); if (sp->hdr.serial != orig_serial) continue; - smp_rmb(); - sent_at = skb->tstamp; goto found; } + return; found: @@ -1143,6 +1144,9 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 777c3ed4cfc0..81de7d889ffa 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -173,6 +173,15 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) _debug("setsockopt failed"); goto error; } + + /* We want receive timestamps. */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS, + (char *)&opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } break; default: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ccf5de160444..8a4da3fe96df 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -124,7 +124,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; - ktime_t now; size_t len, n; int ret; u8 reason; @@ -196,9 +195,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, /* We need to stick a time in before we send the packet in case * the reply gets back before kernel_sendmsg() completes - but * asking UDP to send the packet can take a relatively long - * time, so we update the time after, on the assumption that - * the packet transmission is more likely to happen towards the - * end of the kernel_sendmsg() call. + * time. */ call->ping_time = ktime_get_real(); set_bit(RXRPC_CALL_PINGING, &call->flags); @@ -206,9 +203,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - now = ktime_get_real(); - if (ping) - call->ping_time = now; conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -363,8 +357,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. + * + * However, we mustn't request an ACK on the last reply packet of a + * service call, lest OpenAFS incorrectly send us an ACK with some + * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && + if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || + rxrpc_to_server(sp) + ) && (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || @@ -390,6 +390,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, goto send_fragmentable; down_read(&conn->params.local->defrag_sem); + + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -413,12 +418,8 @@ done: trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, lost); if (ret >= 0) { - ktime_t now = ktime_get_real(); - skb->tstamp = now; - smp_wmb(); - sp->hdr.serial = serial; if (whdr.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = now; + call->peer->rtt_last_req = skb->tstamp; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); if (call->peer->rtt_usage > 1) { unsigned long nowj = jiffies, ack_lost_at; @@ -457,6 +458,10 @@ send_fragmentable: down_write(&conn->params.local->defrag_sem); + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + switch (conn->params.local->srx.transport.family) { case AF_INET: opt = IP_PMTUDISC_DONT; -- cgit v1.2.3 From ece64fec164f523bfbe874abdef2a0e6ff376251 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Emit BUSY packets when supposed to rather than ABORTs In the input path, a received sk_buff can be marked for rejection by setting RXRPC_SKB_MARK_* in skb->mark and, if needed, some auxiliary data (such as an abort code) in skb->priority. The rejection is handled by queueing the sk_buff up for dealing with in process context. The output code reads the mark and priority and, theoretically, generates an appropriate response packet. However, if RXRPC_SKB_MARK_BUSY is set, this isn't noticed and an ABORT message with a random abort code is generated (since skb->priority wasn't set to anything). Fix this by outputting the appropriate sort of packet. Also, whilst we're at it, most of the marks are no longer used, so remove them and rename the remaining two to something more obvious. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 13 ++++--------- net/rxrpc/call_accept.c | 6 +++--- net/rxrpc/input.c | 2 +- net/rxrpc/output.c | 23 ++++++++++++++++++----- 4 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9fcb3e197b14..e8861cb78070 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -40,17 +40,12 @@ struct rxrpc_crypt { struct rxrpc_connection; /* - * Mark applied to socket buffers. + * Mark applied to socket buffers in skb->mark. skb->priority is used + * to pass supplementary information. */ enum rxrpc_skb_mark { - RXRPC_SKB_MARK_DATA, /* data message */ - RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ - RXRPC_SKB_MARK_BUSY, /* server busy message */ - RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ - RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ - RXRPC_SKB_MARK_NET_ERROR, /* network error message */ - RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ - RXRPC_SKB_MARK_NEW_CALL, /* local error message */ + RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ + RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9d1e298b784c..e88f131c1d7f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -353,7 +353,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [service]"); return NULL; @@ -364,7 +364,7 @@ found_service: rx->sk.sk_state == RXRPC_CLOSE) { trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [close]"); call = NULL; @@ -373,7 +373,7 @@ found_service: call = rxrpc_alloc_incoming_call(rx, local, conn, skb); if (!call) { - skb->mark = RXRPC_SKB_MARK_BUSY; + skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); call = NULL; goto out; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7f9ed3a60b9a..b0f12471f5e7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1354,7 +1354,7 @@ bad_message: protocol_error: skb->priority = RX_PROTOCOL_ERROR; post_abort: - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8a4da3fe96df..e8fb8922bca8 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -524,7 +524,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) struct kvec iov[2]; size_t size; __be32 code; - int ret; + int ret, ioc; _enter("%d", local->debug_id); @@ -532,7 +532,6 @@ void rxrpc_reject_packets(struct rxrpc_local *local) iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); msg.msg_name = &srx.transport; msg.msg_control = NULL; @@ -540,17 +539,31 @@ void rxrpc_reject_packets(struct rxrpc_local *local) msg.msg_flags = 0; memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); + switch (skb->mark) { + case RXRPC_SKB_MARK_REJECT_BUSY: + whdr.type = RXRPC_PACKET_TYPE_BUSY; + size = sizeof(whdr); + ioc = 1; + break; + case RXRPC_SKB_MARK_REJECT_ABORT: + whdr.type = RXRPC_PACKET_TYPE_ABORT; + code = htonl(skb->priority); + size = sizeof(whdr) + sizeof(code); + ioc = 2; + break; + default: + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + continue; + } + if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { msg.msg_namelen = srx.transport_len; - code = htonl(skb->priority); - whdr.epoch = htonl(sp->hdr.epoch); whdr.cid = htonl(sp->hdr.cid); whdr.callNumber = htonl(sp->hdr.callNumber); -- cgit v1.2.3 From 403fc2a138457f1071b186786a7589ef7382c8bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: rxrpc: Improve up-front incoming packet checking Do more up-front checking on incoming packets to weed out invalid ones and also ones aimed at services that we don't support. Whilst we're at it, replace the clearing of call and skew if we don't find a connection with just initialising the variables to zero at the top of the function. Signed-off-by: David Howells --- net/rxrpc/input.c | 63 +++++++++++++++++++++++++++++++++++++++++----------- net/rxrpc/protocol.h | 15 ------------- 2 files changed, 50 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b0f12471f5e7..a569e9e010d1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1125,12 +1125,13 @@ void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; struct rxrpc_channel *chan; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; + struct rxrpc_sock *rx; struct sk_buff *skb; unsigned int channel; - int ret, skew; + int ret, skew = 0; _enter("%p", udp_sk); @@ -1181,12 +1182,6 @@ void rxrpc_data_ready(struct sock *udp_sk) trace_rxrpc_rx_packet(sp); - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || - !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { - _proto("Rx Bad Packet Type %u", sp->hdr.type); - goto bad_message; - } - switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) @@ -1198,24 +1193,63 @@ void rxrpc_data_ready(struct sock *udp_sk) if (rxrpc_to_server(sp)) goto discard; /* Fall through */ + case RXRPC_PACKET_TYPE_ACK: + case RXRPC_PACKET_TYPE_ACKALL: + if (sp->hdr.callNumber == 0) + goto bad_message; + /* Fall through */ + case RXRPC_PACKET_TYPE_ABORT: + break; case RXRPC_PACKET_TYPE_DATA: - if (sp->hdr.callNumber == 0) + if (sp->hdr.callNumber == 0 || + sp->hdr.seq == 0) goto bad_message; if (sp->hdr.flags & RXRPC_JUMBO_PACKET && !rxrpc_validate_jumbo(skb)) goto bad_message; break; + case RXRPC_PACKET_TYPE_CHALLENGE: + if (rxrpc_to_server(sp)) + goto discard; + break; + case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_to_client(sp)) + goto discard; + break; + /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: goto discard; + + default: + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; } + if (sp->hdr.serviceId == 0) + goto bad_message; + rcu_read_lock(); + if (rxrpc_to_server(sp)) { + /* Weed out packets to services we're not offering. Packets + * that would begin a call are explicitly rejected and the rest + * are just discarded. + */ + rx = rcu_dereference(local->service); + if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && + sp->hdr.serviceId != rx->second_service)) { + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.seq == 1) + goto unsupported_service; + goto discard_unlock; + } + } + conn = rxrpc_find_connection_rcu(local, skb); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) @@ -1297,14 +1331,10 @@ void rxrpc_data_ready(struct sock *udp_sk) if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) set_bit(RXRPC_CALL_RX_HEARD, &call->flags); } - } else { - skew = 0; - call = NULL; } if (!call || atomic_read(&call->usage) == 0) { if (rxrpc_to_client(sp) || - sp->hdr.callNumber == 0 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message_unlock; if (sp->hdr.seq != 1) @@ -1340,6 +1370,13 @@ wrong_security: skb->priority = RXKADINCONSISTENCY; goto post_abort; +unsupported_service: + rcu_read_unlock(); + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->priority = RX_INVALID_OPERATION; + goto post_abort; + reupgrade: rcu_read_unlock(); trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 93da73bf7098..f9cb83c938f3 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -50,7 +50,6 @@ struct rxrpc_wire_header { #define RXRPC_PACKET_TYPE_10 10 /* Ignored */ #define RXRPC_PACKET_TYPE_11 11 /* Ignored */ #define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ -#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ uint8_t flags; /* packet flags */ #define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ @@ -72,20 +71,6 @@ struct rxrpc_wire_header { } __packed; -#define RXRPC_SUPPORTED_PACKET_TYPES ( \ - (1 << RXRPC_PACKET_TYPE_DATA) | \ - (1 << RXRPC_PACKET_TYPE_ACK) | \ - (1 << RXRPC_PACKET_TYPE_BUSY) | \ - (1 << RXRPC_PACKET_TYPE_ABORT) | \ - (1 << RXRPC_PACKET_TYPE_ACKALL) | \ - (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ - (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ - /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ - (1 << RXRPC_PACKET_TYPE_PARAMS) | \ - (1 << RXRPC_PACKET_TYPE_10) | \ - (1 << RXRPC_PACKET_TYPE_11) | \ - (1 << RXRPC_PACKET_TYPE_VERSION)) - /*****************************************************************************/ /* * jumbo packet secondary header -- cgit v1.2.3 From 0099dc589bfa7caf6f2608c4cbc1181cfee22b0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: rxrpc: Make service call handling more robust Make the following changes to improve the robustness of the code that sets up a new service call: (1) Cache the rxrpc_sock struct obtained in rxrpc_data_ready() to do a service ID check and pass that along to rxrpc_new_incoming_call(). This means that I can remove the check from rxrpc_new_incoming_call() without the need to worry about the socket attached to the local endpoint getting replaced - which would invalidate the check. (2) Cache the rxrpc_peer struct, thereby allowing the peer search to be done once. The peer is passed to rxrpc_new_incoming_call(), thereby saving the need to repeat the search. This also reduces the possibility of rxrpc_publish_service_conn() BUG()'ing due to the detection of a duplicate connection, despite the initial search done by rxrpc_find_connection_rcu() having turned up nothing. This BUG() shouldn't ever get hit since rxrpc_data_ready() *should* be non-reentrant and the result of the initial search should still hold true, but it has proven possible to hit. I *think* this may be due to __rxrpc_lookup_peer_rcu() cutting short the iteration over the hash table if it finds a matching peer with a zero usage count, but I don't know for sure since it's only ever been hit once that I know of. Another possibility is that a bug in rxrpc_data_ready() that checked the wrong byte in the header for the RXRPC_CLIENT_INITIATED flag might've let through a packet that caused a spurious and invalid call to be set up. That is addressed in another patch. (3) Fix __rxrpc_lookup_peer_rcu() to skip peer records that have a zero usage count rather than stopping and returning not found, just in case there's another peer record behind it in the bucket. (4) Don't search the peer records in rxrpc_alloc_incoming_call(), but rather either use the peer cached in (2) or, if one wasn't found, preemptively install a new one. Fixes: 8496af50eb38 ("rxrpc: Use RCU to access a peer's service connection tree") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 8 +++++--- net/rxrpc/call_accept.c | 41 ++++++++++++----------------------------- net/rxrpc/conn_object.c | 7 ++++++- net/rxrpc/input.c | 7 ++++--- net/rxrpc/peer_object.c | 35 +++++++++++------------------------ 5 files changed, 38 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e8861cb78070..c72686193d83 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -722,6 +722,8 @@ extern struct workqueue_struct *rxrpc_workqueue; int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_sock *, + struct rxrpc_peer *, struct rxrpc_connection *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); @@ -913,7 +915,8 @@ extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, - struct sk_buff *); + struct sk_buff *, + struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); @@ -1049,8 +1052,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, - struct rxrpc_peer *); +void rxrpc_new_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e88f131c1d7f..9c7f26d06a52 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -249,11 +249,11 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) */ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; - struct rxrpc_peer *peer, *xpeer; struct rxrpc_call *call; unsigned short call_head, conn_head, peer_head; unsigned short call_tail, conn_tail, peer_tail; @@ -276,21 +276,18 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, return NULL; if (!conn) { - /* No connection. We're going to need a peer to start off - * with. If one doesn't yet exist, use a spare from the - * preallocation set. We dump the address into the spare in - * anticipation - and to save on stack space. - */ - xpeer = b->peer_backlog[peer_tail]; - if (rxrpc_extract_addr_from_skb(local, &xpeer->srx, skb) < 0) - return NULL; - - peer = rxrpc_lookup_incoming_peer(local, xpeer); - if (peer == xpeer) { + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(local, &peer->srx, skb) < 0) + return NULL; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ @@ -335,30 +332,16 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * The call is returned with the user access mutex held. */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_sock *rx; struct rxrpc_call *call; - u16 service_id = sp->hdr.serviceId; _enter(""); - /* Get the socket providing the service */ - rx = rcu_dereference(local->service); - if (rx && (service_id == rx->srx.srx_service || - service_id == rx->second_service)) - goto found_service; - - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [service]"); - return NULL; - -found_service: spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { @@ -371,7 +354,7 @@ found_service: goto out; } - call = rxrpc_alloc_incoming_call(rx, local, conn, skb); + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 390ba50cfab4..b4438f98dc5c 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -69,10 +69,14 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) * If successful, a pointer to the connection is returned, but no ref is taken. * NULL is returned if there is no match. * + * When searching for a service call, if we find a peer but no connection, we + * return that through *_peer in case we need to create a new service call. + * * The caller must be holding the RCU read lock. */ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, - struct sk_buff *skb) + struct sk_buff *skb, + struct rxrpc_peer **_peer) { struct rxrpc_connection *conn; struct rxrpc_conn_proto k; @@ -104,6 +108,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, peer = rxrpc_lookup_peer_rcu(local, &srx); if (!peer) goto not_found; + *_peer = peer; conn = rxrpc_find_service_conn_rcu(peer, skb); if (!conn || atomic_read(&conn->usage) == 0) goto not_found; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a569e9e010d1..800f5b8a1baa 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1128,7 +1128,8 @@ void rxrpc_data_ready(struct sock *udp_sk) struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; - struct rxrpc_sock *rx; + struct rxrpc_peer *peer = NULL; + struct rxrpc_sock *rx = NULL; struct sk_buff *skb; unsigned int channel; int ret, skew = 0; @@ -1250,7 +1251,7 @@ void rxrpc_data_ready(struct sock *udp_sk) } } - conn = rxrpc_find_connection_rcu(local, skb); + conn = rxrpc_find_connection_rcu(local, skb, &peer); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) goto wrong_security; @@ -1339,7 +1340,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto bad_message_unlock; if (sp->hdr.seq != 1) goto discard_unlock; - call = rxrpc_new_incoming_call(local, conn, skb); + call = rxrpc_new_incoming_call(local, rx, peer, conn, skb); if (!call) { rcu_read_unlock(); goto reject_packet; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 1dc7648e3eff..70083e8fb6e5 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -124,11 +124,9 @@ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { - if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { - if (atomic_read(&peer->usage) == 0) - return NULL; + if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && + atomic_read(&peer->usage) > 0) return peer; - } } return NULL; @@ -299,34 +297,23 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, } /* - * Set up a new incoming peer. The address is prestored in the preallocated - * peer. + * Set up a new incoming peer. There shouldn't be any other matching peers + * since we've already done a search in the list from the non-reentrant context + * (the data_ready handler) that is the only place we can add new peers. */ -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, - struct rxrpc_peer *prealloc) +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { - struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; - hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); - prealloc->local = local; - rxrpc_init_peer(prealloc, hash_key); + hash_key = rxrpc_peer_hash_key(local, &peer->srx); + peer->local = local; + rxrpc_init_peer(peer, hash_key); spin_lock(&rxnet->peer_hash_lock); - - /* Need to check that we aren't racing with someone else */ - peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - peer = prealloc; - hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); - list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - } - + hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); + list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); - return peer; } /* -- cgit v1.2.3 From 37a675e768d7606fe8a53e0c459c9b53e121ac20 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: rxrpc: Fix transport sockopts to get IPv4 errors on an IPv6 socket It seems that enabling IPV6_RECVERR on an IPv6 socket doesn't also turn on IP_RECVERR, so neither local errors nor ICMP-transported remote errors from IPv4 peer addresses are returned to the AF_RXRPC protocol. Make the sockopt setting code in rxrpc_open_socket() fall through from the AF_INET6 case to the AF_INET case to turn on all the AF_INET options too in the AF_INET6 case. Fixes: f2aeed3a591f ("rxrpc: Fix error reception on AF_INET6 sockets") Signed-off-by: David Howells --- net/rxrpc/local_object.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 81de7d889ffa..94d234e9c685 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -135,10 +135,10 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } switch (local->srx.transport.family) { - case AF_INET: - /* we want to receive ICMP errors */ + case AF_INET6: + /* we want to receive ICMPv6 errors */ opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); @@ -146,19 +146,22 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + opt = IPV6_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); goto error; } - break; - case AF_INET6: + /* Fall through and set IPv4 options too otherwise we don't get + * errors from IPv4 packets sent through the IPv6 socket. + */ + + case AF_INET: /* we want to receive ICMP errors */ opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); @@ -166,8 +169,8 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* we want to set the don't fragment bit */ - opt = IPV6_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); if (ret < 0) { _debug("setsockopt failed"); -- cgit v1.2.3 From f334430316e7fd37c4821ebec627e27714bb5d76 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: rxrpc: Fix error distribution Fix error distribution by immediately delivering the errors to all the affected calls rather than deferring them to a worker thread. The problem with the latter is that retries and things can happen in the meantime when we want to stop that sooner. To this end: (1) Stop the error distributor from removing calls from the error_targets list so that peer->lock isn't needed to synchronise against other adds and removals. (2) Require the peer's error_targets list to be accessed with RCU, thereby avoiding the need to take peer->lock over distribution. (3) Don't attempt to affect a call's state if it is already marked complete. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 4 +--- net/rxrpc/ar-internal.h | 5 ----- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 4 ++-- net/rxrpc/conn_object.c | 2 +- net/rxrpc/peer_event.c | 46 +++++++++++--------------------------------- net/rxrpc/peer_object.c | 17 ---------------- 7 files changed, 16 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 196587b8f204..837393fa897b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -56,7 +56,6 @@ enum rxrpc_peer_trace { rxrpc_peer_new, rxrpc_peer_processing, rxrpc_peer_put, - rxrpc_peer_queued_error, }; enum rxrpc_conn_trace { @@ -257,8 +256,7 @@ enum rxrpc_tx_point { EM(rxrpc_peer_got, "GOT") \ EM(rxrpc_peer_new, "NEW") \ EM(rxrpc_peer_processing, "PRO") \ - EM(rxrpc_peer_put, "PUT") \ - E_(rxrpc_peer_queued_error, "QER") + E_(rxrpc_peer_put, "PUT") #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c72686193d83..ef9554131434 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -288,7 +288,6 @@ struct rxrpc_peer { struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ - struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ @@ -299,8 +298,6 @@ struct rxrpc_peer { unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ - int error_report; /* Net (+0) or local (+1000000) to distribute */ -#define RXRPC_LOCAL_ERROR_OFFSET 1000000 struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ @@ -1039,7 +1036,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); * peer_event.c */ void rxrpc_error_report(struct sock *); -void rxrpc_peer_error_distributor(struct work_struct *); void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); void rxrpc_peer_keepalive_worker(struct work_struct *); @@ -1057,7 +1053,6 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); void rxrpc_put_peer(struct rxrpc_peer *); -void __rxrpc_queue_peer_error(struct rxrpc_peer *); /* * proc.c diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9486293fef5c..799f75b6900d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -400,7 +400,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(conn->channels[chan].call, call); spin_lock(&conn->params.peer->lock); - hlist_add_head(&call->error_link, &conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f8f37188a932..8acf74fe24c0 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -710,8 +710,8 @@ int rxrpc_connect_call(struct rxrpc_call *call, } spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, - &call->conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, + &call->conn->params.peer->error_targets); spin_unlock_bh(&call->conn->params.peer->lock); out: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index b4438f98dc5c..885dae829f4a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -216,7 +216,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_cwnd = call->cong_cwnd; spin_lock_bh(&conn->params.peer->lock); - hlist_del_init(&call->error_link); + hlist_del_rcu(&call->error_link); spin_unlock_bh(&conn->params.peer->lock); if (rxrpc_is_client_call(call)) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 4f9da2f51c69..f3e6fc670da2 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -23,6 +23,8 @@ #include "ar-internal.h" static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); +static void rxrpc_distribute_error(struct rxrpc_peer *, int, + enum rxrpc_call_completion); /* * Find the peer associated with an ICMP packet. @@ -194,8 +196,6 @@ void rxrpc_error_report(struct sock *sk) rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); - /* The ref we obtained is passed off to the work item */ - __rxrpc_queue_peer_error(peer); _leave(""); } @@ -205,6 +205,7 @@ void rxrpc_error_report(struct sock *sk) static void rxrpc_store_error(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) { + enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR; struct sock_extended_err *ee; int err; @@ -255,7 +256,7 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: _proto("Rx Received local error { error=%d }", err); - err += RXRPC_LOCAL_ERROR_OFFSET; + compl = RXRPC_CALL_LOCAL_ERROR; break; case SO_EE_ORIGIN_ICMP6: @@ -264,48 +265,23 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; } - peer->error_report = err; + rxrpc_distribute_error(peer, err, compl); } /* - * Distribute an error that occurred on a peer + * Distribute an error that occurred on a peer. */ -void rxrpc_peer_error_distributor(struct work_struct *work) +static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, + enum rxrpc_call_completion compl) { - struct rxrpc_peer *peer = - container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; - enum rxrpc_call_completion compl; - int error; - - _enter(""); - - error = READ_ONCE(peer->error_report); - if (error < RXRPC_LOCAL_ERROR_OFFSET) { - compl = RXRPC_CALL_NETWORK_ERROR; - } else { - compl = RXRPC_CALL_LOCAL_ERROR; - error -= RXRPC_LOCAL_ERROR_OFFSET; - } - _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error); - - spin_lock_bh(&peer->lock); - - while (!hlist_empty(&peer->error_targets)) { - call = hlist_entry(peer->error_targets.first, - struct rxrpc_call, error_link); - hlist_del_init(&call->error_link); + hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); - - if (rxrpc_set_call_completion(call, compl, 0, -error)) + if (call->state < RXRPC_CALL_COMPLETE && + rxrpc_set_call_completion(call, compl, 0, -error)) rxrpc_notify_socket(call); } - - spin_unlock_bh(&peer->lock); - - rxrpc_put_peer(peer); - _leave(""); } /* diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 70083e8fb6e5..01a9febfa367 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -220,8 +220,6 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) atomic_set(&peer->usage, 1); peer->local = local; INIT_HLIST_HEAD(&peer->error_targets); - INIT_WORK(&peer->error_distributor, - &rxrpc_peer_error_distributor); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); @@ -402,21 +400,6 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) return peer; } -/* - * Queue a peer record. This passes the caller's ref to the workqueue. - */ -void __rxrpc_queue_peer_error(struct rxrpc_peer *peer) -{ - const void *here = __builtin_return_address(0); - int n; - - n = atomic_read(&peer->usage); - if (rxrpc_queue_work(&peer->error_distributor)) - trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here); - else - rxrpc_put_peer(peer); -} - /* * Discard a peer record. */ -- cgit v1.2.3 From a13f814a67b12a2f29d1decf4b4f4e700658a517 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 30 Aug 2018 17:56:52 +0900 Subject: netfilter: nft_set_rbtree: add missing rb_erase() in GC routine The nft_set_gc_batch_check() checks whether gc buffer is full. If gc buffer is full, gc buffer is released by the nft_set_gc_batch_complete() internally. In case of rbtree, the rb_erase() should be called before calling the nft_set_gc_batch_complete(). therefore the rb_erase() should be called before calling the nft_set_gc_batch_check() too. test commands: table ip filter { set set1 { type ipv4_addr; flags interval, timeout; gc-interval 10s; timeout 1s; elements = { 1-2, 3-4, 5-6, ... 10000-10001, } } } %nft -f test.nft splat looks like: [ 430.273885] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 430.282158] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 430.283116] CPU: 1 PID: 190 Comm: kworker/1:2 Tainted: G B 4.18.0+ #7 [ 430.283116] Workqueue: events_power_efficient nft_rbtree_gc [nf_tables_set] [ 430.313559] RIP: 0010:rb_next+0x81/0x130 [ 430.313559] Code: 08 49 bd 00 00 00 00 00 fc ff df 48 bb 00 00 00 00 00 fc ff df 48 85 c0 75 05 eb 58 48 89 d4 [ 430.313559] RSP: 0018:ffff88010cdb7680 EFLAGS: 00010207 [ 430.313559] RAX: 0000000000b84854 RBX: dffffc0000000000 RCX: ffffffff83f01973 [ 430.313559] RDX: 000000000017090c RSI: 0000000000000008 RDI: 0000000000b84864 [ 430.313559] RBP: ffff8801060d4588 R08: fffffbfff09bc349 R09: fffffbfff09bc349 [ 430.313559] R10: 0000000000000001 R11: fffffbfff09bc348 R12: ffff880100f081a8 [ 430.313559] R13: dffffc0000000000 R14: ffff880100ff8688 R15: dffffc0000000000 [ 430.313559] FS: 0000000000000000(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000 [ 430.313559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 430.313559] CR2: 0000000001551008 CR3: 000000005dc16000 CR4: 00000000001006e0 [ 430.313559] Call Trace: [ 430.313559] nft_rbtree_gc+0x112/0x5c0 [nf_tables_set] [ 430.313559] process_one_work+0xc13/0x1ec0 [ 430.313559] ? _raw_spin_unlock_irq+0x29/0x40 [ 430.313559] ? pwq_dec_nr_in_flight+0x3c0/0x3c0 [ 430.313559] ? set_load_weight+0x270/0x270 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __schedule+0x6d3/0x1f50 [ 430.313559] ? find_held_lock+0x39/0x1c0 [ 430.313559] ? __sched_text_start+0x8/0x8 [ 430.313559] ? cyc2ns_read_end+0x10/0x10 [ 430.313559] ? save_trace+0x300/0x300 [ 430.313559] ? sched_clock_local+0xd4/0x140 [ 430.313559] ? find_held_lock+0x39/0x1c0 [ 430.313559] ? worker_thread+0x353/0x1120 [ 430.313559] ? worker_thread+0x353/0x1120 [ 430.313559] ? lock_contended+0xe70/0xe70 [ 430.313559] ? __lock_acquire+0x4500/0x4500 [ 430.535635] ? do_raw_spin_unlock+0xa5/0x330 [ 430.535635] ? do_raw_spin_trylock+0x101/0x1a0 [ 430.535635] ? do_raw_spin_lock+0x1f0/0x1f0 [ 430.535635] ? _raw_spin_lock_irq+0x10/0x70 [ 430.535635] worker_thread+0x15d/0x1120 [ ... ] Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 55e2d9215c0d..0e5ec126f6ad 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -355,12 +355,11 @@ cont: static void nft_rbtree_gc(struct work_struct *work) { + struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; struct nft_set_gc_batch *gcb = NULL; - struct rb_node *node, *prev = NULL; - struct nft_rbtree_elem *rbe; struct nft_rbtree *priv; + struct rb_node *node; struct nft_set *set; - int i; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); @@ -371,7 +370,7 @@ static void nft_rbtree_gc(struct work_struct *work) rbe = rb_entry(node, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe)) { - prev = node; + rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) @@ -379,29 +378,30 @@ static void nft_rbtree_gc(struct work_struct *work) if (nft_set_elem_mark_busy(&rbe->ext)) continue; + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + rbe_prev = NULL; + } gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (!gcb) break; atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); + rbe_prev = rbe; - if (prev) { - rbe = rb_entry(prev, struct nft_rbtree_elem, node); + if (rbe_end) { atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - prev = NULL; + nft_set_gc_batch_add(gcb, rbe_end); + rb_erase(&rbe_end->node, &priv->root); + rbe_end = NULL; } node = rb_next(node); if (!node) break; } - if (gcb) { - for (i = 0; i < gcb->head.cnt; i++) { - rbe = gcb->elems[i]; - rb_erase(&rbe->node, &priv->root); - } - } + if (rbe_prev) + rb_erase(&rbe_prev->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); -- cgit v1.2.3 From 40e4f26e6a14fc1496eabb8b0004a547303114e6 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Thu, 27 Sep 2018 19:36:28 -0300 Subject: netfilter: xt_socket: check sk before checking for netns. Only check for the network namespace if the socket is available. Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 0472f3472842..ada144e5645b 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) -- cgit v1.2.3 From c24498c6827b71f80fecc9fb1b70a792053d41a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:51 -0700 Subject: netpoll: do not test NAPI_STATE_SCHED in poll_one_napi() Since we do no longer require NAPI drivers to provide an ndo_poll_controller(), napi_schedule() has not been done before poll_one_napi() invocation. So testing NAPI_STATE_SCHED is likely to cause early returns. While we are at it, remove outdated comment. Note to future bisections : This change might surface prior bugs in drivers. See commit 73f21c653f93 ("bnxt_en: Fix TX timeout during netpoll.") for one occurrence. Fixes: ac3d9dd034e5 ("netpoll: make ndo_poll_controller() optional") Signed-off-by: Eric Dumazet Tested-by: Song Liu Cc: Michael Chan Signed-off-by: David S. Miller --- net/core/netpoll.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3219a2932463..3ae899805f8b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -135,27 +135,9 @@ static void queue_process(struct work_struct *work) } } -/* - * Check whether delayed processing was scheduled for our NIC. If so, - * we attempt to grab the poll lock and use ->poll() to pump the card. - * If this fails, either we've recursed in ->poll() or it's already - * running on another CPU. - * - * Note: we don't mask interrupts with this lock because we're using - * trylock here and interrupts are already disabled in the softirq - * case. Further, we test the poll_owner to avoid recursion on UP - * systems where the lock doesn't exist. - */ static void poll_one_napi(struct napi_struct *napi) { - int work = 0; - - /* net_rx_action's ->poll() invocations and our's are - * synchronized by this test which is only made while - * holding the napi->poll_lock. - */ - if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return; + int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need -- cgit v1.2.3 From c140eb166d681f66bd7e99fb121357db1a503e7f Mon Sep 17 00:00:00 2001 From: LUU Duc Canh Date: Wed, 26 Sep 2018 21:00:54 +0200 Subject: tipc: fix failover problem We see the following scenario: 1) Link endpoint B on node 1 discovers that its peer endpoint is gone. Since there is a second working link, failover procedure is started. 2) Link endpoint A on node 1 sends a FAILOVER message to peer endpoint A on node 2. The node item 1->2 goes to state FAILINGOVER. 3) Linke endpoint A/2 receives the failover, and is supposed to take down its parallell link endpoint B/2, while producing a FAILOVER message to send back to A/1. 4) However, B/2 has already been deleted, so no FAILOVER message can created. 5) Node 1->2 remains in state FAILINGOVER forever, refusing to receive any messages that can bring B/1 up again. We are left with a non- redundant link between node 1 and 2. We fix this with letting endpoint A/2 build a dummy FAILOVER message to send to back to A/1, so that the situation can be resolved. Signed-off-by: LUU Duc Canh Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 35 +++++++++++++++++++++++++++++++++++ net/tipc/link.h | 3 +++ net/tipc/node.c | 11 +++++++++++ 3 files changed, 49 insertions(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 26cc033ee167..4ed650ce6e61 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -410,6 +410,11 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } +u32 tipc_link_state(struct tipc_link *l) +{ + return l->state; +} + /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -1385,6 +1390,36 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, __skb_queue_tail(xmitq, skb); } +void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + u32 onode = tipc_own_addr(l->net); + struct tipc_msg *hdr, *ihdr; + struct sk_buff_head tnlq; + struct sk_buff *skb; + u32 dnode = l->addr; + + skb_queue_head_init(&tnlq); + skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG, + INT_H_SIZE, BASIC_H_SIZE, + dnode, onode, 0, 0, 0); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + + hdr = buf_msg(skb); + msg_set_msgcnt(hdr, 1); + msg_set_bearer_id(hdr, l->peer_bearer_id); + + ihdr = (struct tipc_msg *)msg_data(hdr); + tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, dnode); + msg_set_errcode(ihdr, TIPC_ERR_NO_PORT); + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, xmitq); +} + /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets * with contents of the link's transmit and backlog queues. */ diff --git a/net/tipc/link.h b/net/tipc/link.h index 7bc494a33fdf..90488c538a4e 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -88,6 +88,8 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct tipc_link **link); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); +void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl, + struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); bool tipc_link_is_up(struct tipc_link *l); @@ -107,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); +u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index 68014f1b6976..b0ee25f1f2e6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -111,6 +111,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; @@ -680,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); + n->failover_sent = false; n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); @@ -1615,6 +1617,15 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } + /* If parallel link was already down, and this happened before + * the tunnel link came up, FAILOVER was never sent. Ensure that + * FAILOVER is sent to get peer out of NODE_FAILINGOVER state. + */ + if (n->state != NODE_FAILINGOVER && !n->failover_sent) { + tipc_link_create_dummy_tnl_msg(l, xmitq); + n->failover_sent = true; + } + /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; -- cgit v1.2.3 From 1ad98e9d1bdf4724c0a8532fabd84bf3c457c2bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Oct 2018 15:02:26 -0700 Subject: tcp/dccp: fix lockdep issue when SYN is backlogged In normal SYN processing, packets are handled without listener lock and in RCU protected ingress path. But syzkaller is known to be able to trick us and SYN packets might be processed in process context, after being queued into socket backlog. In commit 06f877d613be ("tcp/dccp: fix other lockdep splats accessing ireq_opt") I made a very stupid fix, that happened to work mostly because of the regular path being RCU protected. Really the thing protecting ireq->ireq_opt is RCU read lock, and the pseudo request refcnt is not relevant. This patch extends what I did in commit 449809a66c1d ("tcp/dccp: block BH for SYN processing") by adding an extra rcu_read_{lock|unlock} pair in the paths that might be taken when processing SYN from socket backlog (thus possibly in process context) Fixes: 06f877d613be ("tcp/dccp: fix other lockdep splats accessing ireq_opt") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/inet_sock.h | 3 +-- net/dccp/input.c | 4 +++- net/ipv4/tcp_input.c | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e03b93360f33..a8cd5cf9ff5b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -132,8 +132,7 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) { - return rcu_dereference_check(ireq->ireq_opt, - refcount_read(&ireq->req.rsk_refcnt) > 0); + return rcu_dereference(ireq->ireq_opt); } struct inet_cork { diff --git a/net/dccp/input.c b/net/dccp/input.c index d28d46bff6ab..85d6c879383d 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -606,11 +606,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; consume_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf2f7bb2802..47e08c1b5bc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6009,11 +6009,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->fin) goto discard; /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; -- cgit v1.2.3 From aeadd93f2b0a609f603ac33e574b97a9832d1b90 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Sep 2018 16:46:48 +0300 Subject: net: sched: act_ipt: check for underflow in __tcf_ipt_init() If "td->u.target_size" is larger than sizeof(struct xt_entry_target) we return -EINVAL. But we don't check whether it's smaller than sizeof(struct xt_entry_target) and that could lead to an out of bounds read. Fixes: 7ba699c604ab ("[NET_SCHED]: Convert actions from rtnetlink to new netlink API") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 23273b5303fd..8525de811616 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -135,7 +135,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); - if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { + if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { if (exists) tcf_idr_release(*a, bind); else -- cgit v1.2.3 From d949cfedbcbab4e91590576cbace2671924ad69c Mon Sep 17 00:00:00 2001 From: LUU Duc Canh Date: Wed, 26 Sep 2018 22:28:52 +0200 Subject: tipc: ignore STATE_MSG on wrong link session The initial session number when a link is created is based on a random value, taken from struct tipc_net->random. It is then incremented for each link reset to avoid mixing protocol messages from different link sessions. However, when a bearer is reset all its links are deleted, and will later be re-created using the same random value as the first time. This means that if the link never went down between creation and deletion we will still sometimes have two subsequent sessions with the same session number. In virtual environments with potentially long transmission times this has turned out to be a real problem. We now fix this by randomizing the session number each time a link is created. With a session number size of 16 bits this gives a risk of session collision of 1/64k. To reduce this further, we also introduce a sanity check on the very first STATE message arriving at a link. If this has an acknowledge value differing from 0, which is logically impossible, we ignore the message. The final risk for session collision is hence reduced to 1/4G, which should be sufficient. Signed-off-by: LUU Duc Canh Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/node.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 4ed650ce6e61..fb886b525d95 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1516,6 +1516,9 @@ bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) return false; if (session != curr_session) return false; + /* Extra sanity check */ + if (!link_is_up(l) && msg_ack(hdr)) + return false; if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) return true; /* Accept only STATE with new sequence number */ diff --git a/net/tipc/node.c b/net/tipc/node.c index b0ee25f1f2e6..2afc4f8c37a7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -913,6 +913,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool reset = true; char *if_name; unsigned long intv; + u16 session; *dupl_addr = false; *respond = false; @@ -999,9 +1000,10 @@ void tipc_node_check_dest(struct net *net, u32 addr, goto exit; if_name = strchr(b->name, ':') + 1; + get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, mod(tipc_net(net)->random), + b->window, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -1625,7 +1627,6 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_link_create_dummy_tnl_msg(l, xmitq); n->failover_sent = true; } - /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; -- cgit v1.2.3 From 7f6d6558ae44bc193eb28df3617c364d3bb6df39 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Fri, 28 Sep 2018 14:55:34 -0300 Subject: Revert "openvswitch: Fix template leak in error cases." This reverts commit 90c7afc96cbbd77f44094b5b651261968e97de67. When the commit was merged, the code used nf_ct_put() to free the entry, but later on commit 76644232e612 ("openvswitch: Free tmpl with tmpl_free.") replaced that with nf_ct_tmpl_free which is a more appropriate. Now the original problem is removed. Then 44d6e2f27328 ("net: Replace NF_CT_ASSERT() with WARN_ON().") replaced a debug assert with a WARN_ON() which is trigged now. Signed-off-by: Flavio Leitner Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 86a75105af1a..0aeb34c6389d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1624,10 +1624,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } - - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); - if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -1639,6 +1635,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); -- cgit v1.2.3 From 893626d6a353d1356528f94e081246ecf233d77a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Sep 2018 12:28:41 -0700 Subject: rtnetlink: Fail dump if target netnsid is invalid Link dumps can return results from a target namespace. If the namespace id is invalid, then the dump request should fail if get_target_net fails rather than continuing with a dump of the current namespace. Fixes: 79e1ad148c844 ("rtnetlink: use netnsid to query interface") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 63ce2283a456..7f37fe9c65a5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1898,10 +1898,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); tgt_net = get_target_net(skb->sk, netnsid); - if (IS_ERR(tgt_net)) { - tgt_net = net; - netnsid = -1; - } + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) -- cgit v1.2.3 From 6fe9487892b32cb1c8b8b0d552ed7222a527fe30 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 28 Sep 2018 16:26:08 -0400 Subject: bond: take rcu lock in netpoll_send_skb_on_dev The bonding driver lacks the rcu lock when it calls down into netdev_lower_get_next_private_rcu from bond_poll_controller, which results in a trace like: WARNING: CPU: 2 PID: 179 at net/core/dev.c:6567 netdev_lower_get_next_private_rcu+0x34/0x40 CPU: 2 PID: 179 Comm: kworker/u16:15 Not tainted 4.19.0-rc5-backup+ #1 Workqueue: bond0 bond_mii_monitor RIP: 0010:netdev_lower_get_next_private_rcu+0x34/0x40 Code: 48 89 fb e8 fe 29 63 ff 85 c0 74 1e 48 8b 45 00 48 81 c3 c0 00 00 00 48 8b 00 48 39 d8 74 0f 48 89 45 00 48 8b 40 f8 5b 5d c3 <0f> 0b eb de 31 c0 eb f5 0f 1f 40 00 0f 1f 44 00 00 48 8> RSP: 0018:ffffc9000087fa68 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff880429614560 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 00000000ffffffff RDI: ffffffffa184ada0 RBP: ffffc9000087fa80 R08: 0000000000000001 R09: 0000000000000000 R10: ffffc9000087f9f0 R11: ffff880429798040 R12: ffff8804289d5980 R13: ffffffffa1511f60 R14: 00000000000000c8 R15: 00000000ffffffff FS: 0000000000000000(0000) GS:ffff88042f880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4b78fce180 CR3: 000000018180f006 CR4: 00000000001606e0 Call Trace: bond_poll_controller+0x52/0x170 netpoll_poll_dev+0x79/0x290 netpoll_send_skb_on_dev+0x158/0x2c0 netpoll_send_udp+0x2d5/0x430 write_ext_msg+0x1e0/0x210 console_unlock+0x3c4/0x630 vprintk_emit+0xfa/0x2f0 printk+0x52/0x6e ? __netdev_printk+0x12b/0x220 netdev_info+0x64/0x80 ? bond_3ad_set_carrier+0xe9/0x180 bond_select_active_slave+0x1fc/0x310 bond_mii_monitor+0x709/0x9b0 process_one_work+0x221/0x5e0 worker_thread+0x4f/0x3b0 kthread+0x100/0x140 ? process_one_work+0x5e0/0x5e0 ? kthread_delayed_work_timer_fn+0x90/0x90 ret_from_fork+0x24/0x30 We're also doing rcu dereferences a layer up in netpoll_send_skb_on_dev before we call down into netpoll_poll_dev, so just take the lock there. Suggested-by: Cong Wang Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3ae899805f8b..de1d1ba92f2d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -312,6 +312,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; + rcu_read_lock_bh(); lockdep_assert_irqs_disabled(); npinfo = rcu_dereference_bh(np->dev->npinfo); @@ -356,6 +357,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + rcu_read_unlock_bh(); } EXPORT_SYMBOL(netpoll_send_skb_on_dev); -- cgit v1.2.3 From 2ab2ddd301a22ca3c5f0b743593e4ad2953dfa53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Oct 2018 12:35:05 -0700 Subject: inet: make sure to grab rcu_read_lock before using ireq->ireq_opt Timer handlers do not imply rcu_read_lock(), so my recent fix triggered a LOCKDEP warning when SYNACK is retransmit. Lets add rcu_read_lock()/rcu_read_unlock() pairs around ireq->ireq_opt usages instead of guessing what is done by callers, since it is not worth the pain. Get rid of ireq_opt_deref() helper since it hides the logic without real benefit, since it is now a standard rcu_dereference(). Fixes: 1ad98e9d1bdf ("tcp/dccp: fix lockdep issue when SYN is backlogged") Signed-off-by: Eric Dumazet Reported-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/inet_sock.h | 5 ----- net/dccp/ipv4.c | 4 +++- net/ipv4/inet_connection_sock.c | 5 ++++- net/ipv4/tcp_ipv4.c | 4 +++- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a8cd5cf9ff5b..a80fd0ac4563 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -130,11 +130,6 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, return sk->sk_bound_dev_if; } -static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) -{ - return rcu_dereference(ireq->ireq_opt); -} - struct inet_cork { unsigned int flags; __be32 addr; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b08feb219b44..8e08cea6f178 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -493,9 +493,11 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dfd5009f96ef..15e7f7915a21 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -544,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = ireq_opt_deref(ireq); + rcu_read_lock(); + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, @@ -558,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c09eddbb78..cd426313a298 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -943,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } -- cgit v1.2.3 From 0e1d6eca5113858ed2caea61a5adc03c595f6096 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Oct 2018 15:47:35 -0700 Subject: rtnl: limit IFLA_NUM_TX_QUEUES and IFLA_NUM_RX_QUEUES to 4096 We have an impressive number of syzkaller bugs that are linked to the fact that syzbot was able to create a networking device with millions of TX (or RX) queues. Let's limit the number of RX/TX queues to 4096, this really should cover all known cases. A separate patch will add various cond_resched() in the loops handling sysfs entries at device creation and dismantle. Tested: lpaa6:~# ip link add gre-4097 numtxqueues 4097 numrxqueues 4097 type ip6gretap RTNETLINK answers: Invalid argument lpaa6:~# time ip link add gre-4096 numtxqueues 4096 numrxqueues 4096 type ip6gretap real 0m0.180s user 0m0.000s sys 0m0.107s Fixes: 76ff5cc91935 ("rtnl: allow to specify number of rx and tx queues on device creation") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7f37fe9c65a5..448703312fed 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2835,6 +2835,12 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); + if (num_tx_queues < 1 || num_tx_queues > 4096) + return ERR_PTR(-EINVAL); + + if (num_rx_queues < 1 || num_rx_queues > 4096) + return ERR_PTR(-EINVAL); + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) -- cgit v1.2.3 From 64199fc0a46ba211362472f7f942f900af9492fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 30 Sep 2018 11:33:39 -0700 Subject: ipv4: fix use-after-free in ip_cmsg_recv_dstaddr() Caching ip_hdr(skb) before a call to pskb_may_pull() is buggy, do not do it. Fixes: 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c0fe5ad996f2..26c36cccabdc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - const struct iphdr *iph = ip_hdr(skb); __be16 *ports; int end; @@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = iph->daddr; + sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); -- cgit v1.2.3 From 2cc543f5cd6deda27ef463686fa08c16c8c0990b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 3 Oct 2018 12:45:56 +0200 Subject: sctp: fix fall-through annotation Replace "fallthru" with a proper "fall through" annotation. This fix is part of the ongoing efforts to enabling -Wimplicit-fallthrough Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d74d00b29942..42191ed9902b 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1048,7 +1048,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, if (!ctx->packet || !ctx->packet->has_cookie_echo) return; - /* fallthru */ + /* fall through */ case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: -- cgit v1.2.3