From 1d211d43167690f94f1bcadf44395799382d85d0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 May 2018 14:53:41 +0200 Subject: cfg80211: use better order for kcalloc() arguments The arguments should be (# of elements, size of each) instead of the other way around, which really ends up being mostly equivalent but smatch complains about it, so swap them. Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- net/wireless/util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index b5bb1c309914..b91597a8baa2 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1789,8 +1789,9 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { - sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)), - IEEE80211_NUM_TIDS + 1, gfp); + sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, + sizeof(*(sinfo->pertid)), + gfp); if (!sinfo->pertid) return -ENOMEM; -- cgit v1.2.3 From db0a4ad80d3aee6f6e96eddc7ef6a88f4e38d357 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 28 May 2018 15:47:37 +0200 Subject: nl80211: refactor common code in scan flags checks There's a very common pattern to check for a scan flag and then reject it if an extended feature flag isn't set; factor this out into a helper function. 
Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 60 ++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 07514ca011b2..6c3ded1223fb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6861,6 +6861,16 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) return regulatory_pre_cac_allowed(wdev->wiphy); } +static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag, + enum nl80211_ext_feature_index feat) +{ + if (!(flags & flag)) + return true; + if (wiphy_ext_feature_isset(wiphy, feat)) + return true; + return false; +} + static int nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, void *request, struct nlattr **attrs, @@ -6895,15 +6905,27 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_LOW_POWER) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_LOW_POWER_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN))) + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_LOW_SPAN, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_LOW_POWER, + NL80211_EXT_FEATURE_LOW_POWER_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_HIGH_ACCURACY, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) || + !nl80211_check_scan_feat(wiphy, *flags, + 
NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) return -EOPNOTSUPP; if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { @@ -6918,26 +6940,6 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, return err; } - if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) - return -EOPNOTSUPP; - return 0; } -- cgit v1.2.3 From 00387f321537395f62d5c0eca64c2d7838f39ac3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 28 May 2018 15:47:38 +0200 Subject: mac80211: add probe request building flags Add flags to pass through to probe request building and change the "bool directed" to be one of them. 
Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 12 +++++++++--- net/mac80211/mlme.c | 5 +++-- net/mac80211/scan.c | 7 ++++--- net/mac80211/util.c | 18 ++++++++++-------- 4 files changed, 26 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d1978aa1c15d..ee2a25d6ecf2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2031,24 +2031,30 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); + +enum { + IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), +}; + int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef); + struct cfg80211_chan_def *chandef, + u32 flags); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - bool directed); + u32 flags); void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, u32 tx_flags, + u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel, bool scan); u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a59187c016e0..c3f2883cc0ec 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2267,7 +2267,8 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) ieee80211_send_probe_req(sdata, sdata->vif.addr, dst, ssid + 2, ssid_len, NULL, - 0, (u32) -1, true, 0, + 0, (u32) -1, + 
IEEE80211_PROBE_FLAG_DIRECTED, 0, ifmgd->associated->channel, false); rcu_read_unlock(); } @@ -2370,7 +2371,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, ssid + 2, ssid_len, - NULL, 0, true); + NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); return skb; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index a3b1bcc2b461..8e28d8de26aa 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -336,7 +336,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, - bands_used, req->rates, &chandef); + bands_used, req->rates, &chandef, 0); local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); @@ -552,7 +552,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, - scan_req->rates[band], false, + scan_req->rates[band], 0, tx_flags, local->hw.conf.chandef.chan, true); /* @@ -1167,7 +1167,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, - req->ie_len, bands_used, rate_masks, &chandef); + req->ie_len, bands_used, rate_masks, &chandef, + 0); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 2d82c88efd0b..fb7264edecad 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1353,7 +1353,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, - size_t *offset) + size_t *offset, u32 flags) { struct 
ieee80211_supported_band *sband; u8 *pos = buffer, *end = buffer + buffer_len; @@ -1518,7 +1518,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + u32 flags) { size_t pos = 0, old_pos = 0, custom_ie_offset = 0; int i; @@ -1533,7 +1534,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, ie, ie_len, i, rate_masks[i], chandef, - &custom_ie_offset); + &custom_ie_offset, + flags); ie_desc->ies[i] = buffer + old_pos; ie_desc->len[i] = pos - old_pos; old_pos = pos; @@ -1561,7 +1563,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - bool directed) + u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; @@ -1577,7 +1579,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, * badly-behaved APs don't respond when this parameter is included. 
*/ chandef.width = sdata->vif.bss_conf.chandef.width; - if (directed) + if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; @@ -1591,7 +1593,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), - rate_masks, &chandef); + rate_masks, &chandef, flags); skb_put(skb, ies_len); if (dst) { @@ -1609,14 +1611,14 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, u32 tx_flags, + u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel, bool scan) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, - ie, ie_len, directed); + ie, ie_len, flags); if (skb) { IEEE80211_SKB_CB(skb)->flags |= tx_flags; if (scan) -- cgit v1.2.3 From 45ad683484b61b5859ccb5a93a8254e1b4d20a29 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 28 May 2018 15:47:39 +0200 Subject: mac80211: split ieee80211_send_probe_req() This function is passed many more parameters in the scan case than in the MLME case, and differentiates the two cases inside. Split it up and make both versions static to simplify things. 
Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 7 ------- net/mac80211/mlme.c | 22 +++++++++++++++++----- net/mac80211/scan.c | 22 ++++++++++++++++++++-- net/mac80211/util.c | 21 --------------------- 4 files changed, 37 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ee2a25d6ecf2..2851245c569a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2050,13 +2050,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags); -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, - const u8 *src, const u8 *dst, - const u8 *ssid, size_t ssid_len, - const u8 *ie, size_t ie_len, - u32 ratemask, u32 flags, u32 tx_flags, - struct ieee80211_channel *channel, bool scan); - u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c3f2883cc0ec..a44e5b4aaeda 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2219,6 +2219,20 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, ieee80211_sta_reset_conn_monitor(sdata); } +static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, + const u8 *src, const u8 *dst, + const u8 *ssid, size_t ssid_len, + struct ieee80211_channel *channel) +{ + struct sk_buff *skb; + + skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel, + ssid, ssid_len, NULL, 0, + IEEE80211_PROBE_FLAG_DIRECTED); + if (skb) + ieee80211_tx_skb(sdata, skb); +} + static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -2265,11 +2279,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) else ssid_len = ssid[1]; - 
ieee80211_send_probe_req(sdata, sdata->vif.addr, dst, - ssid + 2, ssid_len, NULL, - 0, (u32) -1, - IEEE80211_PROBE_FLAG_DIRECTED, 0, - ifmgd->associated->channel, false); + ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, + ssid + 2, ssid_len, + ifmgd->associated->channel); rcu_read_unlock(); } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 8e28d8de26aa..03f66f31c5b4 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -528,6 +528,24 @@ void ieee80211_run_deferred_scan(struct ieee80211_local *local) round_jiffies_relative(0)); } +static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, + const u8 *src, const u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len, + u32 ratemask, u32 flags, u32 tx_flags, + struct ieee80211_channel *channel) +{ + struct sk_buff *skb; + + skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, + ssid, ssid_len, + ie, ie_len, flags); + if (skb) { + IEEE80211_SKB_CB(skb)->flags |= tx_flags; + ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); + } +} + static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { @@ -548,12 +566,12 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, lockdep_is_held(&local->mtx)); for (i = 0; i < scan_req->n_ssids; i++) - ieee80211_send_probe_req( + ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], 0, - tx_flags, local->hw.conf.chandef.chan, true); + tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses diff --git a/net/mac80211/util.c b/net/mac80211/util.c index fb7264edecad..0325133552ad 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1607,27 +1607,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return skb; } -void 
ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, - const u8 *src, const u8 *dst, - const u8 *ssid, size_t ssid_len, - const u8 *ie, size_t ie_len, - u32 ratemask, u32 flags, u32 tx_flags, - struct ieee80211_channel *channel, bool scan) -{ - struct sk_buff *skb; - - skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, - ssid, ssid_len, - ie, ie_len, flags); - if (skb) { - IEEE80211_SKB_CB(skb)->flags |= tx_flags; - if (scan) - ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); - else - ieee80211_tx_skb(sdata, skb); - } -} - u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) -- cgit v1.2.3 From 2e076f199097d670ce5e5492cea57f552b93bba9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 28 May 2018 15:47:40 +0200 Subject: nl80211: add scan features for improved scan privacy Add the scan flags for randomized SN and minimized probe request content for improved scan privacy. Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 15 +++++++++++++++ net/wireless/nl80211.c | 8 +++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 28b36545de24..49f718e821a3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5133,6 +5133,11 @@ enum nl80211_feature_flags { * support to nl80211. * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate * TXQs. + * @NL80211_EXT_FEATURE_SCAN_RANDOM_SN: Driver/device supports randomizing the + * SN in probe request frames if requested by %NL80211_SCAN_FLAG_RANDOM_SN. + * @NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT: Driver/device can omit all data + * except for supported rates from the probe request content if requested + * by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag. * * @NUM_NL80211_EXT_FEATURES: number of extended features. 
* @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5167,6 +5172,8 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT, NL80211_EXT_FEATURE_TXQS, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN, + NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5272,6 +5279,12 @@ enum nl80211_timeout_reason { * possible scan results. This flag hints the driver to use the best * possible scan configuration to improve the accuracy in scanning. * Latency and power use may get impacted with this flag. + * @NL80211_SCAN_FLAG_RANDOM_SN: randomize the sequence number in probe + * request frames from this scan to avoid correlation/tracking being + * possible. + * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to + * only have supported rates and no additional capabilities (unless + * added by userspace explicitly.) */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -5285,6 +5298,8 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_SPAN = 1<<8, NL80211_SCAN_FLAG_LOW_POWER = 1<<9, NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, + NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, + NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6c3ded1223fb..d2677259e13e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6925,7 +6925,13 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) || !nl80211_check_scan_feat(wiphy, *flags, NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE, - NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_RANDOM_SN, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_MIN_PREQ_CONTENT, + 
NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT)) return -EOPNOTSUPP; if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { -- cgit v1.2.3 From b9771d41aee7aa3207b985422a1cc19e8342bc50 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 28 May 2018 15:47:41 +0200 Subject: mac80211: support scan features for improved scan privacy Support the new random SN and minimal probe request contents scan flags for the case of software scan - for hardware scan the drivers need to opt in, but may need to do only that, depending on their implementation. Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- net/mac80211/ieee80211_i.h | 14 +++++++++----- net/mac80211/main.c | 13 +++++++++++-- net/mac80211/offchannel.c | 2 +- net/mac80211/rx.c | 2 +- net/mac80211/scan.c | 35 ++++++++++++++++++++++++++++++----- net/mac80211/sta_info.c | 2 +- net/mac80211/tx.c | 21 +++++++++++++-------- net/mac80211/util.c | 4 ++++ 9 files changed, 71 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index bdf6fa78d0d2..c4e2f7d2bcb8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3486,7 +3486,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, } local_bh_disable(); - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); local_bh_enable(); ret = 0; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2851245c569a..a6c12c104c38 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -165,6 +165,7 @@ typedef unsigned __bitwise ieee80211_tx_result; #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) +#define IEEE80211_TX_NO_SEQNO BIT(0) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) @@ -1880,19 +1881,20 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, void ieee80211_set_wmm_default(struct ieee80211_sub_if_data 
*sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, struct sk_buff *skb, + u32 txdata_flags); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band); + enum nl80211_band band, u32 txdata_flags); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band) + enum nl80211_band band, u32 txdata_flags) { rcu_read_lock(); - __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); + __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags); rcu_read_unlock(); } @@ -1910,7 +1912,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, } __ieee80211_tx_skb_tid_band(sdata, skb, tid, - chanctx_conf->def.chan->band); + chanctx_conf->def.chan->band, 0); rcu_read_unlock(); } @@ -2034,6 +2036,8 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, enum { IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), + IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1), + IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4d2e797e3f16..a6f8e3a646d4 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -557,10 +557,19 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); - if (!ops->hw_scan) + if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | NL80211_FEATURE_AP_SCAN; - + /* + * if the driver behaves correctly using the probe request + * (template) from mac80211, then both of these should be + * supported even with hw scan - but let drivers opt in. 
+ */ + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); + } if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f1d40b6645ff..8ef4153cd299 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -262,7 +262,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, - roc->chan->band); + roc->chan->band, 0); roc->frame = NULL; } } else { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 0a38cc1cbebc..756ba176db1e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3241,7 +3241,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) } __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, - status->band); + status->band, 0); } dev_kfree_skb(rx->skb); return RX_QUEUED; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 03f66f31c5b4..ae77d1c12856 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "ieee80211_i.h" @@ -293,6 +294,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen, n_chans; + u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); @@ -331,12 +333,16 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef, req->scan_width); + if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, - bands_used, req->rates, &chandef, 0); + 
bands_used, req->rates, &chandef, + flags); local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); @@ -536,13 +542,24 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel) { struct sk_buff *skb; + u32 txdata_flags = 0; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); + if (skb) { + if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { + struct ieee80211_hdr *hdr = (void *)skb->data; + u16 sn = get_random_u32(); + + txdata_flags |= IEEE80211_TX_NO_SEQNO; + hdr->seq_ctrl = + cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); + } IEEE80211_SKB_CB(skb)->flags |= tx_flags; - ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); + ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band, + txdata_flags); } } @@ -553,7 +570,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; - u32 tx_flags; + u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); @@ -561,6 +578,10 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; + if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) + flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->mtx)); @@ -570,7 +591,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, - 
scan_req->rates[band], 0, + scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* @@ -1159,6 +1180,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; + u32 flags = 0; iebufsz = local->scan_ies_len + req->ie_len; @@ -1175,6 +1197,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, } } + if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + ie = kzalloc(num_bands * iebufsz, GFP_KERNEL); if (!ie) { ret = -ENOMEM; @@ -1186,7 +1211,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, - 0); + flags); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6428f1ac37b6..aa96fddfbfc2 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1391,7 +1391,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, } info->band = chanctx_conf->def.chan->band; - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); rcu_read_unlock(); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 44b5dfe8727d..5b93bde248fd 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -825,6 +825,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) */ if (!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { + if (tx->flags & IEEE80211_TX_NO_SEQNO) + return TX_CONTINUE; /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ @@ -1854,7 +1856,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb); */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct 
sk_buff *skb, - bool txpending) + bool txpending, u32 txdata_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; @@ -1872,6 +1874,8 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, led_len = skb->len; res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); + tx.flags |= txdata_flags; + if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; @@ -1933,7 +1937,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, struct sk_buff *skb, + u32 txdata_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -1968,7 +1973,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, } ieee80211_set_qos_hdr(sdata, skb); - ieee80211_tx(sdata, sta, skb, false); + ieee80211_tx(sdata, sta, skb, false, txdata_flags); } static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, @@ -2289,7 +2294,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (!ieee80211_parse_tx_radiotap(local, skb)) goto fail_rcu; - ieee80211_xmit(sdata, NULL, skb); + ieee80211_xmit(sdata, NULL, skb, 0); rcu_read_unlock(); return NETDEV_TX_OK; @@ -3648,7 +3653,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, ieee80211_tx_stats(dev, skb->len); - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); } goto out; out_free: @@ -3867,7 +3872,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, return true; } info->band = chanctx_conf->def.chan->band; - result = ieee80211_tx(sdata, NULL, skb, true); + result = ieee80211_tx(sdata, NULL, skb, true, 0); } else { struct sk_buff_head skbs; @@ -4783,7 +4788,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum 
nl80211_band band) + enum nl80211_band band, u32 txdata_flags) { int ac = ieee80211_ac_from_tid(tid); @@ -4800,7 +4805,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; - ieee80211_xmit(sdata, NULL, skb); + ieee80211_xmit(sdata, NULL, skb, txdata_flags); local_bh_enable(); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 0325133552ad..b744b10465c3 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1433,6 +1433,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, chandef->chan->center_freq); } + if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) + goto done; + /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { @@ -1510,6 +1513,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); + done: return pos - buffer; } -- cgit v1.2.3 From 446faa15c6e80620826edd659e63c6760137975a Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Thu, 14 Jun 2018 09:43:06 +0800 Subject: nl80211: report 4ADDR status with GET_INTERFACE User space tools might be interested in knowing the current status of the 4ADDR property of an interface (when supported). Send the status along with the other attributes when replying to a GET_INTERFACE netlink query. 
Signed-off-by: Antonio Quartulli Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d2677259e13e..7b21914ae18b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2757,7 +2757,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) || nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ - (cfg80211_rdev_list_generation << 2))) + (cfg80211_rdev_list_generation << 2)) || + nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) goto nla_put_failure; if (rdev->ops->get_channel) { -- cgit v1.2.3 From c4cbaf7973a794839af080f13748335976cf3f3f Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:42 +0300 Subject: cfg80211: Add support for HE Add support for the HE in cfg80211 and also add userspace API to nl80211 to send rate information out, conforming with P802.11ax_D2.0. 
Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 427 +++++++++++++++++++++++++++++++++++++++++++ include/net/cfg80211.h | 106 ++++++++++- include/uapi/linux/nl80211.h | 87 ++++++++- net/wireless/core.c | 21 ++- net/wireless/nl80211.c | 99 +++++++++- net/wireless/util.c | 82 +++++++++ 6 files changed, 817 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8fe7e4306816..e6a6503bfa33 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1539,6 +1539,106 @@ struct ieee80211_vht_operation { __le16 basic_mcs_set; } __packed; +/** + * struct ieee80211_he_cap_elem - HE capabilities element + * + * This structure is the "HE capabilities element" fixed fields as + * described in P802.11ax_D2.0 section 9.4.2.237.2 and 9.4.2.237.3 + */ +struct ieee80211_he_cap_elem { + u8 mac_cap_info[5]; + u8 phy_cap_info[9]; +} __packed; + +#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 + +/** + * enum ieee80211_he_mcs_support - HE MCS support definitions + * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported + * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the rx_mcs_* + * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. 
+ */ +enum ieee80211_he_mcs_support { + IEEE80211_HE_MCS_SUPPORT_0_7 = 0, + IEEE80211_HE_MCS_SUPPORT_0_9 = 1, + IEEE80211_HE_MCS_SUPPORT_0_11 = 2, + IEEE80211_HE_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field + * + * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field + * described in P802.11ax_D2.0 section 9.4.2.237.4 + * + * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. 
+ */ +struct ieee80211_he_mcs_nss_supp { + __le16 rx_mcs_80; + __le16 tx_mcs_80; + __le16 rx_mcs_160; + __le16 tx_mcs_160; + __le16 rx_mcs_80p80; + __le16 tx_mcs_80p80; +} __packed; + +/** + * struct ieee80211_he_operation - HE operation element + * + * This structure is the "HE operation element" fields as + * described in P802.11ax_D2.0 section 9.4.2.238 + */ +struct ieee80211_he_operation { + __le32 he_oper_params; + __le16 he_mcs_nss_set; + /* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */ + u8 optional[0]; +} __packed; + +/** + * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field + * + * This structure is the "MU AC Parameter Record" fields as + * described in P802.11ax_D2.0 section 9.4.2.240 + */ +struct ieee80211_he_mu_edca_param_ac_rec { + u8 aifsn; + u8 ecw_min_max; + u8 mu_edca_timer; +} __packed; + +/** + * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element + * + * This structure is the "MU EDCA Parameter Set element" fields as + * described in P802.11ax_D2.0 section 9.4.2.240 + */ +struct ieee80211_mu_edca_param_set { + u8 mu_qos_info; + struct ieee80211_he_mu_edca_param_ac_rec ac_be; + struct ieee80211_he_mu_edca_param_ac_rec ac_bk; + struct ieee80211_he_mu_edca_param_ac_rec ac_vi; + struct ieee80211_he_mu_edca_param_ac_rec ac_vo; +} __packed; /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 @@ -1577,6 +1677,328 @@ struct ieee80211_vht_operation { #define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 #define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 +/* 802.11ax HE MAC capabilities */ +#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 +#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 +#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 +#define
IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 + +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_1 0x00 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_2 0x10 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_3 0x20 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_4 0x30 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_5 0x40 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_6 0x50 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_7 0x60 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8 0x70 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_MASK 0x70 + +/* Link adaptation is split between byte HE_MAC_CAP1 and + * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE + * in which case the following values apply: + * 0 = No feedback. + * 1 = reserved. + * 2 = Unsolicited feedback. 
+ * 3 = both + */ +#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 + +#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 +#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 +#define IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED 0x04 +#define IEEE80211_HE_MAC_CAP2_BSR 0x08 +#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 +#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 +#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 +#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 + +#define IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU 0x01 +#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 +#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 + +/* The maximum length of an A-MPDU is defined by the combination of the Maximum + * A-MPDU Length Exponent field in the HT capabilities, VHT capabilities and the + * same field in the HE capabilities. + */ +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT 0x00 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1 0x08 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2 0x10 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED 0x18 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK 0x18 +#define IEEE80211_HE_MAC_CAP3_A_AMSDU_FRAG 0x20 +#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 +#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 + +#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 +#define IEEE80211_HE_MAC_CAP4_QTP 0x02 +#define IEEE80211_HE_MAC_CAP4_BQR 0x04 +#define IEEE80211_HE_MAC_CAP4_SR_RESP 0x08 +#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 +#define IEEE80211_HE_MAC_CAP4_OPS 0x20 +#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU 0x40 + +/* 802.11ax HE PHY capabilities */ +#define IEEE80211_HE_PHY_CAP0_DUAL_BAND 0x01 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define
IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe + +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f +#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 +#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 +#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 +/* Midamble RX Max NSTS is split between byte #2 and byte #3 */ +#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS 0x80 + +#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS 0x01 +#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 +#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 +#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 + +/* Note that the meaning of UL MU below is different between an AP and a non-AP + * sta, where in the AP case it indicates support for Rx and in the non-AP sta + * case it indicates support for Tx. 
+ */ +#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 +#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 + +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 +#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA 0x40 +#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 + +#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 +#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 + +/* Minimal allowed value of Max STS under 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c + +/* Minimal allowed value of Max STS above 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 +#define 
IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 + +#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 +#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 + +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 +#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB 0x04 +#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB 0x08 +#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 +#define 
IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 + +#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR 0x01 +#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR 0x02 +#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 +#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 +#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 + +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 +#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 +#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_2X_AND_1XLTF 0x20 + +/* 802.11ax HE TX/RX MCS NSS Support */ +#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 + +/* TX/RX HE MCS Support field Highest MCS subfield encoding */ +enum ieee80211_he_highest_mcs_supported_subfield_enc { + HIGHEST_MCS_SUPPORTED_MCS7 = 0, + HIGHEST_MCS_SUPPORTED_MCS8, + HIGHEST_MCS_SUPPORTED_MCS9, + HIGHEST_MCS_SUPPORTED_MCS10, + HIGHEST_MCS_SUPPORTED_MCS11, +}; + +/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ +static inline u8 +ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) +{ + u8 count = 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 4; + + 
if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) + count += 4; + + return count; +} + +/* 802.11ax HE PPE Thresholds */ +#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) +#define IEEE80211_PPE_THRES_NSS_POS (0) +#define IEEE80211_PPE_THRES_NSS_MASK (7) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ + (BIT(5) | BIT(6)) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) +#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) + +/* + * Calculate 802.11ax HE capabilities IE PPE field size + * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* + */ +static inline u8 +ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u8 n; + + if ((phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + return 0; + + n = hweight8(ppe_thres_hdr & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. 
+ */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + return n; +} + +/* HE Operation defines */ +#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x0000003f +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x000001c0 +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET 6 +#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000200 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x000ffc00 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 10 +#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x000100000 +#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x000200000 +#define IEEE80211_HE_OPERATION_MULTI_BSSID_AP 0x10000000 +#define IEEE80211_HE_OPERATION_TX_BSSID_INDICATOR 0x20000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x40000000 + +/* + * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size + * @he_oper_ie: byte data of the HE Operations IE, starting from the byte + * after the ext ID byte. It is assumed that he_oper_ie has at least + * sizeof(struct ieee80211_he_operation) bytes, checked already in + * ieee802_11_parse_elems_crc() + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_oper_size(const u8 *he_oper_ie) +{ + struct ieee80211_he_operation *he_oper = (void *)he_oper_ie; + u8 oper_len = sizeof(struct ieee80211_he_operation); + u32 he_oper_params; + + /* Make sure the input is not NULL */ + if (!he_oper_ie) + return 0; + + /* Calc required length */ + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + oper_len += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_MULTI_BSSID_AP) + oper_len++; + + /* Add the first byte (extension ID) to the total length */ + oper_len++; + + return oper_len; +} + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -1992,6 +2414,11 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_FILS_WRAPPED_DATA = 8,
WLAN_EID_EXT_FILS_PUBLIC_KEY = 12, WLAN_EID_EXT_FILS_NONCE = 13, + WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14, + WLAN_EID_EXT_HE_CAPABILITY = 35, + WLAN_EID_EXT_HE_OPERATION = 36, + WLAN_EID_EXT_UORA = 37, + WLAN_EID_EXT_HE_MU_EDCA = 38, }; /* Action category code */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5fbfe61f41c6..9ba1f289c439 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -285,6 +285,41 @@ struct ieee80211_sta_vht_cap { struct ieee80211_vht_mcs_info vht_mcs; }; +#define IEEE80211_HE_PPE_THRES_MAX_LEN 25 + +/** + * struct ieee80211_sta_he_cap - STA's HE capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11ax HE capabilities for a STA. + * + * @has_he: true iff HE data is valid. + * @he_cap_elem: Fixed portion of the HE capabilities element. + * @he_mcs_nss_supp: The supported NSS/MCS combinations. + * @ppe_thres: Holds the PPE Thresholds data. + */ +struct ieee80211_sta_he_cap { + bool has_he; + struct ieee80211_he_cap_elem he_cap_elem; + struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp; + u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; +}; + +/** + * struct ieee80211_sband_iftype_data + * + * This structure encapsulates sband data that is relevant for the + * interface types defined in @types_mask. Each type in the + * @types_mask must be unique across all instances of iftype_data. + * + * @types_mask: interface types mask + * @he_cap: holds the HE capabilities + */ +struct ieee80211_sband_iftype_data { + u16 types_mask; + struct ieee80211_sta_he_cap he_cap; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -301,6 +336,11 @@ struct ieee80211_sta_vht_cap { * @n_bitrates: Number of bitrates in @bitrates * @ht_cap: HT capabilities in this band * @vht_cap: VHT capabilities in this band + * @n_iftype_data: number of iftype data entries + * @iftype_data: interface type data entries. 
Note that the bits in + * @types_mask inside this structure cannot overlap (i.e. only + * one occurrence of each type is allowed across all instances of + * iftype_data). */ struct ieee80211_supported_band { struct ieee80211_channel *channels; @@ -310,8 +350,55 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + u16 n_iftype_data; + const struct ieee80211_sband_iftype_data *iftype_data; }; +/** + * ieee80211_get_sband_iftype_data - return sband data for a given iftype + * @sband: the sband to search for the STA on + * @iftype: enum nl80211_iftype + * + * Return: pointer to struct ieee80211_sband_iftype_data, or NULL if none found + */ +static inline const struct ieee80211_sband_iftype_data * +ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, + u8 iftype) +{ + int i; + + if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + return NULL; + + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *data = + &sband->iftype_data[i]; + + if (data->types_mask & BIT(iftype)) + return data; + } + + return NULL; +} + +/** + * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA + * @sband: the sband to search for the STA on + * + * Return: pointer to the struct ieee80211_sta_he_cap, or NULL if none found + */ +static inline const struct ieee80211_sta_he_cap * +ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_STATION); + + if (data && data->he_cap.has_he) + return &data->he_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * @@ -899,6 +986,8 @@ enum station_parameters_apply_mask { * @opmode_notif: operating mode field from Operating Mode Notification * @opmode_notif_used: information if operating mode field is used * @support_p2p_ps: information if station
supports P2P PS mechanism + * @he_capa: HE capabilities of station + * @he_capa_len: the length of the HE capabilities */ struct station_parameters { const u8 *supported_rates; @@ -926,6 +1015,8 @@ struct station_parameters { u8 opmode_notif; bool opmode_notif_used; int support_p2p_ps; + const struct ieee80211_he_cap_elem *he_capa; + u8 he_capa_len; }; /** @@ -1000,12 +1091,14 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval * @RATE_INFO_FLAGS_60G: 60GHz MCS + * @RATE_INFO_FLAGS_HE_MCS: HE MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), RATE_INFO_FLAGS_VHT_MCS = BIT(1), RATE_INFO_FLAGS_SHORT_GI = BIT(2), RATE_INFO_FLAGS_60G = BIT(3), + RATE_INFO_FLAGS_HE_MCS = BIT(4), }; /** @@ -1019,6 +1112,7 @@ enum rate_info_flags { * @RATE_INFO_BW_40: 40 MHz bandwidth * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth + * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1027,6 +1121,7 @@ enum rate_info_bw { RATE_INFO_BW_40, RATE_INFO_BW_80, RATE_INFO_BW_160, + RATE_INFO_BW_HE_RU, }; /** @@ -1035,10 +1130,14 @@ enum rate_info_bw { * Information about a receiving or transmitting bitrate * * @flags: bitflag of flags from &enum rate_info_flags - * @mcs: mcs index if struct describes a 802.11n bitrate + * @mcs: mcs index if struct describes an HT/VHT/HE rate * @legacy: bitrate in 100kbit/s for 802.11abg - * @nss: number of streams (VHT only) + * @nss: number of streams (VHT & HE only) * @bw: bandwidth (from &enum rate_info_bw) + * @he_gi: HE guard interval (from &enum nl80211_he_gi) + * @he_dcm: HE DCM value + * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, + * only valid if bw is %RATE_INFO_BW_HE_RU) */ struct rate_info { u8 flags; @@ -1046,6 +1145,9 @@ struct rate_info { u16 legacy; u8 nss; u8 bw; + u8 he_gi; + u8 he_dcm; + u8 he_ru_alloc; 
}; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 49f718e821a3..f82ce3c89ab7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2237,6 +2237,9 @@ enum nl80211_commands { * enforced. * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes * a flow is assigned on each round of the DRR scheduler. + * @NL80211_ATTR_HE_CAPABILITY: HE Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2677,6 +2680,8 @@ enum nl80211_attrs { NL80211_ATTR_TXQ_MEMORY_LIMIT, NL80211_ATTR_TXQ_QUANTUM, + NL80211_ATTR_HE_CAPABILITY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -2726,7 +2731,8 @@ enum nl80211_attrs { #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY 24 #define NL80211_HT_CAPABILITY_LEN 26 #define NL80211_VHT_CAPABILITY_LEN 12 - +#define NL80211_HE_MIN_CAPABILITY_LEN 16 +#define NL80211_HE_MAX_CAPABILITY_LEN 51 #define NL80211_MAX_NR_CIPHER_SUITES 5 #define NL80211_MAX_NR_AKM_SUITES 2 @@ -2853,6 +2859,38 @@ struct nl80211_sta_flag_update { __u32 set; } __attribute__((packed)); +/** + * enum nl80211_he_gi - HE guard interval + * @NL80211_RATE_INFO_HE_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_HE_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_HE_GI_3_2: 3.2 usec + */ +enum nl80211_he_gi { + NL80211_RATE_INFO_HE_GI_0_8, + NL80211_RATE_INFO_HE_GI_1_6, + NL80211_RATE_INFO_HE_GI_3_2, +}; + +/** + * enum nl80211_he_ru_alloc - HE RU allocation values + * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_484: 484-tone RU 
allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_2x996: 2x996-tone RU allocation + */ +enum nl80211_he_ru_alloc { + NL80211_RATE_INFO_HE_RU_ALLOC_26, + NL80211_RATE_INFO_HE_RU_ALLOC_52, + NL80211_RATE_INFO_HE_RU_ALLOC_106, + NL80211_RATE_INFO_HE_RU_ALLOC_242, + NL80211_RATE_INFO_HE_RU_ALLOC_484, + NL80211_RATE_INFO_HE_RU_ALLOC_996, + NL80211_RATE_INFO_HE_RU_ALLOC_2x996, +}; + /** * enum nl80211_rate_info - bitrate information * @@ -2885,6 +2923,13 @@ struct nl80211_sta_flag_update { * @NL80211_RATE_INFO_5_MHZ_WIDTH: 5 MHz width - note that this is * a legacy rate and will be reported as the actual bitrate, i.e. * a quarter of the base (20 MHz) rate + * @NL80211_RATE_INFO_HE_MCS: HE MCS index (u8, 0-11) + * @NL80211_RATE_INFO_HE_NSS: HE NSS value (u8, 1-8) + * @NL80211_RATE_INFO_HE_GI: HE guard interval identifier + * (u8, see &enum nl80211_he_gi) + * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1) + * @NL80211_RATE_INFO_HE_RU_ALLOC: HE RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc) * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -2901,6 +2946,11 @@ enum nl80211_rate_info { NL80211_RATE_INFO_160_MHZ_WIDTH, NL80211_RATE_INFO_10_MHZ_WIDTH, NL80211_RATE_INFO_5_MHZ_WIDTH, + NL80211_RATE_INFO_HE_MCS, + NL80211_RATE_INFO_HE_NSS, + NL80211_RATE_INFO_HE_GI, + NL80211_RATE_INFO_HE_DCM, + NL80211_RATE_INFO_HE_RU_ALLOC, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, @@ -3166,6 +3216,38 @@ enum nl80211_mpath_info { NL80211_MPATH_INFO_MAX = __NL80211_MPATH_INFO_AFTER_LAST - 1 }; +/** + * enum nl80211_band_iftype_attr - Interface type data attributes + * + * @__NL80211_BAND_IFTYPE_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_BAND_IFTYPE_ATTR_IFTYPES: nested attribute containing a flag attribute + * for each interface type that supports the band data + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC: HE MAC capabilities as in HE + *
capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY: HE PHY capabilities as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET: HE supported NSS/MCS as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE: HE PPE thresholds information as + * defined in HE capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently + * defined + * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use + */ +enum nl80211_band_iftype_attr { + __NL80211_BAND_IFTYPE_ATTR_INVALID, + + NL80211_BAND_IFTYPE_ATTR_IFTYPES, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + + /* keep last */ + __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, + NL80211_BAND_IFTYPE_ATTR_MAX = __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST - 1 +}; + /** * enum nl80211_band_attr - band attributes * @__NL80211_BAND_ATTR_INVALID: attribute number 0 is reserved @@ -3181,6 +3263,8 @@ enum nl80211_mpath_info { * @NL80211_BAND_ATTR_VHT_MCS_SET: 32-byte attribute containing the MCS set as * defined in 802.11ac * @NL80211_BAND_ATTR_VHT_CAPA: VHT capabilities, as in the HT information IE + * @NL80211_BAND_ATTR_IFTYPE_DATA: nested array attribute, with each entry using + * attributes from &enum nl80211_band_iftype_attr * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined * @__NL80211_BAND_ATTR_AFTER_LAST: internal use */ @@ -3196,6 +3280,7 @@ enum nl80211_band_attr { NL80211_BAND_ATTR_VHT_MCS_SET, NL80211_BAND_ATTR_VHT_CAPA, + NL80211_BAND_ATTR_IFTYPE_DATA, /* keep last */ __NL80211_BAND_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index 5fe35aafdd9c..d23abc619e77 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH */ 
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { + u16 types = 0; + sband = wiphy->bands[band]; if (!sband) continue; @@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].band = band; } + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *iftd; + + iftd = &sband->iftype_data[i]; + + if (WARN_ON(!iftd->types_mask)) + return -EINVAL; + if (WARN_ON(types & iftd->types_mask)) + return -EINVAL; + + /* at least one piece of information must be present */ + if (WARN_ON(!iftd->he_cap.has_he)) + return -EINVAL; + + types |= iftd->types_mask; + } + have_band = true; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7b21914ae18b..0ccce338a66e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, + .len = NL80211_HE_MAX_CAPABILITY_LEN }, }; /* policy for the key attributes */ @@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg, return 0; } +static int +nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_sband_iftype_data *iftdata) +{ + const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; + + if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, + iftdata->types_mask)) + return -ENOBUFS; + + if (he_cap->has_he) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + sizeof(he_cap->he_cap_elem.mac_cap_info), + he_cap->he_cap_elem.mac_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + sizeof(he_cap->he_cap_elem.phy_cap_info), + he_cap->he_cap_elem.phy_cap_info) || + 
nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + sizeof(he_cap->he_mcs_nss_supp), + &he_cap->he_mcs_nss_supp) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + sizeof(he_cap->ppe_thres), he_cap->ppe_thres)) + return -ENOBUFS; + } + + return 0; +} + static int nl80211_send_band_rateinfo(struct sk_buff *msg, struct ieee80211_supported_band *sband) { @@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, sband->vht_cap.cap))) return -ENOBUFS; + if (sband->n_iftype_data) { + struct nlattr *nl_iftype_data = + nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA); + int err; + + if (!nl_iftype_data) + return -ENOBUFS; + + for (i = 0; i < sband->n_iftype_data; i++) { + struct nlattr *iftdata; + + iftdata = nla_nest_start(msg, i + 1); + if (!iftdata) + return -ENOBUFS; + + err = nl80211_send_iftype_data(msg, + &sband->iftype_data[i]); + if (err) + return err; + + nla_nest_end(msg, iftdata); + } + + nla_nest_end(msg, nl_iftype_data); + } + /* add bitrates */ nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) @@ -4472,6 +4528,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, case RATE_INFO_BW_160: rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH; break; + case RATE_INFO_BW_HE_RU: + rate_flg = 0; + WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS)); } if (rate_flg && nla_put_flag(msg, rate_flg)) @@ -4491,6 +4550,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, if (info->flags & RATE_INFO_FLAGS_SHORT_GI && nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI)) return false; + } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm)) + return false; + if (info->bw == RATE_INFO_BW_HE_RU && 
+ nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, + info->he_ru_alloc)) + return false; } nla_nest_end(msg, rate); @@ -4887,7 +4959,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; if (params->supported_rates) return -EINVAL; - if (params->ext_capab || params->ht_capa || params->vht_capa) + if (params->ext_capab || params->ht_capa || params->vht_capa || + params->he_capa) return -EINVAL; } @@ -5093,6 +5166,15 @@ static int nl80211_set_station_tdls(struct genl_info *info, if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) params->vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params->he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params->he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } err = nl80211_parse_sta_channel_info(info, params); if (err) @@ -5320,6 +5402,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params.he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params.he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + /* max len is validated in nla policy */ + if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -5352,6 +5445,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { params.ht_capa = NULL; params.vht_capa = NULL; + + /* HE requires WME */ + if (params.he_capa_len) + return -EINVAL; } /* When you run into this, adjust the code below for the new flag */ diff --git a/net/wireless/util.c b/net/wireless/util.c index b91597a8baa2..4ed06b271f32 100644 --- 
a/net/wireless/util.c +++ b/net/wireless/util.c @@ -4,6 +4,7 @@ * * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH */ #include #include @@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) return 0; } +static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) +{ +#define SCALE 2048 + u16 mcs_divisors[12] = { + 34133, /* 16.666666... */ + 17067, /* 8.333333... */ + 11378, /* 5.555555... */ + 8533, /* 4.166666... */ + 5689, /* 2.777777... */ + 4267, /* 2.083333... */ + 3923, /* 1.851851... */ + 3413, /* 1.666666... */ + 2844, /* 1.388888... */ + 2560, /* 1.250000... */ + 2276, /* 1.111111... */ + 2048, /* 1.000000... */ + }; + u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; + u32 rates_969[3] = { 480388888, 453700000, 408333333 }; + u32 rates_484[3] = { 229411111, 216666666, 195000000 }; + u32 rates_242[3] = { 114711111, 108333333, 97500000 }; + u32 rates_106[3] = { 40000000, 37777777, 34000000 }; + u32 rates_52[3] = { 18820000, 17777777, 16000000 }; + u32 rates_26[3] = { 9411111, 8888888, 8000000 }; + u64 tmp; + u32 result; + + if (WARN_ON_ONCE(rate->mcs > 11)) + return 0; + + if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) + return 0; + if (WARN_ON_ONCE(rate->he_ru_alloc > + NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) + return 0; + if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) + return 0; + + if (rate->bw == RATE_INFO_BW_160) + result = rates_160M[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_80 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) + result = rates_969[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_40 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) + result = rates_484[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_20 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == 
NL80211_RATE_INFO_HE_RU_ALLOC_242)) + result = rates_242[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) + result = rates_106[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) + result = rates_52[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) + result = rates_26[rate->he_gi]; + else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", + rate->bw, rate->he_ru_alloc)) + return 0; + + /* now scale to the appropriate MCS */ + tmp = result; + tmp *= SCALE; + do_div(tmp, mcs_divisors[rate->mcs]); + result = tmp; + + /* and take NSS, DCM into account */ + result = (result * rate->nss) / 8; + if (rate->he_dcm) + result /= 2; + + return result; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_60g(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); + if (rate->flags & RATE_INFO_FLAGS_HE_MCS) + return cfg80211_calculate_bitrate_he(rate); return rate->legacy; } -- cgit v1.2.3 From b8042b3da925f390c1482bf9dc0898dc0b3ea7b5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Jun 2018 22:39:29 +0200 Subject: ieee80211: bump IEEE80211_MAX_AMPDU_BUF to support HE Bump the IEEE80211_MAX_AMPDU_BUF size to 0x100 for HE support and - for now - use IEEE80211_MAX_AMPDU_BUF_HT everywhere. This is derived from my internal patch, parts of which Luca had sent upstream. 
Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- drivers/net/wireless/realtek/rtlwifi/base.c | 2 +- drivers/staging/rtl8188eu/include/wifi.h | 1 - drivers/staging/rtl8712/wifi.h | 1 - drivers/staging/rtl8723bs/include/wifi.h | 1 - drivers/staging/rtlwifi/base.c | 2 +- include/linux/ieee80211.h | 10 ++++++---- net/mac80211/agg-rx.c | 4 ++-- net/mac80211/agg-tx.c | 2 +- net/mac80211/ht.c | 2 +- net/mac80211/main.c | 4 ++-- 10 files changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index 39c817eddd78..31bd6f714052 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -1904,7 +1904,7 @@ void rtl_rx_ampdu_apply(struct rtl_priv *rtlpriv) reject_agg, ctrl_agg_size, agg_size); rtlpriv->hw->max_rx_aggregation_subframes = - (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF); + (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF_HT); } EXPORT_SYMBOL(rtl_rx_ampdu_apply); diff --git a/drivers/staging/rtl8188eu/include/wifi.h b/drivers/staging/rtl8188eu/include/wifi.h index 084a246eec19..6790b7c8cfb1 100644 --- a/drivers/staging/rtl8188eu/include/wifi.h +++ b/drivers/staging/rtl8188eu/include/wifi.h @@ -575,7 +575,6 @@ enum ht_cap_ampdu_factor { * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ #define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 #define OP_MODE_PURE 0 diff --git a/drivers/staging/rtl8712/wifi.h b/drivers/staging/rtl8712/wifi.h index 0ed2f44ab4e9..00a4302e9983 100644 --- a/drivers/staging/rtl8712/wifi.h +++ b/drivers/staging/rtl8712/wifi.h @@ -574,7 +574,6 @@ struct ieee80211_ht_addt_info { * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ #define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 /* Spatial Multiplexing Power Save Modes */ diff --git 
a/drivers/staging/rtl8723bs/include/wifi.h b/drivers/staging/rtl8723bs/include/wifi.h index 08bc79840b23..559bf2606fb7 100644 --- a/drivers/staging/rtl8723bs/include/wifi.h +++ b/drivers/staging/rtl8723bs/include/wifi.h @@ -799,7 +799,6 @@ enum HT_CAP_AMPDU_FACTOR { * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ #define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 /* Spatial Multiplexing Power Save Modes */ diff --git a/drivers/staging/rtlwifi/base.c b/drivers/staging/rtlwifi/base.c index e46e47d93d7d..094827c1879a 100644 --- a/drivers/staging/rtlwifi/base.c +++ b/drivers/staging/rtlwifi/base.c @@ -1838,7 +1838,7 @@ void rtl_rx_ampdu_apply(struct rtl_priv *rtlpriv) reject_agg, ctrl_agg_size, agg_size); rtlpriv->hw->max_rx_aggregation_subframes = - (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF); + (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF_HT); } /********************************************************* diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e6a6503bfa33..9c03a7d5e400 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1433,11 +1433,13 @@ struct ieee80211_ht_operation { #define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 /* - * A-PMDU buffer sizes - * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) + * A-MPDU buffer sizes + * According to HT size varies from 8 to 64 frames + * HE adds the ability to have up to 256 frames. 
*/ -#define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 +#define IEEE80211_MIN_AMPDU_BUF 0x8 +#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 +#define IEEE80211_MAX_AMPDU_BUF 0x100 /* Spatial Multiplexing Power Save Modes (for capability) */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e83c19d4c292..3ffd853b483f 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -274,7 +274,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || - (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + (buf_size > IEEE80211_MAX_AMPDU_BUF_HT)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", @@ -283,7 +283,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } /* determine default buffer size */ if (buf_size == 0) - buf_size = IEEE80211_MAX_AMPDU_BUF; + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > sta->sta.max_rx_aggregation_subframes) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ac4295296514..86c6bc0432ba 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -514,7 +514,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, params.ssn, - IEEE80211_MAX_AMPDU_BUF, + IEEE80211_MAX_AMPDU_BUF_HT, tid_tx->timeout); } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 26a7ba3b698f..f849ea814993 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work) test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, + 
IEEE80211_MAX_AMPDU_BUF_HT, false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a6f8e3a646d4..070f77862014 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -597,8 +597,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, local->hw.queues = 1; local->hw.max_rates = 1; local->hw.max_report_rates = 0; - local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; - local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; + local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; -- cgit v1.2.3 From 41cbb0f5a29592874355e4159489eb08337cd50e Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:44 +0300 Subject: mac80211: add support for HE Add support for HE in mac80211 conforming with P802.11ax_D1.4. Johannes: Fix another bug with the buf_size comparison in agg-rx.c. 
Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 64 ++++++++-- net/mac80211/Makefile | 1 + net/mac80211/agg-rx.c | 10 +- net/mac80211/agg-tx.c | 19 ++- net/mac80211/cfg.c | 5 + net/mac80211/he.c | 55 +++++++++ net/mac80211/ieee80211_i.h | 16 +++ net/mac80211/main.c | 19 ++- net/mac80211/mlme.c | 288 ++++++++++++++++++++++++++++++++++++++++++--- net/mac80211/rx.c | 127 +++++++++++++++++++- net/mac80211/sta_info.c | 15 ++- net/mac80211/sta_info.h | 20 +++- net/mac80211/trace.h | 2 +- net/mac80211/util.c | 120 ++++++++++++++++++- 14 files changed, 716 insertions(+), 45 deletions(-) create mode 100644 net/mac80211/he.c (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 851a5e19ae32..5790f55c241d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /** @@ -162,6 +163,8 @@ enum ieee80211_ac_numbers { * @txop: maximum burst time in units of 32 usecs, 0 meaning disabled * @acm: is mandatory admission control required for the access category * @uapsd: is U-APSD mode enabled for the queue + * @mu_edca: is the MU EDCA configured + * @mu_edca_param_rec: MU EDCA Parameter Record for HE */ struct ieee80211_tx_queue_params { u16 txop; @@ -170,6 +173,8 @@ struct ieee80211_tx_queue_params { u8 aifs; bool acm; bool uapsd; + bool mu_edca; + struct ieee80211_he_mu_edca_param_ac_rec mu_edca_param_rec; }; struct ieee80211_low_level_stats { @@ -463,6 +468,15 @@ struct ieee80211_mu_group_data { * This structure keeps information about a BSS (and an association * to that BSS) that can change during the lifetime of the BSS. 
* + * @bss_color: 6-bit value to mark inter-BSS frame, if BSS supports HE + * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE + * @multi_sta_back_32bit: supports BA bitmap of 32-bits in Multi-STA BACK + * @uora_exists: is the UORA element advertised by AP + * @ack_enabled: indicates support to receive a multi-TID that solicits either + * ACK, BACK or both + * @uora_ocw_range: UORA element's OCW Range field + * @frame_time_rts_th: HE duration RTS threshold, in units of 32us + * @he_support: does this BSS support HE * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -550,6 +564,14 @@ struct ieee80211_mu_group_data { */ struct ieee80211_bss_conf { const u8 *bssid; + u8 bss_color; + u8 htc_trig_based_pkt_ext; + bool multi_sta_back_32bit; + bool uora_exists; + bool ack_enabled; + u8 uora_ocw_range; + u16 frame_time_rts_th; + bool he_support; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; @@ -1106,6 +1128,18 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this * frame * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known + * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present + * (&struct ieee80211_radiotap_he, mac80211 will fill in + * - DATA3_DATA_MCS + * - DATA3_DATA_DCM + * - DATA3_CODING + * - DATA5_GI + * - DATA5_DATA_BW_RU_ALLOC + * - DATA6_NSTS + * - DATA3_STBC + * from the RX info data, so leave those zeroed when building this data) + * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present + * (&struct ieee80211_radiotap_he_mu) */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -1134,6 +1168,8 @@ enum mac80211_rx_flags { RX_FLAG_ICV_STRIPPED = BIT(23), RX_FLAG_AMPDU_EOF_BIT = BIT(24), RX_FLAG_AMPDU_EOF_BIT_KNOWN = BIT(25), + RX_FLAG_RADIOTAP_HE = BIT(26), + RX_FLAG_RADIOTAP_HE_MU = BIT(27), }; /** @@ -1164,6 +1200,7 @@ enum mac80211_rx_encoding { 
RX_ENC_LEGACY = 0, RX_ENC_HT, RX_ENC_VHT, + RX_ENC_HE, }; /** @@ -1198,6 +1235,9 @@ enum mac80211_rx_encoding { * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags + * @he_ru: HE RU, from &enum nl80211_he_ru_alloc + * @he_gi: HE GI, from &enum nl80211_he_gi + * @he_dcm: HE DCM value * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU @@ -1211,7 +1251,8 @@ struct ieee80211_rx_status { u32 flag; u16 freq; u8 enc_flags; - u8 encoding:2, bw:3; + u8 encoding:2, bw:3, he_ru:3; + u8 he_gi:2, he_dcm:1; u8 rate_idx; u8 nss; u8 rx_flags; @@ -1770,6 +1811,7 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @he_cap: HE capabilities of this STA * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -1805,7 +1847,8 @@ struct ieee80211_sta { u16 aid; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; - u8 max_rx_aggregation_subframes; + struct ieee80211_sta_he_cap he_cap; + u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; @@ -2196,10 +2239,11 @@ enum ieee80211_hw_flags { * it shouldn't be set. * * @max_tx_aggregation_subframes: maximum number of subframes in an - * aggregate an HT driver will transmit. Though ADDBA will advertise - * a constant value of 64 as some older APs can crash if the window - * size is smaller (an example is LinkSys WRT120N with FW v1.0.07 - * build 002 Jun 18 2012). + * aggregate an HT/HE device will transmit. 
In HT AddBA we'll + * advertise a constant value of 64 as some older APs crash if + * the window size is smaller (an example is LinkSys WRT120N + * with FW v1.0.07 build 002 Jun 18 2012). + * For AddBA to HE capable peers this value will be used. * * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum * of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list. @@ -2216,6 +2260,8 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * + * @radiotap_he: HE radiotap validity flags + * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * 'units_pos' member is set to a non-negative value it must be set to * a combination of a IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a @@ -2263,8 +2309,8 @@ struct ieee80211_hw { u8 max_rates; u8 max_report_rates; u8 max_rate_tries; - u8 max_rx_aggregation_subframes; - u8 max_tx_aggregation_subframes; + u16 max_rx_aggregation_subframes; + u16 max_tx_aggregation_subframes; u8 max_tx_fragments; u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; @@ -2904,7 +2950,7 @@ struct ieee80211_ampdu_params { struct ieee80211_sta *sta; u16 tid; u16 ssn; - u8 buf_size; + u16 buf_size; bool amsdu; u16 timeout; }; diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index e3589ade62e0..bb707789ef2b 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -12,6 +12,7 @@ mac80211-y := \ scan.o offchannel.o \ ht.o agg-tx.o agg-rx.o \ vht.o \ + he.o \ ibss.o \ iface.o \ rate.o \ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 3ffd853b483f..6a4f154c99f6 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, }; int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + u16 max_buf_size; if (tid >= IEEE80211_FIRST_TSPEC_TSID) { ht_dbg(sta->sdata, @@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, 
goto end; } + if (sta->sta.he_cap.has_he) + max_buf_size = IEEE80211_MAX_AMPDU_BUF; + else + max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + /* sanity check for incoming parameters: * check if configuration can support the BA policy * and if buffer size does not exceeds max value */ /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || - (buf_size > IEEE80211_MAX_AMPDU_BUF_HT)) { + (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", @@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } /* determine default buffer size */ if (buf_size == 0) - buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + buf_size = max_buf_size; /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > sta->sta.max_rx_aggregation_subframes) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 86c6bc0432ba..69e831bc317b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; + u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. 
+ */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, params.ssn, - IEEE80211_MAX_AMPDU_BUF_HT, - tid_tx->timeout); + buf_size, tid_tx->timeout); } /* @@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, { struct tid_ampdu_tx *tid_tx; struct ieee80211_txq *txq; - u16 capab, tid; - u8 buf_size; + u16 capab, tid, buf_size; bool amsdu; capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c4e2f7d2bcb8..02f3672e7b5e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1412,6 +1412,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); + if (params->he_capa) + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + (void *)params->he_capa, + params->he_capa_len, sta); + if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it diff --git a/net/mac80211/he.c b/net/mac80211/he.c new file mode 100644 index 000000000000..769078ed5a12 --- /dev/null +++ b/net/mac80211/he.c @@ -0,0 +1,55 @@ +/* + * HE handling + * + * Copyright(c) 2017 Intel Deutschland GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "ieee80211_i.h" + +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta) +{ + struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; + u8 he_ppe_size; + u8 mcs_nss_size; + u8 he_total_size; + + memset(he_cap, 0, sizeof(*he_cap)); + + if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband)) + return; + + /* Make sure size is OK */ + mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem); + he_ppe_size = + ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) + + mcs_nss_size], + he_cap_ie_elem->phy_cap_info); + he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size + + he_ppe_size; + if (he_cap_len < he_total_size) + return; + + memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem)); + + /* HE Tx/Rx HE MCS NSS Support Field */ + memcpy(&he_cap->he_mcs_nss_supp, + &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size); + + /* Check if there are (optional) PPE Thresholds */ + if (he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) + memcpy(he_cap->ppe_thres, + &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size], + he_ppe_size); + + he_cap->has_he = true; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a6c12c104c38..172aeae21ae9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -365,6 +365,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), IEEE80211_STA_ENABLE_RRM = BIT(15), + IEEE80211_STA_DISABLE_HE = BIT(16), }; struct ieee80211_mgd_auth_data { @@ -1454,6 +1455,10 @@ struct ieee802_11_elems { const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; + const u8 *he_cap; + const struct 
ieee80211_he_operation *he_operation; + const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; @@ -1483,6 +1488,7 @@ struct ieee802_11_elems { u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; + u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; @@ -1825,6 +1831,13 @@ void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); +/* HE */ +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, @@ -2076,6 +2089,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 070f77862014..b33faba8cbbe 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -825,7 +826,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht; + bool supp_ht, supp_vht, supp_he; netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; @@ -905,6 +906,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = 0; supp_ht = false; supp_vht = false; + supp_he = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; @@ -931,6 +933,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; + if (!supp_he) + supp_he = !!ieee80211_get_he_sta_cap(sband); + if (!sband->ht_cap.ht_supported) continue; @@ -1020,6 +1025,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); + /* HE cap element is variable in size - set len to allow max size */ + /* + * TODO: 1 is added at the end of the calculation to accommodate for + * the temporary placing of the HE capabilities IE under EXT. + * Remove it once it is placed in the final place. + */ + if (supp_he) + local->scan_ies_len += + 2 + sizeof(struct ieee80211_he_cap_elem) + + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + 1; + if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. 
*/ local->hw.wiphy->max_scan_ssids = 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a44e5b4aaeda..0322d78007ad 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, struct cfg80211_chan_def *chandef, bool tracking) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -207,7 +208,27 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, } vht_chandef = *chandef; - if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper && + (le32_to_cpu(he_oper->he_oper_params) & + IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { + struct ieee80211_vht_operation he_oper_vht_cap; + + /* + * Set only first 3 bytes (other 2 aren't used in + * ieee80211_chandef_vht_oper() anyway) + */ + memcpy(&he_oper_vht_cap, he_oper->optional, 3); + he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); + + if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap, + &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + sdata_info(sdata, + "HE AP VHT information is invalid, disable HE\n"); + ret = IEEE80211_STA_DISABLE_HE; + goto out; + } + } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -300,12 +321,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, const u8 *bssid, u32 *changed) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_supported_band 
*sband; - struct ieee80211_channel *chan; + struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan; + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[chan->band]; struct cfg80211_chan_def chandef; u16 ht_opmode; u32 flags; @@ -320,6 +343,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) vht_oper = NULL; + /* don't check HE if we associated as non-HE station */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE || + !ieee80211_get_he_sta_cap(sband)) + he_oper = NULL; + if (WARN_ON_ONCE(!sta)) return -EINVAL; @@ -333,12 +361,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } - chan = sdata->vif.bss_conf.chandef.chan; - sband = local->hw.wiphy->bands[chan->band]; - - /* calculate new channel (type) based on HT/VHT operation IEs */ + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, true); /* @@ -582,6 +607,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); } +/* This function determines HE capability flags for the association + * and builds the IE. + */ +static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_supported_band *sband) +{ + u8 *pos; + const struct ieee80211_sta_he_cap *he_cap = NULL; + u8 he_cap_size; + + he_cap = ieee80211_get_he_sta_cap(sband); + if (!he_cap) + return; + + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. 
+ */ + he_cap_size = + 2 + 1 + sizeof(he_cap->he_cap_elem) + + ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + pos = skb_put(skb, he_cap_size); + ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); +} + static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -643,6 +696,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 2 * sband->n_channels + /* supported channels */ 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */ + 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ @@ -827,11 +883,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } + /* if present, add any custom IEs that go before HE */ + if (assoc_data->ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_OPMODE_NOTIF, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, + /* 11ai elements */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + + /* RIC already taken above, so no need to handle here anymore */ + noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len, + before_he, ARRAY_SIZE(before_he), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, assoc_data->ie + offset, noffset - offset); + offset = noffset; + } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) 
ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); - /* if present, add any custom non-vendor IEs that go after HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + ieee80211_add_he_ie(sdata, skb, sband); + + /* if present, add any custom non-vendor IEs that go after HE */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, @@ -898,6 +984,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) @@ -929,6 +1020,10 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; + /* Don't send NDPs when connected HE */ + if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; @@ -1700,9 +1795,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work) } /* MLME */ -static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - const u8 *wmm_param, size_t wmm_param_len) +static bool +ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const u8 *wmm_param, size_t wmm_param_len, + const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1749,6 +1846,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; + params[ac].mu_edca 
= !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; @@ -1756,6 +1856,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; @@ -1763,6 +1866,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: @@ -1771,6 +1877,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } @@ -3021,6 +3130,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + /* + * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark + * HE as disabled. If on the 5GHz band, make sure it supports VHT. 
+ */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || + (sband->band == NL80211_BAND_5GHZ && + ifmgd->flags & IEEE80211_STA_DISABLE_VHT) || + (!elems.he_cap && !elems.he_operation)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + (!elems.he_cap || !elems.he_operation)) { + mutex_unlock(&sdata->local->sta_mtx); + sdata_info(sdata, + "HE AP is missing HE capability/operation\n"); + ret = false; + goto out; + } + /* Set up internal HT/VHT capabilities */ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -3030,6 +3158,48 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems.vht_cap_elem, sta); + if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems.he_cap) { + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + elems.he_cap, + elems.he_cap_len, + sta); + + bss_conf->he_support = sta->sta.he_cap.has_he; + } else { + bss_conf->he_support = false; + } + + if (bss_conf->he_support) { + u32 he_oper_params = + le32_to_cpu(elems.he_operation->he_oper_params); + + bss_conf->bss_color = he_oper_params & + IEEE80211_HE_OPERATION_BSS_COLOR_MASK; + bss_conf->htc_trig_based_pkt_ext = + (he_oper_params & + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) << + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET; + bss_conf->frame_time_rts_th = + (he_oper_params & + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) << + IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET; + + bss_conf->multi_sta_back_32bit = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP; + + bss_conf->ack_enabled = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_ACK_EN; + + bss_conf->uora_exists = !!elems.uora_element; + if (elems.uora_element) + bss_conf->uora_ocw_range = elems.uora_element[0]; + + /* TODO: OPEN: what happens if BSS color disable is set? 
*/ + } + /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own @@ -3089,7 +3259,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) { + elems.wmm_param_len, + elems.mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3603,7 +3774,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) + elems.wmm_param_len, + elems.mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* @@ -3642,7 +3814,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, - elems.vht_operation, bssid, &changed)) { + elems.vht_operation, elems.he_operation, + bssid, &changed)) { mutex_unlock(&local->sta_mtx); sdata_info(sdata, "failed to follow AP %pM bandwidth change, disconnect\n", @@ -4279,6 +4452,66 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, return chains; } +static bool +ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, + const struct ieee80211_he_operation *he_op) +{ + const struct ieee80211_sta_he_cap *sta_he_cap = + ieee80211_get_he_sta_cap(sband); + u16 ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + int i; + + if (!sta_he_cap || !he_op) + return false; + + /* Need to go over for 80MHz, 160MHz and for 80+80 */ + for (i = 0; i < 3; i++) { + const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = + &sta_he_cap->he_mcs_nss_supp; + u16 sta_mcs_map_rx = + 
le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); + u16 sta_mcs_map_tx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); + u8 nss; + bool verified = true; + + /* + * For each band there is a maximum of 8 spatial streams + * possible. Each of the sta_mcs_map_* is a 16-bit struct built + * of 2 bits per NSS (1-8), with the values defined in enum + * ieee80211_he_mcs_support. Need to make sure STA TX and RX + * capabilities aren't less than the AP's minimum requirements + * for this HE BSS per SS. + * It is enough to find one such band that meets the reqs. + */ + for (nss = 8; nss > 0; nss--) { + u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; + u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; + u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + + if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + /* + * Make sure the HE AP doesn't require MCSs that aren't + * supported by the client + */ + if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { + verified = false; + break; + } + } + + if (verified) + return true; + } + + /* If here, STA doesn't meet AP's HE min requirements */ + return false; +} + static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss) { @@ -4287,6 +4520,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap = NULL; const struct ieee80211_ht_operation *ht_oper = NULL; const struct ieee80211_vht_operation *vht_oper = NULL; + const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; @@ -4342,6 +4576,25 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + ieee80211_get_he_sta_cap(sband)) { + const struct cfg80211_bss_ies *ies; + const u8 *he_oper_ie; + + ies = 
rcu_dereference(cbss->ies); + he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, + ies->data, ies->len); + if (he_oper_ie && + he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3])) + he_oper = (void *)(he_oper_ie + 3); + else + he_oper = NULL; + + if (!he_oper || + !ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { @@ -4358,7 +4611,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), @@ -4764,8 +5017,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; netdev_info(sdata->dev, - "disabling HT/VHT due to WEP/TKIP use\n"); + "disabling HE/HT/VHT due to WEP/TKIP use\n"); } } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 756ba176db1e..a16ba568e2a3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -175,6 +175,20 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12); + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -263,6 +277,19 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local 
*local, int mpdulen, chain; unsigned long chains = status->chains; struct ieee80211_vendor_radiotap rtap = {}; + struct ieee80211_radiotap_he he = {}; + struct ieee80211_radiotap_he_mu he_mu = {}; + + if (status->flag & RX_FLAG_RADIOTAP_HE) { + he = *(struct ieee80211_radiotap_he *)skb->data; + skb_pull(skb, sizeof(he)); + WARN_ON_ONCE(status->encoding != RX_ENC_HE); + } + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) { + he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data; + skb_pull(skb, sizeof(he_mu)); + } if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { rtap = *(struct ieee80211_vendor_radiotap *)skb->data; @@ -520,6 +547,89 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos++ = flags; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { +#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val)) + + if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) { + he.data6 |= HE_PREP(DATA6_NSTS, + FIELD_GET(RX_ENC_FLAG_STBC_MASK, + status->enc_flags)); + he.data3 |= HE_PREP(DATA3_STBC, 1); + } else { + he.data6 |= HE_PREP(DATA6_NSTS, status->nss); + } + +#define CHECK_GI(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ + (int)NL80211_RATE_INFO_HE_GI_##s) + + CHECK_GI(0_8); + CHECK_GI(1_6); + CHECK_GI(3_2); + + he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx); + he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm); + he.data3 |= HE_PREP(DATA3_CODING, + !!(status->enc_flags & RX_ENC_FLAG_LDPC)); + + he.data5 |= HE_PREP(DATA5_GI, status->he_gi); + + switch (status->bw) { + case RATE_INFO_BW_20: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); + break; + case RATE_INFO_BW_40: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); + break; + case RATE_INFO_BW_80: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); + break; + case RATE_INFO_BW_160: + 
he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); + break; + case RATE_INFO_BW_HE_RU: +#define CHECK_RU_ALLOC(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ + NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) + + CHECK_RU_ALLOC(26); + CHECK_RU_ALLOC(52); + CHECK_RU_ALLOC(106); + CHECK_RU_ALLOC(242); + CHECK_RU_ALLOC(484); + CHECK_RU_ALLOC(996); + CHECK_RU_ALLOC(2x996); + + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + status->he_ru + 4); + break; + default: + WARN_ONCE(1, "Invalid SU BW %d\n", status->bw); + } + + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + memcpy(pos, &he, sizeof(he)); + pos += sizeof(he); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU); + memcpy(pos, &he_mu, sizeof(he_mu)); + pos += sizeof(he_mu); + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; @@ -613,6 +723,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, rcu_dereference(local->monitor_sdata); bool only_monitor = false; + if (status->flag & RX_FLAG_RADIOTAP_HE) + rtap_space += sizeof(struct ieee80211_radiotap_he); + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) + rtap_space += sizeof(struct ieee80211_radiotap_he_mu); + if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -3386,8 +3502,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, status = IEEE80211_SKB_RXCB((rx->skb)); sband = rx->local->hw.wiphy->bands[status->band]; - if (!(status->encoding == RX_ENC_HT) && - !(status->encoding == RX_ENC_VHT)) + if (status->encoding == RX_ENC_LEGACY) rate = 
&sband->bitrates[status->rate_idx]; ieee80211_rx_cooked_monitor(rx, rate); @@ -4386,6 +4501,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss)) goto drop; break; + case RX_ENC_HE: + if (WARN_ONCE(status->rate_idx > 11 || + !status->nss || + status->nss > 8, + "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n", + status->rate_idx, status->nss)) + goto drop; + break; default: WARN_ON_ONCE(1); /* fall through */ diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index aa96fddfbfc2..aa8fe771a8db 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1323,6 +1323,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | @@ -1968,7 +1973,7 @@ sta_get_last_rx_stats(struct sta_info *sta) return stats; } -static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, +static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); @@ -2005,6 +2010,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } + case STA_STATS_RATE_TYPE_HE: + rinfo->flags = RATE_INFO_FLAGS_HE_MCS; + rinfo->mcs = STA_STATS_GET(HE_MCS, rate); + rinfo->nss = STA_STATS_GET(HE_NSS, rate); + rinfo->he_gi = STA_STATS_GET(HE_GI, rate); + rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); + rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); + break; } } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 81b35f623792..9a04327d71d1 100644 --- a/net/mac80211/sta_info.h +++ 
b/net/mac80211/sta_info.h @@ -170,7 +170,7 @@ struct tid_ampdu_tx { u8 dialog_token; u8 stop_initiator; bool tx_stop; - u8 buf_size; + u16 buf_size; u16 failed_bar_ssn; bool bar_pending; @@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats { int last_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - u16 last_rate; + u32 last_rate; struct u64_stats_sync syncp; u64 bytes; u64 msdu[IEEE80211_NUM_TIDS + 1]; @@ -764,6 +764,7 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_LEGACY, STA_STATS_RATE_TYPE_HT, STA_STATS_RATE_TYPE_VHT, + STA_STATS_RATE_TYPE_HE, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) @@ -771,9 +772,14 @@ enum sta_stats_type { #define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) #define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_BW GENMASK(11, 8) #define STA_STATS_FIELD_SGI GENMASK(12, 12) #define STA_STATS_FIELD_TYPE GENMASK(15, 13) +#define STA_STATS_FIELD_HE_RU GENMASK(18, 16) +#define STA_STATS_FIELD_HE_GI GENMASK(20, 19) +#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -782,7 +788,7 @@ enum sta_stats_type { static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { - u16 r; + u32 r; r = STA_STATS_FIELD(BW, s->bw); @@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(LEGACY_BAND, s->band); r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); break; + case RX_ENC_HE: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE); + r |= STA_STATS_FIELD(HE_NSS, s->nss); + r |= STA_STATS_FIELD(HE_MCS, s->rate_idx); + r |= STA_STATS_FIELD(HE_GI, s->he_gi); + r |= STA_STATS_FIELD(HE_RU, s->he_ru); + r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); + break; default: WARN_ON(1); return 
STA_STATS_RATE_INVALID; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 80a7edf8d314..0ab69a1964f8 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -92,7 +92,7 @@ STA_ENTRY \ __field(u16, tid) \ __field(u16, ssn) \ - __field(u8, buf_size) \ + __field(u16, buf_size) \ __field(bool, amsdu) \ __field(u16, timeout) \ __field(u16, action) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b744b10465c3..c77c84325348 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1095,6 +1095,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; + case WLAN_EID_EXTENSION: + if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA && + elen >= (sizeof(*elems->mu_edca_param_set) + 1)) { + elems->mu_edca_param_set = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) { + elems->he_cap = (void *)&pos[1]; + elems->he_cap_len = elen - 1; + } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION && + elen >= sizeof(*elems->he_operation) && + elen >= ieee80211_he_oper_size(&pos[1])) { + elems->he_operation = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) { + elems->uora_element = (void *)&pos[1]; + } + break; default: break; } @@ -1356,6 +1371,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, size_t *offset, u32 flags) { struct ieee80211_supported_band *sband; + const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; size_t noffset; int supp_rates_len, i; @@ -1463,11 +1479,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->ht_cap.cap); } - /* - * If adding more here, adjust code in main.c - * that calculates local->scan_ies_len. 
- */ - /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { @@ -1510,6 +1521,39 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->vht_cap.cap); } + /* insert custom IEs that go before HE */ + if (ie && ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, + WLAN_EID_AP_CSN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_he, ARRAY_SIZE(before_he), + *offset); + if (end - pos < noffset - *offset) + goto out_err; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; + } + + he_cap = ieee80211_get_he_sta_cap(sband); + if (he_cap) { + pos = ieee80211_ie_build_he_cap(pos, he_cap, end); + if (!pos) + goto out_err; + } + + /* + * If adding more here, adjust code in main.c + * that calculates local->scan_ies_len. + */ + return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); @@ -2396,6 +2440,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos; } +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end) +{ + u8 n; + u8 ie_len; + u8 *orig_pos = pos; + + /* Make sure we have place for the IE */ + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. 
+ */ + if (!he_cap) + return orig_pos; + + n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); + ie_len = 2 + 1 + + sizeof(he_cap->he_cap_elem) + n + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + + if ((end - pos) < ie_len) + return orig_pos; + + *pos++ = WLAN_EID_EXTENSION; + pos++; /* We'll set the size later below */ + *pos++ = WLAN_EID_EXT_HE_CAPABILITY; + + /* Fixed data */ + memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem)); + pos += sizeof(he_cap->he_cap_elem); + + memcpy(pos, &he_cap->he_mcs_nss_supp, n); + pos += n; + + /* Check if PPE Threshold should be present */ + if ((he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + goto end; + + /* + * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: + * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) + */ + n = hweight8(he_cap->ppe_thres[0] & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + /* Copy PPE Thresholds */ + memcpy(pos, &he_cap->ppe_thres, n); + pos += n; + +end: + orig_pos[1] = (pos - orig_pos) - 2; + return pos; +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) -- cgit v1.2.3 From 0eb71a9da5796851fa87ddc1a534066c0fe54055 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: split rhashtable.h Due to the use of rhashtables in net namespaces, rhashtable.h is included in lots of the kernel, so a small change can require a large recompilation. This makes development painful.
This patch splits out rhashtable-types.h which just includes the major type declarations, and does not include (non-trivial) inline code. rhashtable.h is no longer included by anything in the include/ directory. Common include files only include rhashtable-types.h so a large recompilation is only triggered when that changes. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- MAINTAINERS | 2 + drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/mroute_base.h | 2 +- include/linux/rhashtable-types.h | 139 +++++++++++++++++++++++++++++ include/linux/rhashtable.h | 127 +------------------------- include/net/inet_frag.h | 2 +- include/net/netfilter/nf_flow_table.h | 2 +- include/net/sctp/structs.h | 2 +- include/net/seg6.h | 2 +- include/net/seg6_hmac.h | 2 +- ipc/msg.c | 1 + ipc/sem.c | 1 + ipc/shm.c | 1 + ipc/util.c | 1 + lib/rhashtable.c | 1 + net/ipv4/inet_fragment.c | 1 + net/ipv4/ipmr.c | 1 + net/ipv4/ipmr_base.c | 1 + net/ipv6/ip6mr.c | 1 + net/ipv6/seg6.c | 1 + net/ipv6/seg6_hmac.c | 1 + net/netfilter/nf_tables_api.c | 1 + net/sctp/input.c | 1 + net/sctp/socket.c | 1 + 26 files changed, 166 insertions(+), 133 deletions(-) create mode 100644 include/linux/rhashtable-types.h (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index edf3cf5ea691..99e5cef8172e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12162,7 +12162,9 @@ M: Herbert Xu L: netdev@vger.kernel.org S: Maintained F: lib/rhashtable.c +F: lib/test_rhashtable.c F: include/linux/rhashtable.h +F: include/linux/rhashtable-types.h RICOH R5C592 MEMORYSTICK DRIVER M: Maxim Levitsky diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 0dbe2d9e22d6..1adb968b8354 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff 
--git a/include/linux/ipc.h b/include/linux/ipc.h index 6cc2df7f7ac9..e1c9eea6015b 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index b5630c8eb2f3..6cea726612b7 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct user_namespace; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index d633f737b3c6..fd436cdd4725 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -2,7 +2,7 @@ #define __LINUX_MROUTE_BASE_H #include -#include +#include #include #include #include diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h new file mode 100644 index 000000000000..9740063ff13b --- /dev/null +++ b/include/linux/rhashtable-types.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Simple structures that might be needed in include + * files. 
+ */ + +#ifndef _LINUX_RHASHTABLE_TYPES_H +#define _LINUX_RHASHTABLE_TYPES_H + +#include +#include +#include +#include + +struct rhash_head { + struct rhash_head __rcu *next; +}; + +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct bucket_table; + +/** + * struct rhashtable_compare_arg - Key for the function rhashtable_compare + * @ht: Hash table + * @key: Key to compare against + */ +struct rhashtable_compare_arg { + struct rhashtable *ht; + const void *key; +}; + +typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); +typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); +typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, + const void *obj); + +/** + * struct rhashtable_params - Hash table construction parameters + * @nelem_hint: Hint on number of elements, should be 75% of desired size + * @key_len: Length of key + * @key_offset: Offset of key in struct to be hashed + * @head_offset: Offset of rhash_head in struct to be hashed + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) + * @automatic_shrinking: Enable automatic shrinking of tables + * @nulls_base: Base value to generate nulls marker + * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) + * @obj_hashfn: Function to hash object + * @obj_cmpfn: Function to compare key with object + */ +struct rhashtable_params { + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; + unsigned int max_size; + u16 min_size; + bool automatic_shrinking; + u8 locks_mul; + u32 nulls_base; + rht_hashfn_t hashfn; + rht_obj_hashfn_t obj_hashfn; + rht_obj_cmpfn_t obj_cmpfn; +}; + +/** + * struct rhashtable - Hash table handle + * @tbl: Bucket table + * @key_len: Key length for hashfn + * @max_elems: Maximum number of elements in table + * @p: Configuration parameters + * @rhlist: True if this is an 
rhltable + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table + */ +struct rhashtable { + struct bucket_table __rcu *tbl; + unsigned int key_len; + unsigned int max_elems; + struct rhashtable_params p; + bool rhlist; + struct work_struct run_work; + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; +}; + +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @tbl: The table that we were walking over + */ +struct rhashtable_walker { + struct list_head list; + struct bucket_table *tbl; +}; + +/** + * struct rhashtable_iter - Hash table iterator + * @ht: Table to iterate through + * @p: Current pointer + * @list: Current hash list pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhlist_head *list; + struct rhashtable_walker walker; + unsigned int slot; + unsigned int skip; + bool end_of_table; +}; + +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); + +#endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 4e1f535c2034..48754ab07cdf 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * @@ -17,16 +18,14 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include -#include #include #include #include #include #include -#include #include +#include /* * The 
end of the chain is marked with a special nulls marks which has * the following format: @@ -64,15 +63,6 @@ */ #define RHT_ELASTICITY 16u -struct rhash_head { - struct rhash_head __rcu *next; -}; - -struct rhlist_head { - struct rhash_head rhead; - struct rhlist_head __rcu *next; -}; - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -102,114 +92,6 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -/** - * struct rhashtable_compare_arg - Key for the function rhashtable_compare - * @ht: Hash table - * @key: Key to compare against - */ -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); -typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, - const void *obj); - -struct rhashtable; - -/** - * struct rhashtable_params - Hash table construction parameters - * @nelem_hint: Hint on number of elements, should be 75% of desired size - * @key_len: Length of key - * @key_offset: Offset of key in struct to be hashed - * @head_offset: Offset of rhash_head in struct to be hashed - * @max_size: Maximum size while expanding - * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) - * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker - * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) - * @obj_hashfn: Function to hash object - * @obj_cmpfn: Function to compare key with object - */ -struct rhashtable_params { - u16 nelem_hint; - u16 key_len; - u16 key_offset; - u16 head_offset; - unsigned int max_size; - u16 min_size; - bool automatic_shrinking; - u8 locks_mul; - u32 nulls_base; - rht_hashfn_t hashfn; - rht_obj_hashfn_t obj_hashfn; - rht_obj_cmpfn_t obj_cmpfn; -}; - -/** - * struct 
rhashtable - Hash table handle - * @tbl: Bucket table - * @key_len: Key length for hashfn - * @max_elems: Maximum number of elements in table - * @p: Configuration parameters - * @rhlist: True if this is an rhltable - * @run_work: Deferred worker to expand/shrink asynchronously - * @mutex: Mutex to protect current/future table swapping - * @lock: Spin lock to protect walker list - * @nelems: Number of elements in table - */ -struct rhashtable { - struct bucket_table __rcu *tbl; - unsigned int key_len; - unsigned int max_elems; - struct rhashtable_params p; - bool rhlist; - struct work_struct run_work; - struct mutex mutex; - spinlock_t lock; - atomic_t nelems; -}; - -/** - * struct rhltable - Hash table with duplicate objects in a list - * @ht: Underlying rhtable - */ -struct rhltable { - struct rhashtable ht; -}; - -/** - * struct rhashtable_walker - Hash table walker - * @list: List entry on list of walkers - * @tbl: The table that we were walking over - */ -struct rhashtable_walker { - struct list_head list; - struct bucket_table *tbl; -}; - -/** - * struct rhashtable_iter - Hash table iterator - * @ht: Table to iterate through - * @p: Current pointer - * @list: Current hash list pointer - * @walker: Associated rhashtable walker - * @slot: Current slot - * @skip: Number of entries to skip in slot - */ -struct rhashtable_iter { - struct rhashtable *ht; - struct rhash_head *p; - struct rhlist_head *list; - struct rhashtable_walker walker; - unsigned int slot; - unsigned int skip; - bool end_of_table; -}; - static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -376,11 +258,6 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, } #endif /* CONFIG_PROVE_LOCKING */ -int rhashtable_init(struct rhashtable *ht, - const struct rhashtable_params *params); -int rhltable_init(struct rhltable *hlt, - const struct rhashtable_params *params); - void 
*rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index ed07e3786d98..f4272a29dc44 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,7 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include +#include struct netns_frags { /* sysctls */ diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ba9fa4592f2b..0e355f4a3d76 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dbe1b911a24d..e0f962d27386 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -48,7 +48,7 @@ #define __sctp_structs_h__ #include -#include +#include #include /* linux/in.h needs this!! */ #include /* We get struct sockaddr_in. 
*/ #include /* We get struct in6_addr */ diff --git a/include/net/seg6.h b/include/net/seg6.h index e029e301faa5..2567941a2f32 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 69c3a106056b..7fda469e2758 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include #define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 diff --git a/ipc/msg.c b/ipc/msg.c index 3b6545302598..203281198079 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/ipc/sem.c b/ipc/sem.c index 5af1943ad782..29c0347ef11d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/ipc/shm.c b/ipc/shm.c index 051a3e1fb8df..d4daf78df6da 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -43,6 +43,7 @@ #include #include #include +#include #include diff --git a/ipc/util.c b/ipc/util.c index 4e81182fa0ac..fdffff41f65b 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -63,6 +63,7 @@ #include #include #include +#include #include diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9427b5766134..c9fafea7dc6e 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -28,6 +28,7 @@ #include #include #include +#include #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9e35b81d093..316518f87294 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f79b9803a16..82f914122f1b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include #include 
#include +#include #include #include #include diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index cafb0506c8c9..1ad9aa62a97b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -2,6 +2,7 @@ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ +#include #include /* Sets everything common except 'dev', since that is done under locking */ diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0d0f0053bb11..d0b7e0249c13 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 0fdf2a55e746..8d0ba757a46c 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 33fb35cbfac1..b1791129a875 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 896d4a36081d..3f211e1025c1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/input.c b/net/sctp/input.c index ba8a6e6c36fa..9bbc5f92c941 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -56,6 +56,7 @@ #include #include #include +#include /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d20f7addee19..0e91e83eea5a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 9f9a707738aa7a8b9f78a641b83927ada256a626 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: remove nulls_base and related code. 
This "feature" is unused, undocumented, and untested and so doesn't really belong. A patch is under development to properly implement support for detecting when a search gets diverted down a different chain, which is the common purpose of nulls markers. This patch actually fixes a bug too. The table resizing allows a table to grow to 2^31 buckets, but the hash is truncated to 27 bits - any growth beyond 2^27 is wasteful and ineffective. This patch results in NULLS_MARKER(0) being used for all chains, and leaves the use of rht_is_a_nulls() to test for it. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable-types.h | 2 -- include/linux/rhashtable.h | 33 +++------------------------------ lib/rhashtable.c | 8 -------- lib/test_rhashtable.c | 5 +---- net/core/xdp.c | 4 ++-- 5 files changed, 6 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 9740063ff13b..763d613ce2c2 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -50,7 +50,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @min_size: Minimum size while shrinking * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object * @obj_cmpfn: Function to compare key with object @@ -64,7 +63,6 @@ struct rhashtable_params { u16 min_size; bool automatic_shrinking; u8 locks_mul; - u32 nulls_base; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; rht_obj_cmpfn_t obj_cmpfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 48754ab07cdf..d9f719af7936 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -28,25 +28,8 @@ #include /* * The end of the chain is
marked with a special nulls marks which has - * the following format: - * - * +-------+-----------------------------------------------------+-+ - * | Base | Hash |1| - * +-------+-----------------------------------------------------+-+ - * - * Base (4 bits) : Reserved to distinguish between multiple tables. - * Specified via &struct rhashtable_params.nulls_base. - * Hash (27 bits): Full hash (unmasked) of first element added to bucket - * 1 (1 bit) : Nulls marker (always set) - * - * The remaining bits of the next pointer remain unused for now. + * the least significant bit set. */ -#define RHT_BASE_BITS 4 -#define RHT_HASH_BITS 27 -#define RHT_BASE_SHIFT RHT_HASH_BITS - -/* Base bits plus 1 bit for nulls marker */ -#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) /* Maximum chain length before rehash * @@ -92,24 +75,14 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) -{ - return NULLS_MARKER(ht->p.nulls_base + hash); -} - #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ - ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) + ((ptr) = (typeof(ptr)) NULLS_MARKER(0)) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } -static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr) -{ - return ((unsigned long) ptr) >> 1; -} - static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { @@ -119,7 +92,7 @@ static inline void *rht_obj(const struct rhashtable *ht, static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { - return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1); + return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, diff --git a/lib/rhashtable.c b/lib/rhashtable.c index c9fafea7dc6e..688693c919be 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -995,7 +995,6 
@@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, - * .nulls_base = (1U << RHT_BASE_SHIFT), * }; * * Configuration Example 2: Variable length keys @@ -1029,9 +1028,6 @@ int rhashtable_init(struct rhashtable *ht, (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; - if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) - return -EINVAL; - memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); @@ -1096,10 +1092,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) { int err; - /* No rhlist NULLs marking for now. */ - if (params->nulls_base) - return -EINVAL; - err = rhashtable_init(&hlt->ht, params); hlt->ht.rhlist = true; return err; diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 6ca59ffcacbe..82ac39ce5310 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed) { const struct test_obj_rhl *obj = data; - return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE; + return (obj->value.id % 10); } static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) @@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = { .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(struct test_obj_val), .hashfn = jhash, - .nulls_base = (3U << RHT_BASE_SHIFT), }; static struct rhashtable_params test_rht_params_dup = { @@ -296,8 +295,6 @@ static int __init test_rhltable(unsigned int entries) if (!obj_in_table) goto out_free; - /* nulls_base not supported in rhlist interface */ - test_rht_params.nulls_base = 0; err = rhltable_init(&rhlt, &test_rht_params); if (WARN_ON(err)) goto out_free; diff --git a/net/core/xdp.c b/net/core/xdp.c index 9d1f22072d5d..31c58719b5a9 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -45,8 +45,8 @@ static u32 xdp_mem_id_hashfn(const 
void *data, u32 len, u32 seed) BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) != sizeof(u32)); - /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ - return key << RHT_HASH_RESERVED_SPACE; + /* Use cyclic increasing ID as direct hash key */ + return key; } static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, -- cgit v1.2.3 From 3f6c65d6255a872846c44182c82c78d3dc6239f5 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 19 Jun 2018 21:42:50 -0700 Subject: tcp: ignore rcv_rtt sample with old ts ecr value When receiving multiple packets with the same ts ecr value, only try to compute rcv_rtt sample with the earliest received packet. This is because the rcv_rtt calculated by later received packets could possibly include long idle time or other types of delay. For example: (1) server sends last packet of reply with TS val V1 (2) client ACKs last packet of reply with TS ecr V1 (3) long idle time passes (4) client sends next request data packet with TS ecr V1 (again!) At this time, the rcv_rtt computed on server with TS ecr V1 will be inflated with the idle time and should get ignored. Signed-off-by: Wei Wang Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 14 +++++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 72705eaf4b84..3dbea6610304 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -350,6 +350,7 @@ struct tcp_sock { #endif /* Receiver side RTT estimation */ + u32 rcv_rtt_last_tsecr; struct { u32 rtt_us; u32 seq; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 141acd92e58a..47c45d5be9f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2563,6 +2563,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 355d3dffd021..76ca88f63b70 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -582,9 +582,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->rx_opt.rcv_tsecr && - (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) + return; + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; + + if (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; @@ -5475,6 +5478,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); + /* When receiving pure ack in fast path, update + * last ts ecr directly instead of calling + * tcp_rcv_rtt_measure_ts() + */ + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); -- cgit v1.2.3 From cadefe5f584abaac40dce72009e4de738cbff467 Mon Sep 17 00:00:00 2001 From: Eric 
Dumazet Date: Wed, 20 Jun 2018 16:07:35 -0400 Subject: tcp_bbr: fix bbr pacing rate for internal pacing This commit makes BBR use only the MSS (without any headers) to calculate pacing rates when internal TCP-layer pacing is used. This is necessary to achieve the correct pacing behavior in this case, since tcp_internal_pacing() uses only the payload length to calculate pacing delays. Signed-off-by: Kevin Yang Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 11 +++++++++++ net/ipv4/tcp_bbr.c | 6 +++++- net/ipv4/tcp_output.c | 14 -------------- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0448e7c5d2b4..822ee49ed0f9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1184,6 +1184,17 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) return tp->is_cwnd_limited; } +/* BBR congestion control needs pacing. + * Same remark for SO_MAX_PACING_RATE. + * sch_fq packet scheduler is efficiently handling pacing, + * but is not always installed/used. + * Return true if TCP stack should pace packets itself. + */ +static inline bool tcp_needs_internal_pacing(const struct sock *sk) +{ + return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; +} + /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window. 
* We do not want to add fuel to the fire, or abort too early, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 58e2f479ffb4..3b5f45b9e81e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -205,7 +205,11 @@ static u32 bbr_bw(const struct sock *sk) */ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { - rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + unsigned int mss = tcp_sk(sk)->mss_cache; + + if (!tcp_needs_internal_pacing(sk)) + mss = tcp_mss_to_mtu(sk, mss); + rate *= mss; rate *= gain; rate >>= BBR_SCALE; rate *= USEC_PER_SEC; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e08b409c71e..f8f6129160dd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -973,17 +973,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -/* BBR congestion control needs pacing. - * Same remark for SO_MAX_PACING_RATE. - * sch_fq packet scheduler is efficiently handling pacing, - * but is not always installed/used. - * Return true if TCP stack should pace packets itself. - */ -static bool tcp_needs_internal_pacing(const struct sock *sk) -{ - return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; -} - static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) { u64 len_ns; @@ -995,9 +984,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) if (!rate || rate == ~0U) return; - /* Should account for header sizes as sch_fq does, - * but lets make things simple. 
- */ len_ns = (u64)skb->len * NSEC_PER_SEC; do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, -- cgit v1.2.3 From 6c1f0a1ffb7c2b0501521b9fc1f53b4109f1791b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 22 Jun 2018 10:51:00 -0700 Subject: net: drivers/net: Convert random_ether_addr to eth_random_addr random_ether_addr is a #define for eth_random_addr which is generally preferred in kernel code by ~3:1 Convert the uses of random_ether_addr to enable removing the #define Miscellanea: o Convert &vfmac[0] to equivalent vfmac and avoid unnecessary line wrap Signed-off-by: Joe Perches Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_main.c | 5 ++--- drivers/net/ethernet/cortina/gemini.c | 2 +- drivers/net/ethernet/hisilicon/hip04_eth.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/microchip/lan743x_main.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c | 2 +- drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +- drivers/net/ethernet/sfc/ef10_sriov.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 4 ++-- drivers/net/ntb_netdev.c | 2 +- drivers/net/usb/lan78xx.c | 2 +- drivers/net/wireless/ath/ath9k/hw.c | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- 14 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 8a815bb57177..7cb4e753829b 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -3569,9 +3569,8 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) for (j = 0; j < octeon_dev->sriov_info.max_vfs; j++) { u8 vfmac[ETH_ALEN]; - random_ether_addr(&vfmac[0]); - if (__liquidio_set_vf_mac(netdev, j, - &vfmac[0], false)) { + eth_random_addr(vfmac); + if (__liquidio_set_vf_mac(netdev, j, vfmac, false)) { 
dev_err(&octeon_dev->pci_dev->dev, "Error setting VF%d MAC address\n", j); diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 6d7404f66f84..ce1f04fdbf70 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2435,7 +2435,7 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev) port->mac_addr[0], port->mac_addr[1], port->mac_addr[2]); dev_info(dev, "using a random ethernet address\n"); - random_ether_addr(netdev->dev_addr); + eth_random_addr(netdev->dev_addr); } gmac_write_mac_address(netdev); diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index 340e28211135..14374a856d30 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -904,7 +904,7 @@ static int hip04_mac_probe(struct platform_device *pdev) hip04_config_port(ndev, SPEED_100, DUPLEX_FULL); hip04_config_fifo(priv); - random_ether_addr(ndev->dev_addr); + eth_random_addr(ndev->dev_addr); hip04_update_mac_address(ndev); ret = hip04_alloc_ring(ndev, d); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index c944bd10b03d..95e9dfbe9839 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11978,7 +11978,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi) snprintf(netdev->name, IFNAMSIZ, "%.*sv%%d", IFNAMSIZ - 4, pf->vsi[pf->lan_vsi]->netdev->name); - random_ether_addr(mac_addr); + eth_random_addr(mac_addr); spin_lock_bh(&vsi->mac_filter_hash_lock); i40e_add_mac_filter(vsi, mac_addr); diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index dd947e4dd3ce..e1747a490066 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -828,7 +828,7 @@ static int lan743x_mac_init(struct 
lan743x_adapter *adapter) } if (!mac_address_valid) - random_ether_addr(adapter->mac_address); + eth_random_addr(adapter->mac_address); lan743x_mac_set_address(adapter, adapter->mac_address); ether_addr_copy(netdev->dev_addr, adapter->mac_address); return 0; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c index 0c744b9c6e0a..77e386ebff09 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -212,7 +212,7 @@ int qlcnic_sriov_init(struct qlcnic_adapter *adapter, int num_vfs) vp->max_tx_bw = MAX_BW; vp->min_tx_bw = MIN_BW; vp->spoofchk = false; - random_ether_addr(vp->mac); + eth_random_addr(vp->mac); dev_info(&adapter->pdev->dev, "MAC Address %pM is configured for VF %d\n", vp->mac, i); diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index b9a7548ec6a0..0afc3d335d56 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -210,7 +210,7 @@ void rmnet_vnd_setup(struct net_device *rmnet_dev) rmnet_dev->netdev_ops = &rmnet_vnd_ops; rmnet_dev->mtu = RMNET_DFLT_PACKET_SIZE; rmnet_dev->needed_headroom = RMNET_NEEDED_HEADROOM; - random_ether_addr(rmnet_dev->dev_addr); + eth_random_addr(rmnet_dev->dev_addr); rmnet_dev->tx_queue_len = RMNET_TX_QUEUE_LEN; /* Raw IP mode */ diff --git a/drivers/net/ethernet/sfc/ef10_sriov.c b/drivers/net/ethernet/sfc/ef10_sriov.c index 019cef1d3cf7..8820be83ce85 100644 --- a/drivers/net/ethernet/sfc/ef10_sriov.c +++ b/drivers/net/ethernet/sfc/ef10_sriov.c @@ -199,7 +199,7 @@ static int efx_ef10_sriov_alloc_vf_vswitching(struct efx_nic *efx) return -ENOMEM; for (i = 0; i < efx->vf_count; i++) { - random_ether_addr(nic_data->vf[i].mac); + eth_random_addr(nic_data->vf[i].mac); nic_data->vf[i].efx = NULL; nic_data->vf[i].vlan = EFX_EF10_NO_VLAN; diff --git 
a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 358edab9e72e..093998124149 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2927,7 +2927,7 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv) dev_info(cpsw->dev, "cpsw: Detected MACID = %pM\n", priv_sl2->mac_addr); } else { - random_ether_addr(priv_sl2->mac_addr); + eth_random_addr(priv_sl2->mac_addr); dev_info(cpsw->dev, "cpsw: Random MACID = %pM\n", priv_sl2->mac_addr); } diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index e40aa3e31af2..6ebf110cd594 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -2052,7 +2052,7 @@ static int netcp_create_interface(struct netcp_device *netcp_device, if (is_valid_ether_addr(efuse_mac_addr)) ether_addr_copy(ndev->dev_addr, efuse_mac_addr); else - random_ether_addr(ndev->dev_addr); + eth_random_addr(ndev->dev_addr); devm_iounmap(dev, efuse); devm_release_mem_region(dev, res.start, size); @@ -2061,7 +2061,7 @@ static int netcp_create_interface(struct netcp_device *netcp_device, if (mac_addr) ether_addr_copy(ndev->dev_addr, mac_addr); else - random_ether_addr(ndev->dev_addr); + eth_random_addr(ndev->dev_addr); } ret = of_property_read_string(node_interface, "rx-channel", diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c index 9f6f7ccd44f7..b12023bc2cab 100644 --- a/drivers/net/ntb_netdev.c +++ b/drivers/net/ntb_netdev.c @@ -430,7 +430,7 @@ static int ntb_netdev_probe(struct device *client_dev) ndev->hw_features = ndev->features; ndev->watchdog_timeo = msecs_to_jiffies(NTB_TX_TIMEOUT_MS); - random_ether_addr(ndev->perm_addr); + eth_random_addr(ndev->perm_addr); memcpy(ndev->dev_addr, ndev->perm_addr, ndev->addr_len); ndev->netdev_ops = &ntb_netdev_ops; diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 8dff87ec6d99..a89570f34937 100644 --- a/drivers/net/usb/lan78xx.c +++ 
b/drivers/net/usb/lan78xx.c @@ -1720,7 +1720,7 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev) "MAC address read from EEPROM"); } else { /* generate random MAC */ - random_ether_addr(addr); + eth_random_addr(addr); netif_dbg(dev, ifup, dev->net, "MAC address set to random addr"); } diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c index e60bea4604e4..1665066f4e24 100644 --- a/drivers/net/wireless/ath/ath9k/hw.c +++ b/drivers/net/wireless/ath/ath9k/hw.c @@ -496,7 +496,7 @@ static void ath9k_hw_init_macaddr(struct ath_hw *ah) ath_err(common, "eeprom contains invalid mac address: %pM\n", common->macaddr); - random_ether_addr(common->macaddr); + eth_random_addr(common->macaddr); ath_err(common, "random mac address will be used: %pM\n", common->macaddr); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index a2de5a44bd41..ff9659af6b91 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1449,7 +1449,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) * detection frames. Set the locally administered bit to avoid * collisions with users mac addresses. */ - random_ether_addr(bat_priv->bla.loopdetect_addr); + eth_random_addr(bat_priv->bla.loopdetect_addr); bat_priv->bla.loopdetect_addr[0] = 0xba; bat_priv->bla.loopdetect_addr[1] = 0xbe; bat_priv->bla.loopdetect_lasttime = jiffies; -- cgit v1.2.3 From 951a06e78d5af9ffda9f00139fef1186c202f8ae Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 26 May 2018 11:40:32 +0200 Subject: batman-adv: Drop "experimental" from BATMAN_V Kconfig The Kconfig option BATMAN_ADV_BATMAN_V is now enabled by default when the BATMAN_ADV is enabled. A feature which is enabled by default for a module should not be considered experimental. 
Reported-by: Joe Perches Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index de8034d80623..98c7f3820d53 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -33,7 +33,7 @@ config BATMAN_ADV tools. config BATMAN_ADV_BATMAN_V - bool "B.A.T.M.A.N. V protocol (experimental)" + bool "B.A.T.M.A.N. V protocol" depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y) default y help -- cgit v1.2.3 From ab4e58534dee7f273badfe21fa29cbe24553682f Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sun, 3 Jun 2018 18:52:03 +0800 Subject: batman-adv: enable DAT by default at compile time DAT (Distributed ARP Table) has been enabled by default in the out-of-tree batman-adv kernel module for several years already. It can now be enabled in the kernel too. Signed-off-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 98c7f3820d53..ff38df8bab91 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -60,7 +60,7 @@ config BATMAN_ADV_BLA config BATMAN_ADV_DAT bool "Distributed ARP Table" depends on BATMAN_ADV && INET - default n + default y help This option enables DAT (Distributed ARP Table), a DHT based mechanism that increases ARP reliability on sparse wireless -- cgit v1.2.3 From 55f949c4fa6cefb199fd6208c275a3457e39e4bc Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 26 May 2018 11:40:31 +0200 Subject: batman-adv: Remove "default n" in Kconfig The "default n" is the default value for any bool or tristate Kconfig setting. It is therefore not necessary to add it to a config entry. 
Reported-by: Sergei Shtylyov Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index ff38df8bab91..361116f77cb9 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -24,7 +24,6 @@ config BATMAN_ADV depends on NET select CRC16 select LIBCRC32C - default n help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. The @@ -70,7 +69,6 @@ config BATMAN_ADV_DAT config BATMAN_ADV_NC bool "Network Coding" depends on BATMAN_ADV - default n help This option enables network coding, a mechanism that aims to increase the overall network throughput by fusing multiple @@ -84,7 +82,6 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) - default n help This option enables the multicast optimisation which aims to reduce the air overhead while improving the reliability of @@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS bool "batman-adv debugfs entries" depends on BATMAN_ADV depends on DEBUG_FS - default n help Enable this to export routing related debug tables via debugfs. The information for each soft-interface and used hard-interface can be -- cgit v1.2.3 From 9b42c1f179a614e11893ae4619f0304a38f481ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 12:44:26 +0200 Subject: xfrm: Extend the output_mark to support input direction and masking. We already support setting an output mark at the xfrm_state; unfortunately, this does not support the input direction and masking the marks that will be applied to the skb. This change adds support for applying a masked value in both directions. The existing XFRMA_OUTPUT_MARK number is reused for this purpose and, as it is now bi-directional, it is renamed to XFRMA_SET_MARK.
An additional XFRMA_SET_MARK_MASK attribute is added for setting the mask. If the mask attribute is not provided, it is set to 0xffffffff, keeping the existing 'full mask' semantics of XFRMA_OUTPUT_MARK. Co-developed-by: Tobias Brunner Co-developed-by: Eyal Birger Co-developed-by: Lorenzo Colitti Signed-off-by: Steffen Klassert Signed-off-by: Tobias Brunner Signed-off-by: Eyal Birger Signed-off-by: Lorenzo Colitti --- include/net/xfrm.h | 9 ++++++++- include/uapi/linux/xfrm.h | 4 +++- net/xfrm/xfrm_device.c | 3 ++- net/xfrm/xfrm_input.c | 2 ++ net/xfrm/xfrm_output.c | 3 +-- net/xfrm/xfrm_policy.c | 5 +++-- net/xfrm/xfrm_user.c | 48 +++++++++++++++++++++++++++++++++++++---------- 7 files changed, 57 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 557122846e0e..3dc83ba26f62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -166,7 +166,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; - u32 output_mark; + struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; @@ -2012,6 +2012,13 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } +static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) +{ + struct xfrm_mark *m = &x->props.smark; + + return (m->v & m->m) | (mark & ~m->m); +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index e3af2859188b..5a6ed7ce5a29 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -305,9 +305,11 @@ enum xfrm_attr_type_t { XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ - XFRMA_OUTPUT_MARK, /* __u32 */ + XFRMA_SET_MARK, /* __u32 */ + XFRMA_SET_MARK_MASK, /* __u32 */ __XFRMA_MAX +#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ #define XFRMA_MAX (__XFRMA_MAX - 1) }; diff
--git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 175941e15a6e..16c1230d20fa 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, - x->props.family, x->props.output_mark); + x->props.family, + xfrm_smark_get(0, x)); if (IS_ERR(dst)) return 0; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 352abca2605f..074810436242 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -339,6 +339,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + skb->mark = xfrm_smark_get(skb->mark, x); + skb->sp->xvec[skb->sp->len++] = x; lock: diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 89b178a78dc7..45ba07ab3e4f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } - if (x->props.output_mark) - skb->mark = x->props.output_mark; + skb->mark = xfrm_smark_get(skb->mark, x); err = x->outer_mode->output(x, skb); if (err) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5f48251c1319..7637637717ec 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1607,10 +1607,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { + __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); + family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family, - xfrm[i]->props.output_mark); + &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 080035f056d9..9602cc9e05ab 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -527,6 +527,19 @@ static void xfrm_update_ae_params(struct 
xfrm_state *x, struct nlattr **attrs, x->replay_maxdiff = nla_get_u32(rt); } +static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m) +{ + if (attrs[XFRMA_SET_MARK]) { + m->v = nla_get_u32(attrs[XFRMA_SET_MARK]); + if (attrs[XFRMA_SET_MARK_MASK]) + m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]); + else + m->m = 0xffffffff; + } else { + m->v = m->m = 0; + } +} + static struct xfrm_state *xfrm_state_construct(struct net *net, struct xfrm_usersa_info *p, struct nlattr **attrs, @@ -579,8 +592,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - if (attrs[XFRMA_OUTPUT_MARK]) - x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + xfrm_smark_init(attrs, &x->props.smark); err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -824,6 +836,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) return 0; } +static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m) +{ + int ret = 0; + + if (m->v | m->m) { + ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v); + if (!ret) + ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m); + } + return ret; +} + /* Don't change this without updating xfrm_sa_len! 
*/ static int copy_to_user_state_extra(struct xfrm_state *x, struct xfrm_usersa_info *p, @@ -887,6 +911,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = xfrm_mark_put(skb, &x->mark); if (ret) goto out; + + ret = xfrm_smark_put(skb, &x->props.smark); + if (ret) + goto out; + if (x->replay_esn) ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL, xfrm_replay_state_esn_len(x->replay_esn), @@ -900,11 +929,7 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = copy_user_offload(&x->xso, skb); if (ret) goto out; - if (x->props.output_mark) { - ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); - if (ret) - goto out; - } + if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -2493,7 +2518,8 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, + [XFRMA_SET_MARK] = { .type = NLA_U32 }, + [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2719,8 +2745,10 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) l += nla_total_size(sizeof(x->xso)); - if (x->props.output_mark) - l += nla_total_size(sizeof(x->props.output_mark)); + if (x->props.smark.v | x->props.smark.m) { + l += nla_total_size(sizeof(x->props.smark.v)); + l += nla_total_size(sizeof(x->props.smark.m)); + } /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); -- cgit v1.2.3 From 7e6526404adedf079279aa7aa11722deaca8fe2e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:07 +0200 Subject: xfrm: Add a new lookup key to match xfrm interfaces. 
This patch adds the xfrm interface id as a lookup key for xfrm states and policies. With this we can assign states and policies to virtual xfrm interfaces. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/xfrm.h | 21 +++++++++++++----- include/uapi/linux/xfrm.h | 1 + net/core/pktgen.c | 2 +- net/key/af_key.c | 6 +++--- net/xfrm/xfrm_policy.c | 18 +++++++++++----- net/xfrm/xfrm_state.c | 19 ++++++++++++----- net/xfrm/xfrm_user.c | 54 +++++++++++++++++++++++++++++++++++++++++------ 7 files changed, 96 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3dc83ba26f62..e8bada4d2a45 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,6 +147,7 @@ struct xfrm_state { struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; + u32 if_id; u32 tfcpad; u32 genid; @@ -574,6 +575,7 @@ struct xfrm_policy { atomic_t genid; u32 priority; u32 index; + u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; @@ -1533,7 +1535,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); -struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, +struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, @@ -1690,20 +1692,20 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, 
int delete, int *err); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, - u32 id, int delete, int *err); +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, u8, + int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u8 proto, + u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); @@ -2019,6 +2021,15 @@ static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) return (m->v & m->m) | (mark & ~m->m); } +static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) +{ + int ret = 0; + + if (if_id) + ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); + return ret; +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5a6ed7ce5a29..5f3b9fec7b5f 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -307,6 +307,7 @@ enum xfrm_attr_type_t { XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ XFRMA_SET_MARK, /* __u32 */ XFRMA_SET_MARK_MASK, /* __u32 */ + XFRMA_IF_ID, /* __u32 */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 49368e21d228..6d37dbf0aa64 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2255,7 +2255,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); } else { /* slow path: we dont already have xfrm_state */ - x = xfrm_stateonly_find(pn->net, DUMMY_MARK, + x = 
xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, AF_INET, diff --git a/net/key/af_key.c b/net/key/af_key.c index 8bdc1cbe490a..398ebcd614a0 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (!x) - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family); + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; @@ -2414,7 +2414,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa return err; } - xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); @@ -2663,7 +2663,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); - xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7637637717ec..fc0c69312b2c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -747,6 +747,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && + pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && @@ -798,8 +799,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy 
*xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, - int dir, struct xfrm_selector *sel, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, + u8 type, int dir, + struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { @@ -812,6 +814,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, ret = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && + pol->if_id == if_id && (mark & pol->mark.m) == pol->mark.v && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) { @@ -837,8 +840,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, - int dir, u32 id, int delete, int *err) +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, + u8 type, int dir, u32 id, int delete, + int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; @@ -853,6 +857,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && + pol->if_id == if_id && (mark & pol->mark.m) == pol->mark.v) { xfrm_pol_hold(pol); if (delete) { @@ -1063,6 +1068,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, bool match; if (pol->family != family || + pol->if_id != fl->flowi_xfrm.if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; @@ -1177,7 +1183,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { - if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { + if ((sk->sk_mark & pol->mark.m) != pol->mark.v || + pol->if_id != fl->flowi_xfrm.if_id) { pol = NULL; goto out; } @@ -1305,6 +1312,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy 
*old, int dir) newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; + newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8308281f3253..3803b6813fc5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -941,6 +941,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; + u32 if_id = fl->flowi_xfrm.if_id; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; @@ -955,6 +956,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && @@ -971,6 +973,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && @@ -1010,6 +1013,7 @@ found: * to current session. 
*/ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); + x->if_id = if_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { @@ -1067,7 +1071,7 @@ out: } struct xfrm_state * -xfrm_stateonly_find(struct net *net, u32 mark, +xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { @@ -1080,6 +1084,7 @@ xfrm_stateonly_find(struct net *net, u32 mark, if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, family) && mode == x->props.mode && @@ -1160,11 +1165,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) struct xfrm_state *x; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; + u32 if_id = xnew->if_id; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && + x->if_id == if_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) @@ -1187,7 +1194,7 @@ EXPORT_SYMBOL(xfrm_state_insert); static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, - u32 reqid, u8 proto, + u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) @@ -1242,6 +1249,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, x->props.family = family; x->props.mode = mode; x->props.reqid = reqid; + x->if_id = if_id; x->mark.v = m->v; x->mark.m = m->m; x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; @@ -1296,7 +1304,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && !x1) 
x1 = __find_acq_core(net, &x->mark, family, x->props.mode, - x->props.reqid, x->id.proto, + x->props.reqid, x->if_id, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); @@ -1395,6 +1403,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; @@ -1619,13 +1628,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, - u8 proto, const xfrm_address_t *daddr, + u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create); + x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 9602cc9e05ab..79245e1c3487 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -594,6 +594,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); + if (attrs[XFRMA_IF_ID]) + x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) goto error; @@ -929,7 +932,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = copy_user_offload(&x->xso, skb); if (ret) goto out; - + if (x->if_id) { + ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id); + if (ret) + goto out; + } if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -1278,6 +1285,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u32 mark; struct xfrm_mark m; + u32 if_id = 0; p 
= nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max); @@ -1290,6 +1298,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, x = NULL; mark = xfrm_mark_get(attrs, &m); + + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { @@ -1300,7 +1312,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, - p->info.id.proto, daddr, + if_id, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; @@ -1588,6 +1600,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); + if (attrs[XFRMA_IF_ID]) + xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + return xp; error: *errp = err; @@ -1733,6 +1748,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1814,6 +1831,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, int delete; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); + u32 if_id = 0; p = nlmsg_data(nlh); delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; @@ -1826,8 +1844,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->index) - xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err); + xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; @@ -1844,7 +1865,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return 
err; } - xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel, + xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel, ctx, delete, &err); security_xfrm_policy_free(ctx); } @@ -1967,6 +1988,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) goto out_cancel; + err = xfrm_if_id_put(skb, x->if_id); + if (err) + goto out_cancel; + nlmsg_end(skb, nlh); return 0; @@ -2109,6 +2134,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -ENOENT; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); + u32 if_id = 0; err = copy_from_user_policy_type(&type, attrs); if (err) @@ -2118,8 +2144,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->index) - xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err); + xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; @@ -2136,7 +2165,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; } - xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, + xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel, ctx, 0, &err); security_xfrm_policy_free(ctx); } @@ -2520,6 +2549,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, [XFRMA_SET_MARK] = { .type = NLA_U32 }, [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, + [XFRMA_IF_ID] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2651,6 +2681,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) return err; + err = xfrm_if_id_put(skb, x->if_id); + if (err) + return err; + nlmsg_end(skb, nlh); return 0; } @@ -2749,6 +2783,8 @@ static inline unsigned 
int xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.smark.v)); l += nla_total_size(sizeof(x->props.smark.m)); } + if (x->if_id) + l += nla_total_size(sizeof(x->if_id)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@ -2878,6 +2914,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -2994,6 +3032,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3075,6 +3115,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) goto out_free_skb; -- cgit v1.2.3 From f203b76d78092faf248db3f851840fbecf80b40e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:12 +0200 Subject: xfrm: Add virtual xfrm interfaces This patch adds support for virtual xfrm interfaces. Packets that are routed through such an interface are guaranteed to be IPsec transformed or dropped. It is a generic virtual interface that ensures IPsec transformation, no need to know what happens behind the interface. This means that we can tunnel IPv4 and IPv6 through the same interface and support all xfrm modes (tunnel, transport and beet) on it. 
Co-developed-by: Lorenzo Colitti Co-developed-by: Benedict Wong Signed-off-by: Lorenzo Colitti Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/xfrm.h | 24 ++ include/uapi/linux/if_link.h | 10 + net/xfrm/Kconfig | 8 + net/xfrm/Makefile | 1 + net/xfrm/xfrm_input.c | 3 + net/xfrm/xfrm_interface.c | 972 +++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_policy.c | 43 ++ 7 files changed, 1061 insertions(+) create mode 100644 net/xfrm/xfrm_interface.c (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e8bada4d2a45..3fa578a6a819 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -293,6 +294,13 @@ struct xfrm_replay { int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; +struct xfrm_if_cb { + struct xfrm_if *(*decode_session)(struct sk_buff *skb); +}; + +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); +void xfrm_if_unregister_cb(void); + struct net_device; struct xfrm_type; struct xfrm_dst; @@ -1039,6 +1047,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); +struct xfrm_if_parms { + char name[IFNAMSIZ]; /* name of XFRM device */ + int link; /* ifindex of underlying L2 interface */ + u32 if_id; /* interface identifyer */ +}; + +struct xfrm_if { + struct xfrm_if __rcu *next; /* next interface in list */ + struct net_device *dev; /* virtual device associated with interface */ + struct net_device *phydev; /* physical device */ + struct net *net; /* netns for packet i/o */ + struct xfrm_if_parms p; /* interface parms */ + + struct gro_cells gro_cells; +}; + struct xfrm_offload { /* Output sequence number for replay protection on offloading. 
*/ struct { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cf01b6824244..bff0af507b32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -459,6 +459,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 286ed25c1a69..53381888a7b3 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -25,6 +25,14 @@ config XFRM_USER If unsure, say Y. +config XFRM_INTERFACE + tristate "Transformation virtual interface" + depends on XFRM && IPV6 + ---help--- + This provides a virtual interface to route IPsec traffic. + + If unsure, say N. + config XFRM_SUB_POLICY bool "Transformation sub policy support" depends on XFRM diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 0bd2465a8c5a..fbc4552d17b8 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o +obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 074810436242..b89c9c7f8c5c 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) seq = 0; if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } @@ -328,12 +329,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SPI_SKB_CB(skb)->daddroff); do { if (skb->sp->len == XFRM_MAX_DEPTH) { + secpath_reset(skb); XFRM_INC_STATS(net, 
LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c new file mode 100644 index 000000000000..31cb1c7e3881 --- /dev/null +++ b/net/xfrm/xfrm_interface.c @@ -0,0 +1,972 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM virtual interface + * + * Copyright (C) 2018 secunet Security Networks AG + * + * Author: + * Steffen Klassert + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrmi_dev_init(struct net_device *dev); +static void xfrmi_dev_setup(struct net_device *dev); +static struct rtnl_link_ops xfrmi_link_ops __read_mostly; +static unsigned int xfrmi_net_id __read_mostly; + +struct xfrmi_net { + /* lists for storing interfaces in use */ + struct xfrm_if __rcu *xfrmi[1]; +}; + +#define for_each_xfrmi_rcu(start, xi) \ + for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) + +static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if *xi; + + for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + if (x->if_id == xi->p.if_id && + (xi->dev->flags & IFF_UP)) + return xi; + } + + return NULL; +} + +static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb) +{ + struct xfrmi_net *xfrmn; + int ifindex; + struct xfrm_if *xi; + + if (!skb->dev) + return NULL; + + xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id); + ifindex = skb->dev->ifindex; + + for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + if (ifindex == 
xi->dev->ifindex && + (xi->dev->flags & IFF_UP)) + return xi; + } + + return NULL; +} + +static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0]; + + rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); + rcu_assign_pointer(*xip, xi); +} + +static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *iter; + + for (xip = &xfrmn->xfrmi[0]; + (iter = rtnl_dereference(*xip)) != NULL; + xip = &iter->next) { + if (xi == iter) { + rcu_assign_pointer(*xip, xi->next); + break; + } + } +} + +static void xfrmi_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); +} + +static int xfrmi_create2(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + dev->rtnl_link_ops = &xfrmi_link_ops; + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(xi->p.name, dev->name); + + dev_hold(dev); + xfrmi_link(xfrmn, xi); + + return 0; + +out: + return err; +} + +static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) +{ + struct net_device *dev; + struct xfrm_if *xi; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + goto failed; + + dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup); + if (!dev) + goto failed; + + dev_net_set(dev, net); + + xi = netdev_priv(dev); + xi->p = *p; + xi->net = net; + xi->dev = dev; + xi->phydev = dev_get_by_index(net, p->link); + if (!xi->phydev) + goto failed_free; + + err = xfrmi_create2(dev); + if (err < 0) + goto failed_dev_put; + + return xi; + +failed_dev_put: + dev_put(xi->phydev); +failed_free: + free_netdev(dev); +failed: + return NULL; +} + +static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, + int create) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + 
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + for (xip = &xfrmn->xfrmi[0]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) { + if (xi->p.if_id == p->if_id) { + if (create) + return NULL; + + return xi; + } + } + if (!create) + return NULL; + return xfrmi_create(net, p); +} + +static void xfrmi_dev_uninit(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); + + xfrmi_unlink(xfrmn, xi); + dev_put(xi->phydev); + dev_put(dev); +} + +static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb->tstamp = 0; + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + nf_reset(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + ipvs_reset(skb); + secpath_reset(skb); + skb_orphan(skb); + skb->mark = 0; +} + +static int xfrmi_rcv_cb(struct sk_buff *skb, int err) +{ + struct pcpu_sw_netstats *tstats; + struct xfrm_mode *inner_mode; + struct net_device *dev; + struct xfrm_state *x; + struct xfrm_if *xi; + bool xnet; + + if (err && !skb->sp) + return 0; + + x = xfrm_input_state(skb); + + xi = xfrmi_lookup(xs_net(x), x); + if (!xi) + return 1; + + dev = xi->dev; + skb->dev = dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + xnet = !net_eq(xi->net, dev_net(skb->dev)); + + if (xnet) { + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, + inner_mode->afinfo->family)) + return -EPERM; + } + + xfrmi_scrub_packet(skb, xnet); + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + 
+static int +xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + unsigned int length = skb->len; + struct net_device *tdev; + struct xfrm_state *x; + int err = -1; + int mtu; + + if (!dst) + goto tx_err_link_failure; + + fl->flowi_xfrm.if_id = xi->p.if_id; + + dst_hold(dst); + dst = xfrm_lookup(xi->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + x = dst->xfrm; + if (!x) + goto tx_err_link_failure; + + if (x->if_id != xi->p.if_id) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + xi->p.name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst); + if (!skb->ignore_df && skb->len > mtu) { + skb_dst_update_pmtu(skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } else { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } + + dst_release(dst); + return -EMSGSIZE; + } + + xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = tdev; + + err = dst_output(xi->net, skb->sk, skb); + if (net_xmit_eval(err) == 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += length; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = 
&xi->dev->stats; + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + default: + goto tx_err; + } + + fl.flowi_oif = xi->phydev->ifindex; + + ret = xfrmi_xmit2(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int xfrmi4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->protocol; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 
code, int offset, __be32 info) +{ + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->nexthdr; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + else + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) +{ + if (xi->p.link != p->link) + return -EINVAL; + + xi->p.if_id = p->if_id; + + return 0; +} + +static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) +{ + struct net *net = dev_net(xi->dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + xfrmi_unlink(xfrmn, xi); + synchronize_net(); + err = xfrmi_change(xi, p); + xfrmi_link(xfrmn, xi); + netdev_state_change(xi->dev); + return err; +} + +static void xfrmi_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + int cpu; + + if (!dev->tstats) + return; + + for_each_possible_cpu(cpu) { + struct pcpu_sw_netstats *stats; + struct pcpu_sw_netstats tmp; + int start; + + stats = per_cpu_ptr(dev->tstats, cpu); + do { + start = 
u64_stats_fetch_begin_irq(&stats->syncp); + tmp.rx_packets = stats->rx_packets; + tmp.rx_bytes = stats->rx_bytes; + tmp.tx_packets = stats->tx_packets; + tmp.tx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += tmp.rx_packets; + s->rx_bytes += tmp.rx_bytes; + s->tx_packets += tmp.tx_packets; + s->tx_bytes += tmp.tx_bytes; + } + + s->rx_dropped = dev->stats.rx_dropped; + s->tx_dropped = dev->stats.tx_dropped; +} + +static int xfrmi_get_iflink(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->phydev->ifindex; +} + + +static const struct net_device_ops xfrmi_netdev_ops = { + .ndo_init = xfrmi_dev_init, + .ndo_uninit = xfrmi_dev_uninit, + .ndo_start_xmit = xfrmi_xmit, + .ndo_get_stats64 = xfrmi_get_stats64, + .ndo_get_iflink = xfrmi_get_iflink, +}; + +static void xfrmi_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &xfrmi_netdev_ops; + dev->type = ARPHRD_NONE; + dev->hard_header_len = ETH_HLEN; + dev->min_header_len = ETH_HLEN; + dev->mtu = ETH_DATA_LEN; + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = ETH_DATA_LEN; + dev->addr_len = ETH_ALEN; + dev->flags = IFF_NOARP; + dev->needs_free_netdev = true; + dev->priv_destructor = xfrmi_dev_free; + netif_keep_dst(dev); +} + +static int xfrmi_dev_init(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device *phydev = xi->phydev; + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&xi->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + + dev->features |= NETIF_F_LLTX; + + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; + + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); + + return 0; +} + +static int 
xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void xfrmi_netlink_parms(struct nlattr *data[], + struct xfrm_if_parms *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_XFRM_LINK]) + parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); + + if (data[IFLA_XFRM_IF_ID]) + parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); +} + +static int xfrmi_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net *net = dev_net(dev); + struct xfrm_if_parms *p; + struct xfrm_if *xi; + + xi = netdev_priv(dev); + p = &xi->p; + + xfrmi_netlink_parms(data, p); + + if (!tb[IFLA_IFNAME]) + return -EINVAL; + + nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); + + if (!xfrmi_locate(net, p, 1)) + return -EEXIST; + + return 0; +} + +static void xfrmi_dellink(struct net_device *dev, struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} + +static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + + xfrmi_netlink_parms(data, &xi->p); + + xi = xfrmi_locate(net, &xi->p, 0); + + if (xi) { + if (xi->dev != dev) + return -EEXIST; + } else + xi = netdev_priv(dev); + + return xfrmi_update(xi, &xi->p); +} + +static size_t xfrmi_get_size(const struct net_device *dev) +{ + return + /* IFLA_XFRM_LINK */ + nla_total_size(4) + + /* IFLA_XFRM_IF_ID */ + nla_total_size(4) + + 0; +} + +static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrm_if_parms *parm = &xi->p; + + if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +struct net 
*xfrmi_get_link_net(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return dev_net(xi->phydev); +} + +static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { + .kind = "xfrm", + .maxtype = IFLA_XFRM_MAX, + .policy = xfrmi_policy, + .priv_size = sizeof(struct xfrm_if), + .setup = xfrmi_dev_setup, + .validate = xfrmi_validate, + .newlink = xfrmi_newlink, + .dellink = xfrmi_dellink, + .changelink = xfrmi_changelink, + .get_size = xfrmi_get_size, + .fill_info = xfrmi_fill_info, + .get_link_net = xfrmi_get_link_net, +}; + +static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) +{ + struct xfrm_if *xi; + LIST_HEAD(list); + + xi = rtnl_dereference(xfrmn->xfrmi[0]); + if (!xi) + return; + + unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init xfrmi_init_net(struct net *net) +{ + return 0; +} + +static void __net_exit xfrmi_exit_net(struct net *net) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + rtnl_lock(); + xfrmi_destroy_interfaces(xfrmn); + rtnl_unlock(); +} + +static struct pernet_operations xfrmi_net_ops = { + .init = xfrmi_init_net, + .exit = xfrmi_exit_net, + .id = &xfrmi_net_id, + .size = sizeof(struct xfrmi_net), +}; + +static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm4_protocol 
xfrmi_esp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static int __init xfrmi4_init(void) +{ + int err; + + err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; + + return 0; + +xfrm_proto_comp_failed: + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi4_fini(void) +{ + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +} + +static int __init xfrmi6_init(void) +{ + int err; + + err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; + + return 0; + +xfrm_proto_comp_failed: + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + 
xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi6_fini(void) +{ + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +} + +static const struct xfrm_if_cb xfrm_if_cb = { + .decode_session = xfrmi_decode_session, +}; + +static int __init xfrmi_init(void) +{ + const char *msg; + int err; + + pr_info("IPsec XFRM device driver\n"); + + msg = "tunnel device"; + err = register_pernet_device(&xfrmi_net_ops); + if (err < 0) + goto pernet_dev_failed; + + msg = "xfrm4 protocols"; + err = xfrmi4_init(); + if (err < 0) + goto xfrmi4_failed; + + msg = "xfrm6 protocols"; + err = xfrmi6_init(); + if (err < 0) + goto xfrmi6_failed; + + + msg = "netlink interface"; + err = rtnl_link_register(&xfrmi_link_ops); + if (err < 0) + goto rtnl_link_failed; + + xfrm_if_register_cb(&xfrm_if_cb); + + return err; + +rtnl_link_failed: + xfrmi6_fini(); +xfrmi6_failed: + xfrmi4_fini(); +xfrmi4_failed: + unregister_pernet_device(&xfrmi_net_ops); +pernet_dev_failed: + pr_err("xfrmi init: failed to register %s\n", msg); + return err; +} + +static void __exit xfrmi_fini(void) +{ + xfrm_if_unregister_cb(); + rtnl_link_unregister(&xfrmi_link_ops); + xfrmi4_fini(); + xfrmi6_fini(); + unregister_pernet_device(&xfrmi_net_ops); +} + +module_init(xfrmi_init); +module_exit(xfrmi_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("xfrm"); +MODULE_ALIAS_NETDEV("xfrm0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("XFRM virtual interface"); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index fc0c69312b2c..d960ea6657b5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -47,6 +47,9 @@ struct xfrm_flo { static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); static struct work_struct *xfrm_pcpu_work __read_mostly; +static DEFINE_SPINLOCK(xfrm_if_cb_lock); 
+static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; + static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; @@ -119,6 +122,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa return afinfo; } +/* Called with rcu_read_lock(). */ +static const struct xfrm_if_cb *xfrm_if_get_cb(void) +{ + return rcu_dereference(xfrm_if_cb); +} + struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, @@ -2083,6 +2092,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (IS_ERR(xdst)) { err = PTR_ERR(xdst); + if (err == -EREMOTE) { + xfrm_pols_put(pols, num_pols); + return NULL; + } + if (err != -EAGAIN) goto error; goto make_dummy_bundle; @@ -2176,6 +2190,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); + if (err == -EREMOTE) + goto nopol; + goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; @@ -2368,12 +2385,20 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_if_cb *ifcb = xfrm_if_get_cb(); + struct xfrm_if *xi; int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); + if (ifcb) { + xi = ifcb->decode_session(skb); + if (xi) + fl->flowi_xfrm.if_id = xi->p.if_id; + } + err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); return err; @@ -2828,6 +2853,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) +{ + spin_lock(&xfrm_if_cb_lock); + rcu_assign_pointer(xfrm_if_cb, ifcb); + 
spin_unlock(&xfrm_if_cb_lock); +} +EXPORT_SYMBOL(xfrm_if_register_cb); + +void xfrm_if_unregister_cb(void) +{ + RCU_INIT_POINTER(xfrm_if_cb, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL(xfrm_if_unregister_cb); + #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { @@ -3008,6 +3048,9 @@ void __init xfrm_init(void) xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); + + RCU_INIT_POINTER(xfrm_if_cb, NULL); + synchronize_rcu(); } #ifdef CONFIG_AUDITSYSCALL -- cgit v1.2.3 From fe0b082fedd1d09c73c48883f04a9fe2967b5899 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 23 Jun 2018 13:46:39 -0700 Subject: net_sched: remove unused htb drop_list After commit a09ceb0e0814 ("sched: remove qdisc->drop"), it is no longer used. Cc: Florian Westphal Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2a4ab7caf553..43c4bfe625a9 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -126,7 +126,6 @@ struct htb_class { union { struct htb_class_leaf { - struct list_head drop_list; int deficit[TC_HTB_MAXDEPTH]; struct Qdisc *q; } leaf; @@ -171,7 +170,6 @@ struct htb_sched { struct qdisc_watchdog watchdog; s64 now; /* cached dequeue time */ - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ /* time of nearest event per level (row) */ s64 near_ev_cache[TC_HTB_MAXDEPTH]; @@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; htb_activate_prios(q, cl); - list_add_tail(&cl->un.leaf.drop_list, - q->drops + cl->prio); } } @@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) htb_deactivate_prios(q, cl); cl->prio_activity = 0; - list_del_init(&cl->un.leaf.drop_list); } static void 
htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, @@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch) else { if (cl->un.leaf.q) qdisc_reset(cl->un.leaf.q); - INIT_LIST_HEAD(&cl->un.leaf.drop_list); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; @@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch) sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); } static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { @@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; int err; - int i; qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); @@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); qdisc_skb_head_init(&q->direct_queue); @@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); - INIT_LIST_HEAD(&parent->un.leaf.drop_list); parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; @@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } cl->children = 0; - INIT_LIST_HEAD(&cl->un.leaf.drop_list); RB_CLEAR_NODE(&cl->pq_node); for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) -- cgit v1.2.3 From 0ef8b4567d08a557b5226a4926ffd689ef0298ad Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 25 Jun 2018 01:37:50 +0530 Subject: tls: Removed unused variable Removed unused variable 'rxm' from tls_queue(). Signed-off-by: Vakul Garg Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f127fac88acf..727433b37bb5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -990,9 +990,6 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm; - - rxm = strp_msg(skb); ctx->decrypted = false; -- cgit v1.2.3 From e4db5b61c572475bbbcf63e3c8a2606bfccf2c9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:26:02 +0200 Subject: xfrm: policy: remove pcpu policy cache Kristian Evensen says: In a project I am involved in, we are running ipsec (Strongswan) on different mt7621-based routers. Each router is configured as an initiator and has around ~30 tunnels to different responders (running on misc. devices). Before the flow cache was removed (kernel 4.9), we got a combined throughput of around 70Mbit/s for all tunnels on one router. However, we recently switched to kernel 4.14 (4.14.48), and the total throughput is somewhere around 57Mbit/s (best-case). I.e., a drop of around 20%. Reverting the flow cache removal restores, as expected, performance levels to that of kernel 4.9. When pcpu xdst exists, it has to be validated first before it can be used. A negative hit thus increases cost vs. no-cache. As number of tunnels increases, hit rate decreases so this pcpu caching isn't a viable strategy. Furthermore, the xdst cache also needs to run with BH off, so when removing this the bh disable/enable pairs can be removed too. Kristian tested a 4.14.y backport of this change and reported increased performance: In our tests, the throughput reduction has been reduced from around -20% to -5%. We also see that the overall throughput is independent of the number of tunnels, while before the throughput was reduced as the number of tunnels increased. 
Reported-by: Kristian Evensen Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/xfrm/xfrm_device.c | 10 ---- net/xfrm/xfrm_policy.c | 139 +------------------------------------------------ net/xfrm/xfrm_state.c | 5 +- 4 files changed, 3 insertions(+), 152 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3fa578a6a819..a5378613a49c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -332,7 +332,6 @@ int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int fam void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); -void xfrm_policy_cache_flush(void); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 16c1230d20fa..11d56a44e9e8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -307,12 +307,6 @@ static int xfrm_dev_register(struct net_device *dev) return xfrm_api_check(dev); } -static int xfrm_dev_unregister(struct net_device *dev) -{ - xfrm_policy_cache_flush(); - return NOTIFY_DONE; -} - static int xfrm_dev_feat_change(struct net_device *dev) { return xfrm_api_check(dev); @@ -323,7 +317,6 @@ static int xfrm_dev_down(struct net_device *dev) if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); - xfrm_policy_cache_flush(); return NOTIFY_DONE; } @@ -335,9 +328,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void case NETDEV_REGISTER: return xfrm_dev_register(dev); - case NETDEV_UNREGISTER: - return xfrm_dev_unregister(dev); - case NETDEV_FEAT_CHANGE: return xfrm_dev_feat_change(dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d960ea6657b5..ef75891450e7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -45,8 +45,6 @@ struct xfrm_flo { 
u8 flags; }; -static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); -static struct work_struct *xfrm_pcpu_work __read_mostly; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -1732,108 +1730,6 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, } -static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old) -{ - this_cpu_write(xfrm_last_dst, xdst); - if (old) - dst_release(&old->u.dst); -} - -static void __xfrm_pcpu_work_fn(void) -{ - struct xfrm_dst *old; - - old = this_cpu_read(xfrm_last_dst); - if (old && !xfrm_bundle_ok(old)) - xfrm_last_dst_update(NULL, old); -} - -static void xfrm_pcpu_work_fn(struct work_struct *work) -{ - local_bh_disable(); - rcu_read_lock(); - __xfrm_pcpu_work_fn(); - rcu_read_unlock(); - local_bh_enable(); -} - -void xfrm_policy_cache_flush(void) -{ - struct xfrm_dst *old; - bool found = false; - int cpu; - - might_sleep(); - - local_bh_disable(); - rcu_read_lock(); - for_each_possible_cpu(cpu) { - old = per_cpu(xfrm_last_dst, cpu); - if (old && !xfrm_bundle_ok(old)) { - if (smp_processor_id() == cpu) { - __xfrm_pcpu_work_fn(); - continue; - } - found = true; - break; - } - } - - rcu_read_unlock(); - local_bh_enable(); - - if (!found) - return; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - bool bundle_release; - - rcu_read_lock(); - old = per_cpu(xfrm_last_dst, cpu); - bundle_release = old && !xfrm_bundle_ok(old); - rcu_read_unlock(); - - if (!bundle_release) - continue; - - if (cpu_online(cpu)) { - schedule_work_on(cpu, &xfrm_pcpu_work[cpu]); - continue; - } - - rcu_read_lock(); - old = per_cpu(xfrm_last_dst, cpu); - if (old && !xfrm_bundle_ok(old)) { - per_cpu(xfrm_last_dst, cpu) = NULL; - dst_release(&old->u.dst); - } - rcu_read_unlock(); - } - - put_online_cpus(); -} - -static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst, - struct xfrm_state * const xfrm[], - int num) -{ - const struct dst_entry *dst = &xdst->u.dst; - int 
i; - - if (xdst->num_xfrms != num) - return false; - - for (i = 0; i < num; i++) { - if (!dst || dst->xfrm != xfrm[i]) - return false; - dst = xfrm_dst_child(dst); - } - - return xfrm_bundle_ok(xdst); -} - static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, @@ -1842,7 +1738,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; - struct xfrm_dst *xdst, *old; + struct xfrm_dst *xdst; struct dst_entry *dst; int err; @@ -1854,22 +1750,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, return ERR_PTR(err); } - xdst = this_cpu_read(xfrm_last_dst); - if (xdst && - xdst->u.dst.dev == dst_orig->dev && - xdst->num_pols == num_pols && - memcmp(xdst->pols, pols, - sizeof(struct xfrm_policy *) * num_pols) == 0 && - xfrm_xdst_can_reuse(xdst, xfrm, err)) { - dst_hold(&xdst->u.dst); - xfrm_pols_put(pols, num_pols); - while (err > 0) - xfrm_state_put(xfrm[--err]); - return xdst; - } - - old = xdst; - dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); @@ -1882,9 +1762,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); - atomic_set(&xdst->u.dst.__refcnt, 2); - xfrm_last_dst_update(xdst, old); - return xdst; } @@ -2085,11 +1962,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (num_xfrms <= 0) goto make_dummy_bundle; - local_bh_disable(); xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); - local_bh_enable(); - if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { @@ -2181,11 +2055,9 @@ struct dst_entry *xfrm_lookup(struct net *net, 
struct dst_entry *dst_orig, goto no_transform; } - local_bh_disable(); xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); - local_bh_enable(); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); @@ -3035,15 +2907,6 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { - int i; - - xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), - GFP_KERNEL); - BUG_ON(!xfrm_pcpu_work); - - for (i = 0; i < NR_CPUS; i++) - INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); - register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 3803b6813fc5..e04a510ec992 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -735,10 +735,9 @@ restart: } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); - if (cnt) { + if (cnt) err = 0; - xfrm_policy_cache_flush(); - } + return err; } EXPORT_SYMBOL(xfrm_state_flush); -- cgit v1.2.3 From d4546c2509b1e9cd082e3682dcec98472e37ee5a Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:13:49 +0900 Subject: net: Convert GRO SKB handling to list_head. Manage pending per-NAPI GRO packets via list_head. Return an SKB pointer from the GRO receive handlers. When GRO receive handlers return non-NULL, it means that this SKB needs to be completed at this time and removed from the NAPI queue. Several operations are greatly simplified by this transformation, especially timing out the oldest SKB in the list when gro_count exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue in reverse order. Signed-off-by: David S. 
Miller --- drivers/net/geneve.c | 11 ++++---- drivers/net/vxlan.c | 11 ++++---- include/linux/etherdevice.h | 3 +- include/linux/netdevice.h | 32 ++++++++++----------- include/linux/skbuff.h | 3 +- include/linux/udp.h | 4 +-- include/net/inet_common.h | 2 +- include/net/tcp.h | 2 +- include/net/udp.h | 4 +-- include/net/udp_tunnel.h | 6 ++-- net/8021q/vlan.c | 13 +++++---- net/core/dev.c | 68 +++++++++++++++++++-------------------------- net/core/skbuff.c | 4 +-- net/ethernet/eth.c | 12 ++++---- net/ipv4/af_inet.c | 12 ++++---- net/ipv4/esp4_offload.c | 4 +-- net/ipv4/fou.c | 20 ++++++------- net/ipv4/gre_offload.c | 8 +++--- net/ipv4/tcp_offload.c | 14 +++++----- net/ipv4/udp_offload.c | 13 +++++---- net/ipv6/esp6_offload.c | 4 +-- net/ipv6/ip6_offload.c | 16 +++++------ net/ipv6/tcpv6_offload.c | 4 +-- net/ipv6/udp_offload.c | 4 +-- 24 files changed, 133 insertions(+), 141 deletions(-) (limited to 'net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 750eaa53bf0c..3e94375b9b01 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -418,11 +418,12 @@ static int geneve_hlen(struct genevehdr *gh) return sizeof(*gh) + gh->opt_len * 4; } -static struct sk_buff **geneve_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *geneve_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct genevehdr *gh, *gh2; unsigned int hlen, gh_len, off_gnv; const struct packet_offload *ptype; @@ -449,7 +450,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, goto out; } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aee0e60471f1..cc14e0cd5647 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -568,11 +568,12 @@ static struct vxlanhdr 
*vxlan_gro_remcsum(struct sk_buff *skb, return vh; } -static struct sk_buff **vxlan_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vxlan_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; int flush = 1; @@ -607,7 +608,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 79563840c295..572e11bb8696 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb); +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..f176d9873910 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -322,7 +322,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct sk_buff *gro_list; + struct list_head gro_list; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; @@ -2255,10 +2255,10 @@ static inline int gro_recursion_inc_test(struct sk_buff *skb) return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } -typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); -static inline struct sk_buff 
**call_gro_receive(gro_receive_t cb, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); +static inline struct sk_buff *call_gro_receive(gro_receive_t cb, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2268,12 +2268,12 @@ static inline struct sk_buff **call_gro_receive(gro_receive_t cb, return cb(head, skb); } -typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, - struct sk_buff *); -static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, + struct sk_buff *); +static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2299,8 +2299,8 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *(*gro_receive)(struct list_head *head, + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -2568,7 +2568,7 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { @@ -2784,13 +2784,13 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, } #ifdef CONFIG_XFRM_OFFLOAD -static inline void 
skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } #else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..7ccc601b55d9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -677,7 +677,8 @@ struct sk_buff { int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem & tcp stack */ + struct list_head list; }; struct sock *sk; diff --git a/include/linux/udp.h b/include/linux/udp.h index ca840345571b..320d49d85484 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -74,8 +74,8 @@ struct udp_sock { void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ - struct sk_buff ** (*gro_receive)(struct sock *sk, - struct sk_buff **head, + struct sk_buff * (*gro_receive)(struct sock *sk, + struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 384b90c62c0b..3ca969cbd161 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -43,7 +43,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb); +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); diff --git 
a/include/net/tcp.h b/include/net/tcp.h index 822ee49ed0f9..402a88b0e8a8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1788,7 +1788,7 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); int tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); diff --git a/include/net/udp.h b/include/net/udp.h index b1ea8b0f5e6a..5723c6128ae4 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -170,8 +170,8 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup); +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index b95a6927c718..fe680ab6b15a 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -65,9 +65,9 @@ static inline int udp_sock_create(struct net *net, typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); -typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb); +typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb, int nhoff); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 
73a65789271b..99141986efa0 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -647,13 +647,14 @@ out: return err; } -static struct sk_buff **vlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct vlan_hdr *vhdr; - unsigned int hlen, off_vlan; const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; __be16 type; int flush = 1; @@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/core/dev.c b/net/core/dev.c index a5aa1c7444e6..aa61b9344b46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4881,36 +4881,25 @@ out: */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *prev = NULL; - - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; - } - - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + struct sk_buff *skb, *p; + list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - - prev = skb->prev; + list_del_init(&skb->list); napi_gro_complete(skb); napi->gro_count--; } - - napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; - for (p = napi->gro_list; p; p = p->next) { + list_for_each_entry(p, &napi->gro_list, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4977,12 +4966,12 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) static 
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; + struct sk_buff *pp = NULL; enum gro_result ret; + int same_flow; int grow; if (netif_elide_gro(skb->dev)) @@ -5039,11 +5028,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - struct sk_buff *nskb = *pp; - - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); + list_del_init(&pp->list); + napi_gro_complete(pp); napi->gro_count--; } @@ -5054,15 +5040,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb = napi->gro_list; + struct sk_buff *nskb; - /* locate the end of the list to select the 'oldest' flow */ - while (nskb->next) { - pp = &nskb->next; - nskb = *pp; - } - *pp = NULL; - nskb->next = NULL; + nskb = list_last_entry(&napi->gro_list, struct sk_buff, list); + list_del(&nskb->list); napi_gro_complete(nskb); } else { napi->gro_count++; @@ -5071,8 +5052,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; + list_add(&skb->list, &napi->gro_list); ret = GRO_HELD; pull: @@ -5478,7 +5458,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_list) { + if (!list_empty(&n->gro_list)) { unsigned long timeout = 0; if (work_done) @@ -5687,7 +5667,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we 
do not react to a device IRQ. */ - if (napi->gro_list && !napi_disable_pending(napi) && + if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -5701,7 +5681,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - napi->gro_list = NULL; + INIT_LIST_HEAD(&napi->gro_list); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5734,6 +5714,14 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +static void gro_list_free(struct list_head *head) +{ + struct sk_buff *skb, *p; + + list_for_each_entry_safe(skb, p, head, list) + kfree_skb(skb); +} + /* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { @@ -5743,8 +5731,8 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - kfree_skb_list(napi->gro_list); - napi->gro_list = NULL; + gro_list_free(&napi->gro_list); + INIT_LIST_HEAD(&napi->gro_list); napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5787,7 +5775,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_list) { + if (!list_empty(&n->gro_list)) { /* flush too old packets * If HZ < 1000, flush all packets. 
*/ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c642304f178c..b1f274f22d85 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3815,14 +3815,14 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); - struct sk_buff *lp, *p = *head; unsigned int delta_truesize; + struct sk_buff *lp; if (unlikely(p->len + len >= 65536)) return -E2BIG; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index ee28440f57c5..fd8faa0dfa61 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) } EXPORT_SYMBOL(sysfs_format_mac); -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_eth; const struct packet_offload *ptype; + unsigned int hlen, off_eth; + struct sk_buff *pp = NULL; + struct ethhdr *eh, *eh2; + struct sk_buff *p; __be16 type; int flush = 1; @@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..06b218a2870f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1384,12 +1384,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + 
struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1505,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..bbeecd13e534 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include #include -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..efdc9e1f741e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. 
We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..b9673c21be45 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..f5aee641f825 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct 
sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -263,7 +263,7 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +277,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +302,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. 
*/ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..ac46c1c55c99 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -399,8 +400,8 @@ out: } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 27f59b61f70f..ddfa533a84e5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) return 0; } -static struct sk_buff **esp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 5b3f2f89ef41..37ff4805b20c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct 
sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; @@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, flush--; nlen = skb_network_header_len(skb); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* */ @@ -263,8 +263,8 @@ out: return pp; } -static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ @@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, return ipv6_gro_receive(head, skb); } -static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 278e49cd67d4..e72947c99454 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,8 +15,8 @@ #include #include "ip6_offload.h" -static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. 
*/ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 03a2ff3fe1e6..95dee9ca8d22 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -114,8 +114,8 @@ out: return segs; } -static struct sk_buff **udp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); -- cgit v1.2.3 From 07d78363dcffd9cb1bf6f06a6cac0e0847f3c1de Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:14:02 +0900 Subject: net: Convert NAPI gro list into a small hash table. Improve the performance of GRO receive by splitting flows into multiple hash chains. Suggested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +- net/core/dev.c | 105 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f176d9873910..c6b377a15869 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str); /* * Structure for NAPI scheduling similar to tasklet but with weighting */ +#define GRO_HASH_BUCKETS 8 struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means @@ -322,7 +323,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct list_head gro_list; + struct list_head gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; diff --git a/net/core/dev.c b/net/core/dev.c index aa61b9344b46..dffed642e686 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4875,15 +4875,12 @@ out: return netif_receive_skb_internal(skb); } -/* napi->gro_list contains packets ordered by age. - * youngest packets at the head of it. 
- * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head, + bool flush_old) { struct sk_buff *skb, *p; - list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) { + list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; list_del_init(&skb->list); @@ -4891,15 +4888,33 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) napi->gro_count--; } } + +/* napi->gro_hash contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct list_head *head = &napi->gro_hash[i]; + + __napi_gro_flush_chain(napi, head, flush_old); + } +} EXPORT_SYMBOL(napi_gro_flush); -static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +static struct list_head *gro_list_prepare(struct napi_struct *napi, + struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct list_head *head; struct sk_buff *p; - list_for_each_entry(p, &napi->gro_list, list) { + head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)]; + list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4922,6 +4937,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } + + return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -4964,11 +4981,45 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } +static void gro_flush_oldest(struct napi_struct *napi) +{ + struct sk_buff *oldest = NULL; + unsigned long age = jiffies; + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct list_head *head = 
&napi->gro_hash[i]; + struct sk_buff *skb; + + if (list_empty(head)) + continue; + + skb = list_last_entry(head, struct sk_buff, list); + if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) { + oldest = skb; + age = NAPI_GRO_CB(skb)->age; + } + } + + /* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_count, caller is adding a new SKB to + * the chain. + */ + list_del(&oldest->list); + napi_gro_complete(oldest); +} + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; + struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -4977,7 +5028,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_list_prepare(napi, skb); + gro_head = gro_list_prepare(napi, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -5011,7 +5062,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); + pp = ptype->callbacks.gro_receive(gro_head, skb); break; } rcu_read_unlock(); @@ -5040,11 +5091,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb; - - nskb = list_last_entry(&napi->gro_list, struct sk_buff, list); - list_del(&nskb->list); - napi_gro_complete(nskb); + gro_flush_oldest(napi); } else { napi->gro_count++; } @@ -5052,7 +5099,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, &napi->gro_list); + 
list_add(&skb->list, gro_head); ret = GRO_HELD; pull: @@ -5458,7 +5505,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (!list_empty(&n->gro_list)) { + if (n->gro_count) { unsigned long timeout = 0; if (work_done) @@ -5667,7 +5714,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) && + if (napi->gro_count && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -5677,11 +5724,14 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { + int i; + INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - INIT_LIST_HEAD(&napi->gro_list); + for (i = 0; i < GRO_HASH_BUCKETS; i++) + INIT_LIST_HEAD(&napi->gro_hash[i]); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5714,12 +5764,16 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); -static void gro_list_free(struct list_head *head) +static void flush_gro_hash(struct napi_struct *napi) { - struct sk_buff *skb, *p; + int i; - list_for_each_entry_safe(skb, p, head, list) - kfree_skb(skb); + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct sk_buff *skb, *n; + + list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list) + kfree_skb(skb); + } } /* Must be called in process context */ @@ -5731,8 +5785,7 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - gro_list_free(&napi->gro_list); - INIT_LIST_HEAD(&napi->gro_list); + flush_gro_hash(napi); 
napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5775,7 +5828,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (!list_empty(&n->gro_list)) { + if (n->gro_count) { /* flush too old packets * If HZ < 1000, flush all packets. */ -- cgit v1.2.3 From fb223502ec0889444965f602f57b1f45f9e9845e Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 24 Jun 2018 10:02:54 -0400 Subject: tcp: add SNMP counter for zero-window drops It will be helpful if we could display the drops due to zero window or no enough window space. So a new SNMP MIB entry is added to track this behavior. This entry is named LINUX_MIB_TCPZEROWINDOWDROP and published in /proc/net/netstat in TcpExt line as TCPZeroWindowDrop. Signed-off-by: Yafang Shao Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 8 ++++++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 750d89120335..97517f36a5f9 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -279,6 +279,7 @@ enum LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */ + LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 77350c1256ce..225ef3433fe5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -287,6 +287,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), + SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 76ca88f63b70..9c5b3415413f 100644 --- a/net/ipv4/tcp_input.c 
+++ b/net/ipv4/tcp_input.c @@ -4668,8 +4668,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (tcp_receive_window(tp) == 0) + if (tcp_receive_window(tp) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } /* Ok. In sequence. In window. */ queue_and_out: @@ -4735,8 +4737,10 @@ drop: /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ - if (!tcp_receive_window(tp)) + if (!tcp_receive_window(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } goto queue_and_out; } -- cgit v1.2.3 From 877375e4856c9d1b98aec30ff736896b333449e7 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:18 +0200 Subject: l2tp: remove pppol2tp_session_close() l2tp_core.c verifies that ->session_close() is defined before calling it. There's no need for a stub. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 55188382845c..eea5d7844473 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -424,12 +424,6 @@ static void pppol2tp_put_sk(struct rcu_head *head) sock_put(ps->__sk); } -/* Called by l2tp_core when a session socket is being closed. - */ -static void pppol2tp_session_close(struct l2tp_session *session) -{ -} - /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) 
*/ @@ -573,7 +567,6 @@ static void pppol2tp_session_init(struct l2tp_session *session) struct dst_entry *dst; session->recv_skb = pppol2tp_recv; - session->session_close = pppol2tp_session_close; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = pppol2tp_show; #endif -- cgit v1.2.3 From c3612f0e901766e1caddabd18e0a34f0e6d82e20 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:19 +0200 Subject: l2tp: remove .show from struct l2tp_tunnel This callback has never been implemented. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.h | 3 --- net/l2tp/l2tp_debugfs.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index c199020f8a8a..b21c20a4e08f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -180,9 +180,6 @@ struct l2tp_tunnel { struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; -#ifdef CONFIG_DEBUG_FS - void (*show)(struct seq_file *m, void *arg); -#endif int (*recv_payload_hook)(struct sk_buff *skb); void (*old_sk_destruct)(struct sock *); struct sock *sock; /* Parent socket */ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index e87686f7d63c..b5d7dde003ef 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -177,9 +177,6 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) atomic_long_read(&tunnel->stats.rx_packets), atomic_long_read(&tunnel->stats.rx_bytes), atomic_long_read(&tunnel->stats.rx_errors)); - - if (tunnel->show != NULL) - tunnel->show(m, tunnel); } static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) -- cgit v1.2.3 From e484b1c227b6c661eba8ae424b271ed5b420ae4a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:20 +0200 Subject: l2tp: remove l2tp_tunnel_priv() This function, and the associated .priv field, are unused. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index b21c20a4e08f..15e1171ecf7b 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -187,8 +187,6 @@ struct l2tp_tunnel { * was created by userspace */ struct work_struct del_work; - - uint8_t priv[0]; /* private data */ }; struct l2tp_nl_cmd_ops { @@ -198,11 +196,6 @@ struct l2tp_nl_cmd_ops { int (*session_delete)(struct l2tp_session *session); }; -static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel) -{ - return &tunnel->priv[0]; -} - static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; -- cgit v1.2.3 From 2e67560ef6c53dae273b7c5c47a2ab4fb1ba9b30 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:22 +0200 Subject: l2tp: don't export l2tp_session_queue_purge() This function is only used in l2tp_core.c. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 3 +-- net/l2tp/l2tp_core.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 40261cb68e83..3adef4c35a3a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -783,7 +783,7 @@ EXPORT_SYMBOL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ -int l2tp_session_queue_purge(struct l2tp_session *session) +static int l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; BUG_ON(!session); @@ -794,7 +794,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session) } return 0; } -EXPORT_SYMBOL_GPL(l2tp_session_queue_purge); /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame * here. The skb is not on a list when we get here. 
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 15e1171ecf7b..0a6e582f84d3 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -234,7 +234,6 @@ void l2tp_session_free(struct l2tp_session *session); void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); -int l2tp_session_queue_purge(struct l2tp_session *session); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); void l2tp_session_set_header_len(struct l2tp_session *session, int version); -- cgit v1.2.3 From d08532bb5080f234f1ac45f9fc909eb15f51834b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:23 +0200 Subject: l2tp: don't export l2tp_tunnel_closeall() This function is only used in l2tp_core.c. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 3 +-- net/l2tp/l2tp_core.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 3adef4c35a3a..96e31f2ae7cd 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1192,7 +1192,7 @@ end: /* When the tunnel is closed, all the attached sessions need to go too. 
*/ -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1241,7 +1241,6 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 0a6e582f84d3..a5c09d3a5698 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -219,7 +219,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, -- cgit v1.2.3 From 363a341d190bde3f6d5f2786feefb9f1a7a45b95 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:24 +0200 Subject: l2tp: avoid duplicate l2tp_pernet() calls Replace 'l2tp_pernet(tunnel->l2tp_net)' with 'pn', which has been set on the preceding line. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 96e31f2ae7cd..88c3001531b4 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -322,8 +322,7 @@ int l2tp_session_register(struct l2tp_session *session, if (tunnel->version == L2TP_HDR_VER_3) { pn = l2tp_pernet(tunnel->l2tp_net); - g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net), - session->session_id); + g_head = l2tp_session_id_hash_2(pn, session->session_id); spin_lock_bh(&pn->l2tp_session_hlist_lock); -- cgit v1.2.3 From 2685fbb8044f9bd8d3b5de1fa7854fea655f2df6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Jun 2018 16:07:25 +0200 Subject: l2tp: make l2tp_xmit_core() return void It always returns 0, and nobody reads the return value anyway. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 88c3001531b4..1ea285bad84b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1007,8 +1007,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) return bufp - optr; } -static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, - struct flowi *fl, size_t data_len) +static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, + struct flowi *fl, size_t data_len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; @@ -1050,8 +1050,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, atomic_long_inc(&tunnel->stats.tx_errors); atomic_long_inc(&session->stats.tx_errors); } - - return 0; } /* If caller requires the skb to have a ppp header, the header must be -- cgit v1.2.3 From 60513bd82c825b659c05957e4f8106ba06f0797f Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 
Jun 2018 14:30:04 -0700 Subject: net: sched: pass extack pointer to block binds and cb registration Pass the extact struct from a tc qdisc add to the block bind function and, in turn, to the setup_tc ndo of binding device via the tc_block_offload struct. Pass this back to any block callback registrations to allow netlink logging of fails in the bind process. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +++++---- drivers/net/ethernet/netronome/nfp/bpf/main.c | 2 +- .../net/ethernet/netronome/nfp/flower/offload.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/netdevsim/netdev.c | 2 +- include/net/pkt_cls.h | 11 ++++++---- net/dsa/slave.c | 2 +- net/sched/cls_api.c | 25 ++++++++++++++-------- 17 files changed, 43 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 176fc9f4d7de..b5fc6414a951 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7984,7 +7984,7 @@ static int bnxt_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, bnxt_setup_tc_block_cb, - bp, bp); + bp, bp, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, bnxt_setup_tc_block_cb, bp); return 0; diff --git 
a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index 05d405905906..0745f2dfc80c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -173,7 +173,7 @@ static int bnxt_vf_rep_setup_tc_block(struct net_device *dev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, bnxt_vf_rep_setup_tc_block_cb, - vf_rep, vf_rep); + vf_rep, vf_rep, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, bnxt_vf_rep_setup_tc_block_cb, vf_rep); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc03c175a3cd..96bc177d54de 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3016,7 +3016,7 @@ static int cxgb_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, cxgb_setup_tc_block_cb, - pi, dev); + pi, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cxgb_setup_tc_block_cb, pi); return 0; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 7ad2b1b0b125..426b0ccb1fc6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7554,7 +7554,7 @@ static int i40e_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, i40e_setup_tc_block_cb, - np, np); + np, np, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, i40e_setup_tc_block_cb, np); return 0; diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index dc56a8667495..5906c1c1d19d 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -2926,7 +2926,7 @@ static int 
i40evf_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, i40evf_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, i40evf_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 6a78d8272eb2..f1e3397bd405 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2728,7 +2728,7 @@ static int igb_setup_tc_block(struct igb_adapter *adapter, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, igb_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3e87dbbc9024..d29bd8fc3ff3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9325,7 +9325,7 @@ static int ixgbe_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, ixgbe_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, ixgbe_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 56c1b6f5593e..134f20a182b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3371,7 +3371,7 @@ static int mlx5e_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, 
mlx5e_setup_tc_block_cb, priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 57987f6546e8..3f2fe95e01d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -797,7 +797,7 @@ static int mlx5e_rep_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv); return 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 968b88af2ef5..d2bc335dda11 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1503,7 +1503,8 @@ static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, static int mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, - struct tcf_block *block, bool ingress) + struct tcf_block *block, bool ingress, + struct netlink_ext_ack *extack) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_acl_block *acl_block; @@ -1518,7 +1519,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, return -ENOMEM; block_cb = __tcf_block_cb_register(block, mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp, acl_block); + mlxsw_sp, acl_block, extack); if (IS_ERR(block_cb)) { err = PTR_ERR(block_cb); goto err_cb_register; @@ -1596,11 +1597,12 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, switch (f->command) { case TC_BLOCK_BIND: err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port, - mlxsw_sp_port); + mlxsw_sp_port, f->extack); if (err) return err; err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, - f->block, ingress); + f->block, ingress, + f->extack); if (err) { tcf_block_cb_unregister(f->block, cb, 
mlxsw_sp_port); return err; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index fcdfb8e7fdea..bf46f7bff912 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -206,7 +206,7 @@ static int nfp_bpf_setup_tc_block(struct net_device *netdev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nfp_bpf_setup_tc_block_cb, - nn, nn); + nn, nn, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nfp_bpf_setup_tc_block_cb, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index c0e74aa4cb5e..a427dab4bf49 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -627,7 +627,7 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nfp_flower_setup_tc_block_cb, - repr, repr); + repr, repr, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nfp_flower_setup_tc_block_cb, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index cba46b62a1cd..2354e30caa78 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3776,7 +3776,7 @@ static int stmmac_setup_tc_block(struct stmmac_priv *priv, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv); return 0; diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index ec68f38213d9..c9dacc6fcd59 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -260,7 +260,7 @@ nsim_setup_tc_block(struct net_device *dev, struct 
tc_block_offload *f) switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nsim_setup_tc_block_cb, - ns, ns); + ns, ns, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nsim_setup_tc_block_cb, ns); return 0; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3c1a2c47cd4..a2c6d35ba057 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -73,10 +73,11 @@ void tcf_block_cb_incref(struct tcf_block_cb *block_cb); unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb); struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv); + void *cb_priv, + struct netlink_ext_ack *extack); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv); + void *cb_priv, struct netlink_ext_ack *extack); void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); void tcf_block_cb_unregister(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident); @@ -161,7 +162,8 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) static inline struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { return NULL; } @@ -169,7 +171,7 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, static inline int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { return 0; } @@ -596,6 +598,7 @@ struct tc_block_offload { enum tc_block_command command; enum tcf_block_binder_type binder_type; struct tcf_block *block; + struct netlink_ext_ack *extack; }; struct tc_cls_common_offload { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1e3b6a6d8a40..71536c435132 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct 
net_device *dev, switch (f->command) { case TC_BLOCK_BIND: - return tcf_block_cb_register(f->block, cb, dev, dev); + return tcf_block_cb_register(f->block, cb, dev, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cb, dev); return 0; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cdc3c87c53e6..8c9fb4b827a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -277,18 +277,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command) + enum tc_block_command command, + struct netlink_ext_ack *extack) { struct tc_block_offload bo = {}; bo.command = command; bo.binder_type = ei->binder_type; bo.block = block; + bo.extack = extack; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; @@ -299,10 +302,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. 
*/ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) + if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); return -EOPNOTSUPP; + } - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; return err; @@ -322,7 +327,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; return; @@ -612,7 +617,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_chain_head_change_cb_add; - err = tcf_block_offload_bind(block, q, ei); + err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; @@ -748,7 +753,8 @@ EXPORT_SYMBOL(tcf_block_cb_decref); struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; @@ -772,11 +778,12 @@ EXPORT_SYMBOL(__tcf_block_cb_register); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; - block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, + extack); return IS_ERR(block_cb) ? 
PTR_ERR(block_cb) : 0; } EXPORT_SYMBOL(tcf_block_cb_register); -- cgit v1.2.3 From 31533cba4327aefeafe8a7d57de0c737a3b2faa6 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:06 -0700 Subject: net: sched: cls_flower: implement offload tcf_proto_op Add the reoffload tcf_proto_op in flower to generate an offload message for each filter in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. A filter contains a flag to indicate if it is in hardware or not. To ensure the reoffload function properly maintains this flag, keep a reference counter for the number of instances of the filter that are in hardware. Only update the flag when this counter changes from or to 0. Add a generic helper function to implement this behaviour. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 15 +++++++++++++++ net/sched/cls_flower.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 18adc9142b18..7432100027b7 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -336,6 +336,21 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) block->offloadcnt--; } +static inline void +tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, + u32 *flags, bool add) +{ + if (add) { + if (!*cnt) + tcf_block_offload_inc(block, flags); + (*cnt)++; + } else { + (*cnt)--; + if (!*cnt) + tcf_block_offload_dec(block, flags); + } +} + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9e8b26a80fb3..352876bb901b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ 
-87,6 +87,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -289,6 +290,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, fl_hw_destroy_filter(tp, f, NULL); return err; } else if (err > 0) { + f->in_hw_count = err; tcf_block_offload_inc(block, &f->flags); } @@ -1087,6 +1089,47 @@ skip: } } +static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; + struct fl_flow_mask *mask; + struct cls_fl_filter *f; + int err; + + list_for_each_entry(mask, &head->masks, list) { + list_for_each_entry(f, &mask->filters, list) { + if (tc_skip_hw(f->flags)) + continue; + + tc_cls_common_offload_init(&cls_flower.common, tp, + f->flags, extack); + cls_flower.command = add ? + TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long)f; + cls_flower.dissector = &mask->dissector; + cls_flower.mask = &f->mkey; + cls_flower.key = &f->key; + cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; + + err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + if (err) { + if (add && tc_skip_sw(f->flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &f->in_hw_count, + &f->flags, add); + } + } + + return 0; +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1438,6 +1481,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .change = fl_change, .delete = fl_delete, .walk = fl_walk, + .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 0efd1b3a13bfabc8b70e79bd22aa413d6d2ad7a5 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:07 -0700 Subject: net: sched: 
cls_matchall: implement offload tcf_proto_op Add the reoffload tcf_proto_op in matchall to generate an offload message for each filter in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. Ensure matchall flags correctly report if the rule is in hw by keeping a reference counter for the number of instances of the rule offloaded. Only update the flag when this counter changes from or to 0. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 47b207ef7762..af16f36ed578 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; }; @@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, mall_destroy_hw_filter(tp, head, cookie, NULL); return err; } else if (err > 0) { + head->in_hw_count = err; tcf_block_offload_inc(block, &head->flags); } @@ -235,6 +237,35 @@ skip: arg->count++; } +static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + int err; + + if (tc_skip_hw(head->flags)) + return 0; + + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); + cls_mall.command = add ? 
+ TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; + cls_mall.exts = &head->exts; + cls_mall.cookie = (unsigned long)head; + + err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv); + if (err) { + if (add && tc_skip_sw(head->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add); + + return 0; +} + static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { @@ -289,6 +320,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = { .change = mall_change, .delete = mall_delete, .walk = mall_walk, + .reoffload = mall_reoffload, .dump = mall_dump, .bind_class = mall_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 530d995123fe647d28566d81ff9562fe6cbaff94 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:08 -0700 Subject: net: sched: cls_u32: implement offload tcf_proto_op Add the offload tcf_proto_op in cls_u32 to generate an offload message for each filter and the hashtable in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. A filter contains a flag to indicate if it is in hardware or not. To ensure the offload function properly maintains this flag, keep a reference counter for the number of instances of the filter that are in hardware. Only update the flag when this counter changes from or to 0. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/sched/cls_u32.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index fb861f90fde6..d5d2a6dc3921 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -62,6 +62,7 @@ struct tc_u_knode { struct tc_u32_pcnt __percpu *pf; #endif u32 flags; + unsigned int in_hw_count; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -571,6 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32_remove_hw_knode(tp, n, NULL); return err; } else if (err > 0) { + n->in_hw_count = err; tcf_block_offload_inc(block, &n->flags); } @@ -1199,6 +1201,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } +static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack); + cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = ht->divisor; + cls_u32.hnode.handle = ht->handle; + cls_u32.hnode.prio = ht->prio; + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err && add && tc_skip_sw(ht->flags)) + return err; + + return 0; +} + +static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); + struct tcf_block *block = tp->chain->block; + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); + cls_u32.command = add ? 
+ TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = n->handle; + + if (add) { + cls_u32.knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + cls_u32.knode.val = n->val; + cls_u32.knode.mask = n->mask; +#else + cls_u32.knode.val = 0; + cls_u32.knode.mask = 0; +#endif + cls_u32.knode.sel = &n->sel; + cls_u32.knode.exts = &n->exts; + if (n->ht_down) + cls_u32.knode.link_handle = ht->handle; + } + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err) { + if (add && tc_skip_sw(n->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add); + + return 0; +} + +static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned int h; + int err; + + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { + if (ht->prio != tp->prio) + continue; + + /* When adding filters to a new dev, try to offload the + * hashtable first. When removing, do the filters before the + * hashtable. 
+ */ + if (add && !tc_skip_hw(ht->flags)) { + err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv, + extack); + if (err) + return err; + } + + for (h = 0; h <= ht->divisor; h++) { + for (n = rtnl_dereference(ht->ht[h]); + n; + n = rtnl_dereference(n->next)) { + if (tc_skip_hw(n->flags)) + continue; + + err = u32_reoffload_knode(tp, n, add, cb, + cb_priv, extack); + if (err) + return err; + } + } + + if (!add && !tc_skip_hw(ht->flags)) + u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); + } + + return 0; +} + static void u32_bind_class(void *fh, u32 classid, unsigned long cl) { struct tc_u_knode *n = fh; @@ -1336,6 +1446,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .change = u32_change, .delete = u32_delete, .walk = u32_walk, + .reoffload = u32_reoffload, .dump = u32_dump, .bind_class = u32_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 7e916b76805f11c1686a43ab5ead9a9b1a0a5945 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:09 -0700 Subject: net: sched: cls_bpf: implement offload tcf_proto_op Add the offload tcf_proto_op in cls_bpf to generate an offload message for each bpf prog in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' prog. A prog contains a flag to indicate if it is in hardware or not. To ensure the offload function properly maintains this flag, keep a reference counter for the number of instances of the prog that are in hardware. Only update the flag when this counter changes from or to 0. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/sched/cls_bpf.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1aa7f6511065..66e0ac9811f9 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -43,6 +43,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; u32 gen_flags; + unsigned int in_hw_count; struct tcf_exts exts; u32 handle; u16 bpf_num_ops; @@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } else if (err > 0) { + prog->in_hw_count = err; tcf_block_offload_inc(block, &prog->gen_flags); } } @@ -652,6 +654,42 @@ skip: } } +static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_bpf_head *head = rtnl_dereference(tp->root); + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *prog; + int err; + + list_for_each_entry(prog, &head->plist, link) { + if (tc_skip_hw(prog->gen_flags)) + continue; + + tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, + extack); + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = add ? prog->filter : NULL; + cls_bpf.oldprog = add ? 
NULL : prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + + err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv); + if (err) { + if (add && tc_skip_sw(prog->gen_flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &prog->in_hw_count, + &prog->gen_flags, add); + } + + return 0; +} + static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .kind = "bpf", .owner = THIS_MODULE, @@ -662,6 +700,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, + .reoffload = cls_bpf_reoffload, .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; -- cgit v1.2.3 From 326367427cc09d38e4c1d145131ee2e228ac94c5 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:10 -0700 Subject: net: sched: call reoffload op on block callback reg Call the reoffload tcf_proto_op on all tcf_proto nodes in all chains of a block when a callback tries to register to a block that already has offloaded rules. If all existing rules cannot be offloaded then the registration is rejected. This replaces the previous policy of rejecting such callback registration outright. On unregistration of a callback, the rules are flushed for that given cb. The implementation of block sharing in the NFP driver, for example, duplicates shared rules to all devs bound to a block. This meant that rules could still exist in hw even after a device is unbound from a block (assuming the block still remains active). Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 +- include/net/pkt_cls.h | 6 ++- net/sched/cls_api.c | 54 ++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index d2bc335dda11..52437363766a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1542,7 +1542,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, err_block_bind: if (!tcf_block_cb_decref(block_cb)) { - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); err_cb_register: mlxsw_sp_acl_block_destroy(acl_block); } @@ -1572,7 +1572,7 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block, mlxsw_sp_port, ingress); if (!err && !tcf_block_cb_decref(block_cb)) { - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); mlxsw_sp_acl_block_destroy(acl_block); } } diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a2c6d35ba057..4070b8eb6d14 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -78,7 +78,8 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, struct netlink_ext_ack *extack); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb); void tcf_block_cb_unregister(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident); @@ -177,7 +178,8 @@ int tcf_block_cb_register(struct tcf_block *block, } static inline -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { } diff --git 
a/net/sched/cls_api.c b/net/sched/cls_api.c index 8c9fb4b827a1..bbf8dda96b0e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -751,19 +751,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) } EXPORT_SYMBOL(tcf_block_cb_decref); +static int +tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv, bool add, bool offload_in_use, + struct netlink_ext_ack *extack) +{ + struct tcf_chain *chain; + struct tcf_proto *tp; + int err; + + list_for_each_entry(chain, &block->chain_list, list) { + for (tp = rtnl_dereference(chain->filter_chain); tp; + tp = rtnl_dereference(tp->next)) { + if (tp->ops->reoffload) { + err = tp->ops->reoffload(tp, add, cb, cb_priv, + extack); + if (err && add) + goto err_playback_remove; + } else if (add && offload_in_use) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); + goto err_playback_remove; + } + } + } + + return 0; + +err_playback_remove: + tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, + extack); + return err; +} + struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; + int err; - /* At this point, playback of previous block cb calls is not supported, - * so forbid to register to block which already has some offloaded - * filters present. 
- */ - if (tcf_block_offload_in_use(block)) - return ERR_PTR(-EOPNOTSUPP); + /* Replay any already present rules */ + err = tcf_block_playback_offloads(block, cb, cb_priv, true, + tcf_block_offload_in_use(block), + extack); + if (err) + return ERR_PTR(err); block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) @@ -788,8 +822,12 @@ int tcf_block_cb_register(struct tcf_block *block, } EXPORT_SYMBOL(tcf_block_cb_register); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { + tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, + false, tcf_block_offload_in_use(block), + NULL); list_del(&block_cb->list); kfree(block_cb); } @@ -803,7 +841,7 @@ void tcf_block_cb_unregister(struct tcf_block *block, block_cb = tcf_block_cb_lookup(block, cb, cb_ident); if (!block_cb) return; - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); } EXPORT_SYMBOL(tcf_block_cb_unregister); -- cgit v1.2.3 From 3463e51dc337ddd6e608fd595130398e9c60680f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 25 Jun 2018 16:55:05 -0700 Subject: net/tls: Remove VLA usage on nonce It looks like the prior VLA removal, commit b16520f7493d ("net/tls: Remove VLA usage"), and a new VLA addition, commit c46234ebb4d1e ("tls: RX path for ktls"), passed in the night. This removes the newly added VLA, which happens to have its bounds based on the same max value. Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 727433b37bb5..173d8b89072d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -941,7 +941,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char header[tls_ctx->rx.prepend_size]; + char header[TLS_HEADER_SIZE + MAX_IV_SIZE]; struct strp_msg *rxm = strp_msg(skb); size_t cipher_overhead; size_t data_len = 0; @@ -951,6 +951,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) return 0; + /* Sanity-check size of on-stack buffer. */ + if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) { + ret = -EINVAL; + goto read_failure; + } + /* Linearize header to local buffer */ ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); @@ -1108,7 +1114,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } /* Sanity-check the IV size for stack allocations. */ - if (iv_size > MAX_IV_SIZE) { + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { rc = -EINVAL; goto free_priv; } -- cgit v1.2.3 From 8e326289e3069dfc9fa9c209924668dd031ab8ef Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 25 Jun 2018 20:32:53 -0700 Subject: neighbour: force neigh_invalidate when NUD_FAILED update is from admin In systems where neigh gc thresholds are set to high values, admin-deleted neigh entries (e.g. ip neigh flush or ip neigh del) can linger around in NUD_FAILED state for a long time until periodic gc kicks in. This patch forces neigh_invalidate when NUD_FAILED neigh_update is from an admin. Signed-off-by: Roopa Prabhu Signed-off-by: David S.
Miller --- net/core/neighbour.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e3fda9e725c..cbe85d8d4cc2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->nud_state = new; err = 0; notify = old & NUD_VALID; - if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + if (((old & (NUD_INCOMPLETE | NUD_PROBE)) || + (flags & NEIGH_UPDATE_F_ADMIN)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; -- cgit v1.2.3 From 242b1bbe5144de3577ad12da058e70ef88167146 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jun 2018 08:45:49 -0700 Subject: tcp: remove one indentation level in tcp_create_openreq_child Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 223 ++++++++++++++++++++++++----------------------- 1 file changed, 113 insertions(+), 110 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dda1341a223..dac5893a52b4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk; + struct tcp_sock *oldtp, *newtp; - if (newsk) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - - smc_check_reset_syn_req(oldtp, req, newtp); - - /* Now setup tcp_sock */ - newtp->pred_flags = 0; - - newtp->rcv_wup = newtp->copied_seq = - 
newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 1; - - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - - INIT_LIST_HEAD(&newtp->tsq_node); - INIT_LIST_HEAD(&newtp->tsorted_sent_queue); - - tcp_init_wl(newtp, treq->rcv_isn); - - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; - newtp->lsndtime = tcp_jiffies32; - newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; - newtp->total_retrans = req->num_retrans; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. 
*/ - newtp->app_limited = ~0U; - - tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - - if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - newtp->rx_opt.sack_ok = ireq->sack_ok; - newtp->window_clamp = req->rsk_window_clamp; - newtp->rcv_ssthresh = req->rsk_rcv_wnd; - newtp->rcv_wnd = req->rsk_rcv_wnd; - newtp->rx_opt.wscale_ok = ireq->wscale_ok; - if (newtp->rx_opt.wscale_ok) { - newtp->rx_opt.snd_wscale = ireq->snd_wscale; - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; - } else { - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp, 65535U); - } - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << - newtp->rx_opt.snd_wscale); - newtp->max_window = newtp->snd_wnd; - - if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else { - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } - newtp->tsoffset = treq->ts_off; + if (!newsk) + return NULL; + + newicsk = inet_csk(newsk); + newtp = tcp_sk(newsk); + oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); + + tcp_init_wl(newtp, treq->rcv_isn); + + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + 
newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + newtp->packets_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + newtp->tlp_high_seq = 0; + newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = treq->txhash; + newtp->last_oow_ack_time = 0; + newtp->total_retrans = req->num_retrans; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = TCP_INIT_CWND; + newtp->snd_cwnd_cnt = 0; + + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + + tcp_init_xmit_timers(newsk); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->rx_opt.sack_ok = ireq->sack_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + 
newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; + newtp->md5sig_info = NULL; /*XXX*/ + if (newtp->af_specific->md5_lookup(sk, newsk)) + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; - newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); - newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; - - __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); - } + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); -- cgit v1.2.3 From a408194aa050f9a820f5a64301c7a08880f8af7d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 Jun 2018 18:41:36 +0200 Subject: l2tp: define helper for parsing struct sockaddr_pppol2tp* 'sockaddr_len' is checked against various values when entering pppol2tp_connect(), to verify its validity. It is used again later, to find out which sockaddr structure was passed from user space. 
This patch combines these two operations into one new function in order to simplify pppol2tp_connect(). A new structure, l2tp_connect_info, is used to pass sockaddr data back to pppol2tp_connect(), to avoid passing too many parameters to pppol2tp_sockaddr_get_info(). Also, the first parameter is void* in order to avoid casting between all sockaddr_* structures manually. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 173 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 103 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index eea5d7844473..d3a9355ac8ac 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -588,40 +588,113 @@ static void pppol2tp_session_init(struct l2tp_session *session) } } +struct l2tp_connect_info { + u8 version; + int fd; + u32 tunnel_id; + u32 peer_tunnel_id; + u32 session_id; + u32 peer_session_id; +}; + +static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, + struct l2tp_connect_info *info) +{ + switch (sa_len) { + case sizeof(struct sockaddr_pppol2tp): + { + const struct sockaddr_pppol2tp *sa_v2in4 = sa; + + if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 2; + info->fd = sa_v2in4->pppol2tp.fd; + info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel; + info->session_id = sa_v2in4->pppol2tp.s_session; + info->peer_session_id = sa_v2in4->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpv3): + { + const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa; + + if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 3; + info->fd = sa_v3in4->pppol2tp.fd; + info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel; + info->session_id = sa_v3in4->pppol2tp.s_session; + info->peer_session_id = sa_v3in4->pppol2tp.d_session; + + break; + } + case sizeof(struct 
sockaddr_pppol2tpin6): + { + const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa; + + if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 2; + info->fd = sa_v2in6->pppol2tp.fd; + info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel; + info->session_id = sa_v2in6->pppol2tp.s_session; + info->peer_session_id = sa_v2in6->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpv3in6): + { + const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa; + + if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 3; + info->fd = sa_v3in6->pppol2tp.fd; + info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel; + info->session_id = sa_v3in6->pppol2tp.s_session; + info->peer_session_id = sa_v3in6->pppol2tp.d_session; + + break; + } + default: + return -EINVAL; + } + + return 0; +} + /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; - struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr; struct pppox_sock *po = pppox_sk(sk); struct l2tp_session *session = NULL; + struct l2tp_connect_info info; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; struct l2tp_session_cfg cfg = { 0, }; - int error = 0; - u32 tunnel_id, peer_tunnel_id; - u32 session_id, peer_session_id; bool drop_refcnt = false; bool drop_tunnel = false; bool new_session = false; bool new_tunnel = false; - int ver = 2; - int fd; - - lock_sock(sk); - - error = -EINVAL; + int error; - if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6)) - goto end; + error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info); + if 
(error < 0) + return error; - if (sp->sa_protocol != PX_PROTO_OL2TP) - goto end; + lock_sock(sk); /* Check for already bound sockets */ error = -EBUSY; @@ -633,56 +706,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (sk->sk_user_data) goto end; /* socket is already attached */ - /* Get params from socket address. Handle L2TPv2 and L2TPv3. - * This is nasty because there are different sockaddr_pppol2tp - * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use - * the sockaddr size to determine which structure the caller - * is using. - */ - peer_tunnel_id = 0; - if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) { - fd = sp->pppol2tp.fd; - tunnel_id = sp->pppol2tp.s_tunnel; - peer_tunnel_id = sp->pppol2tp.d_tunnel; - session_id = sp->pppol2tp.s_session; - peer_session_id = sp->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) { - struct sockaddr_pppol2tpv3 *sp3 = - (struct sockaddr_pppol2tpv3 *) sp; - ver = 3; - fd = sp3->pppol2tp.fd; - tunnel_id = sp3->pppol2tp.s_tunnel; - peer_tunnel_id = sp3->pppol2tp.d_tunnel; - session_id = sp3->pppol2tp.s_session; - peer_session_id = sp3->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) { - struct sockaddr_pppol2tpin6 *sp6 = - (struct sockaddr_pppol2tpin6 *) sp; - fd = sp6->pppol2tp.fd; - tunnel_id = sp6->pppol2tp.s_tunnel; - peer_tunnel_id = sp6->pppol2tp.d_tunnel; - session_id = sp6->pppol2tp.s_session; - peer_session_id = sp6->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) { - struct sockaddr_pppol2tpv3in6 *sp6 = - (struct sockaddr_pppol2tpv3in6 *) sp; - ver = 3; - fd = sp6->pppol2tp.fd; - tunnel_id = sp6->pppol2tp.s_tunnel; - peer_tunnel_id = sp6->pppol2tp.d_tunnel; - session_id = sp6->pppol2tp.s_session; - peer_session_id = sp6->pppol2tp.d_session; - } else { - error = -EINVAL; - goto end; /* bad socket address */ - } - /* Don't bind if tunnel_id is 0 */ error = 
-EINVAL; - if (tunnel_id == 0) + if (!info.tunnel_id) goto end; - tunnel = l2tp_tunnel_get(sock_net(sk), tunnel_id); + tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id); if (tunnel) drop_tunnel = true; @@ -690,7 +719,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * peer_session_id is 0. Otherwise look up tunnel using supplied * tunnel id. */ - if ((session_id == 0) && (peer_session_id == 0)) { + if (!info.session_id && !info.peer_session_id) { if (tunnel == NULL) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, @@ -700,12 +729,16 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* Prevent l2tp_tunnel_register() from trying to set up * a kernel socket. */ - if (fd < 0) { + if (info.fd < 0) { error = -EBADF; goto end; } - error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); + error = l2tp_tunnel_create(sock_net(sk), info.fd, + info.version, + info.tunnel_id, + info.peer_tunnel_id, &tcfg, + &tunnel); if (error < 0) goto end; @@ -734,9 +767,9 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, tunnel->recv_payload_hook = pppol2tp_recv_payload_hook; if (tunnel->peer_tunnel_id == 0) - tunnel->peer_tunnel_id = peer_tunnel_id; + tunnel->peer_tunnel_id = info.peer_tunnel_id; - session = l2tp_session_get(sock_net(sk), tunnel, session_id); + session = l2tp_session_get(sock_net(sk), tunnel, info.session_id); if (session) { drop_refcnt = true; @@ -765,8 +798,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), - tunnel, session_id, - peer_session_id, &cfg); + tunnel, info.session_id, + info.peer_session_id, &cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto end; -- cgit v1.2.3 From 0a9fe5c375b57fab6d18ed0a6a7f935eefb09db3 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Wed, 27 Jun 2018 10:32:19 
-0700 Subject: netem: slotting with non-uniform distribution Extend slotting with support for non-uniform distributions. This is similar to netem's non-uniform distribution delay feature. Commit f043efeae2f1 ("netem: support delivering packets in delayed time slots") added the slotting feature to approximate the behaviors of media with packet aggregation but only supported a uniform distribution for delays between transmission attempts. Tests with TCP BBR with emulated wifi links with non-uniform distributions produced more useful results. Syntax: slot dist DISTRIBUTION DELAY JITTER [packets MAX_PACKETS] \ [bytes MAX_BYTES] The syntax and use of the distribution table is the same as in the non-uniform distribution delay feature. A file DISTRIBUTION must be present in TC_LIB_DIR (e.g. /usr/lib/tc) containing numbers scaled by NETEM_DIST_SCALE. A random value x is selected from the table and it takes DELAY + ( x * JITTER ) as delay. Correlation between values is not supported. Examples: Normal distribution delay with mean = 800us and stdev = 100us. > tc qdisc add dev eth0 root netem slot dist normal 800us 100us Optionally set the max slot size in bytes and/or packets. > tc qdisc add dev eth0 root netem slot dist normal 800us 100us \ bytes 64k packets 42 Signed-off-by: Yousuk Seung Acked-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 3 ++ net/sched/sch_netem.c | 73 ++++++++++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 37b5096ae97b..bad3c03bcf43 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -539,6 +539,7 @@ enum { TCA_NETEM_LATENCY64, TCA_NETEM_JITTER64, TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, __TCA_NETEM_MAX, }; @@ -581,6 +582,8 @@ struct tc_netem_slot { __s64 max_delay; __s32 max_packets; __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ }; enum { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7d6801fc5340..ad18a2052416 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -68,6 +68,11 @@ Fabio Ludovici */ +struct disttable { + u32 size; + s16 table[0]; +}; + struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; @@ -99,10 +104,7 @@ struct netem_sched_data { u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; - struct disttable { - u32 size; - s16 table[0]; - } *delay_dist; + struct disttable *delay_dist; enum { CLG_RANDOM, @@ -142,6 +144,7 @@ struct netem_sched_data { s32 bytes_left; } slot; + struct disttable *slot_dist; }; /* Time stamp put into socket buffer control block @@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state) u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correlation */ + if (!state || state->rho == 0) /* no correlation */ return prandom_u32(); value = prandom_u32(); @@ -601,10 +604,19 @@ finish_segs: static void get_slot_next(struct netem_sched_data *q, u64 now) { - q->slot.slot_next = now + q->slot_config.min_delay + - (prandom_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + s64 next_delay; + + if (!q->slot_dist) + next_delay = q->slot_config.min_delay + + 
(prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + else + next_delay = tabledist(q->slot_config.dist_delay, + (s32)(q->slot_config.dist_jitter), + NULL, q->slot_dist); + + q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } @@ -721,9 +733,9 @@ static void dist_free(struct disttable *d) * signed 16 bit values. */ -static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) +static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, + const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); spinlock_t *root_lock; @@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - swap(q->delay_dist, d); + swap(*tbl, d); spin_unlock_bh(root_lock); dist_free(d); @@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) q->slot_config.max_bytes = INT_MAX; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; - if (q->slot_config.min_delay | q->slot_config.max_delay) + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; @@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); - if (ret) { - /* recover clg and loss_model, in case of - * q->clg and q->loss_model were modified - * in get_loss_clg() - */ - q->clg = old_clg; - q->loss_model = old_loss_model; - return ret; - } + ret = get_dist_table(sch, &q->delay_dist, + tb[TCA_NETEM_DELAY_DIST]); + if (ret) + goto get_table_failure; + } + + if (tb[TCA_NETEM_SLOT_DIST]) { + ret = get_dist_table(sch, 
&q->slot_dist, + tb[TCA_NETEM_SLOT_DIST]); + if (ret) + goto get_table_failure; } sch->limit = qopt->limit; @@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, get_slot(q, tb[TCA_NETEM_SLOT]); return ret; + +get_table_failure: + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, @@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch) if (q->qdisc) qdisc_destroy(q->qdisc); dist_free(q->delay_dist); + dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, @@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; - if (q->slot_config.min_delay | q->slot_config.max_delay) { + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; -- cgit v1.2.3 From 80f0f574cc615b2c61bdfb0e3c2449478d63c488 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:30 -0400 Subject: net sched actions: fix coding style in pedit action Fix coding style issues in tc pedit action detected by the checkpatch script. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 8a925c72db5f..e4b29ee79ba8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -136,15 +136,15 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; - struct nlattr *pattr; - struct tc_pedit *parm; - int ret = 0, err; - struct tcf_pedit *p; struct tc_pedit_key *keys = NULL; struct tcf_pedit_key_ex *keys_ex; + struct tc_pedit *parm; + struct nlattr *pattr; + struct tcf_pedit *p; + int ret = 0, err; int ksize; - if (nla == NULL) + if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); @@ -175,7 +175,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return ret; p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); - if (keys == NULL) { + if (!keys) { tcf_idr_release(*a, bind); kfree(keys_ex); return -ENOMEM; @@ -220,6 +220,7 @@ static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); kfree(p->tcfp_keys_ex); } @@ -284,7 +285,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_header_type htype = + TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { @@ -316,16 +318,15 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, hoffset + tkey->at, 1, - &_d); + d = skb_header_pointer(skb, hoffset + tkey->at, + 1, &_d); if (!d) goto bad; offset 
+= (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { - pr_info("tc filter pedit" - " offset must be on 32 bit boundaries\n"); + pr_info("tc filter pedit offset must be on 32 bit boundaries\n"); goto bad; } @@ -335,7 +336,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, goto bad; } - ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); + ptr = skb_header_pointer(skb, hoffset + offset, + 4, &_data); if (!ptr) goto bad; /* just do it, baby */ @@ -358,8 +360,9 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } goto done; - } else + } else { WARN(1, "pedit BUG: index %d\n", p->tcf_index); + } bad: p->tcf_qstats.overlimits++; -- cgit v1.2.3 From 544377cd2545f33cc6cd5458301749d828adacb0 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:32 -0400 Subject: net sched actions: fix sparse warning The variable _data in include/asm-generic/sections.h defines sections, this causes sparse warning in pedit: net/sched/act_pedit.c:293:35: warning: symbol '_data' shadows an earlier one ./include/asm-generic/sections.h:36:13: originally declared here Therefore rename the variable. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e4b29ee79ba8..9c2d8a31a5c5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -290,7 +290,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, _data; + u32 *ptr, hdata; int offset = tkey->off; int hoffset; u32 val; @@ -337,7 +337,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } ptr = skb_header_pointer(skb, hoffset + offset, - 4, &_data); + 4, &hdata); if (!ptr) goto bad; /* just do it, baby */ @@ -355,7 +355,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &_data) + if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } -- cgit v1.2.3 From 6ff7586e382cb4274adefd56501d428ea39a5af3 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:33 -0400 Subject: net sched actions: use sizeof operator for buffer length Replace constant integer with sizeof() to clearly indicate the destination buffer length in skb_header_pointer() calls. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 9c2d8a31a5c5..3b775f54cee5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -319,7 +319,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, - 1, &_d); + sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; @@ -337,7 +337,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } ptr = skb_header_pointer(skb, hoffset + offset, - 4, &hdata); + sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ -- cgit v1.2.3 From 95b0d2dc13c7e7ea51675836680732e8c16e378a Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:34 -0400 Subject: net sched actions: fix misleading text strings in pedit action Change "tc filter pedit .." to "tc actions pedit .." in error messages to clearly refer to pedit action. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3b775f54cee5..caa6927a992c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -305,7 +305,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { - pr_info("tc filter pedit bad header type specified (0x%x)\n", + pr_info("tc action pedit bad header type specified (0x%x)\n", htype); goto bad; } @@ -314,7 +314,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, char *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc filter pedit 'at' offset %d out of bounds\n", + pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } @@ -326,12 +326,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } if (offset % 4) { - pr_info("tc filter pedit offset must be on 32 bit boundaries\n"); + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc filter pedit offset %d out of bounds\n", + pr_info("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } @@ -349,7 +349,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, val = (*ptr + tkey->val) & ~tkey->mask; break; default: - pr_info("tc filter pedit bad command (%d)\n", + pr_info("tc action pedit bad command (%d)\n", cmd); goto bad; } -- cgit v1.2.3 From 430527415398cf7e741f5e2f11324a8df9093327 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:35 -0400 Subject: net sched actions: avoid bitwise operation on signed value in pedit Since char can be unsigned or signed, and bitwise operators may have implementation-dependent results when performed on signed operands, declare 'u8 *' operand instead. 
Suggested-by: Davide Caratti Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index caa6927a992c..ab151346d3d4 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -311,7 +311,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } if (tkey->offmask) { - char *d, _d; + u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info("tc action pedit 'at' offset %d out of bounds\n", -- cgit v1.2.3 From f564650106a6e85702660fefd59fdff0877ab46a Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 27 Jun 2018 10:34:25 -0300 Subject: netfilter: check if the socket netns is correct. Netfilter assumes that if the socket is present in the skb, then it can be used because that reference is cleaned up while the skb is crossing netns. We want to change that to preserve the socket reference in a future patch, so this is a preparation updating netfilter to check if the socket netns matches before using it. Signed-off-by: Flavio Leitner Acked-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/net/netfilter/nf_log.h | 3 ++- net/ipv4/netfilter/nf_log_ipv4.c | 8 ++++---- net/ipv6/netfilter/nf_log_ipv6.c | 8 ++++---- net/netfilter/nf_conntrack_broadcast.c | 2 +- net/netfilter/nf_log_common.c | 5 +++-- net/netfilter/nf_nat_core.c | 6 +++++- net/netfilter/nft_meta.c | 9 ++++++--- net/netfilter/nft_socket.c | 5 ++++- net/netfilter/xt_cgroup.c | 6 ++++-- net/netfilter/xt_owner.c | 2 +- net/netfilter/xt_recent.c | 3 ++- net/netfilter/xt_socket.c | 8 ++++++++ 12 files changed, 44 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index e811ac07ea94..0d3920896d50 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -106,7 +106,8 @@ int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, u8 proto, int fragment, unsigned int offset, unsigned int logflags); -void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..1e6f28c97d3a 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct nf_log_buf *m, +static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. 
*/ nf_log_buf_add(m, "["); - dump_ipv4_packet(m, info, skb, + dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); nf_log_buf_add(m, "] "); } @@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv4_mac_header(m, loginfo, skb); - dump_ipv4_packet(m, loginfo, skb, 0); + dump_ipv4_packet(net, m, loginfo, skb, 0); nf_log_buf_close(m); } diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index b397a8fe88b9..c6bf580d0f33 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct nf_log_buf *m, +static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) @@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (recurse) { nf_log_buf_add(m, "["); - dump_ipv6_packet(m, info, skb, + dump_ipv6_packet(net, m, info, skb, ptr + sizeof(_icmp6h), 0); nf_log_buf_add(m, "] "); } @@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && recurse) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (recurse && skb->mark) @@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv6_mac_header(m, loginfo, skb); - dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + 
dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); nf_log_buf_close(m); } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index a1086bdec242..5423b197d98a 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, __be32 mask = 0; /* we're only interested in locally generated packets */ - if (skb->sk == NULL) + if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk))) goto out; if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) goto out; diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index dc61399e30be..a8c5c846aec1 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); -void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) +void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) { - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 46f9df99d276..86df2a1666fd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -108,6 +108,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) struct flowi fl; unsigned int hh_len; struct dst_entry *dst; + struct sock *sk = skb->sk; int err; err = xfrm_decode_session(skb, &fl, family); @@ -119,7 +120,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return 
PTR_ERR(dst); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1105a23bda5e..2b94dcc43456 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKGID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 74e1b3bd6954..998c2b546f6d 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -23,6 +23,9 @@ static void nft_socket_eval(const struct nft_expr *expr, struct sock *sk = skb->sk; u32 *dest = ®s->data[priv->dreg]; + if (sk && !net_eq(nft_net(pkt), sock_net(sk))) + sk = NULL; + if (!sk) switch(nft_pf(pkt)) { case NFPROTO_IPV4: @@ -39,7 +42,7 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } - if(!sk) { + if (!sk) { nft_reg_store8(dest, 0); return; } diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 7df2dece57d3..5d92e1781980 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -72,8 +72,9 @@ static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v0 *info = par->matchinfo; + struct sock *sk = 
skb->sk; - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ @@ -85,8 +86,9 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_cgroup_info_v1 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; - if (!skb->sk || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 3d705c688a27..46686fb73784 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); - if (sk == NULL || sk->sk_socket == NULL) + if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 07085c22b19c..f44de4bc2100 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) } /* use TTL as seen before forwarding */ - if (xt_out(par) != NULL && skb->sk == NULL) + if (xt_out(par) != NULL && + (!skb->sk || !net_eq(net, sock_net(skb->sk)))) ttl++; spin_lock_bh(&recent_lock); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5c0779c4fa3c..0472f3472842 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) 
sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; @@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; -- cgit v1.2.3 From 9c4c325252c54b34d53b3d0ffd535182b744e03d Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 27 Jun 2018 10:34:26 -0300 Subject: skbuff: preserve sock reference when scrubbing the skb. The sock reference is lost when scrubbing the packet and that breaks TSQ (TCP Small Queues) and XPS (Transmit Packet Steering) causing performance impacts of about 50% in a single TCP stream when crossing network namespaces. XPS breaks because the queue mapping stored in the socket is not available, so another random queue might be selected when the stack needs to transmit something like a TCP ACK, or TCP Retransmissions. That causes packet re-ordering and/or performance issues. TSQ breaks because it orphans the packet while it is still in the host, so packets are queued contributing to the buffer bloat problem. Preserving the sock reference fixes both issues. The socket is orphaned anyways in the receiving path before any relevant action and on TX side the netfilter checks if the reference is local before using it. Signed-off-by: Flavio Leitner Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 10 +++++----- net/core/skbuff.c | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ce8fbf5aa63c..f4c042be0216 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -733,11 +733,11 @@ tcp_limit_output_bytes - INTEGER Controls TCP Small Queue limit per tcp socket. TCP bulk sender tends to increase packets in flight until it gets losses notifications. With SNDBUF autotuning, this can - result in a large amount of packets queued in qdisc/device - on the local machine, hurting latency of other flows, for - typical pfifo_fast qdiscs. - tcp_limit_output_bytes limits the number of bytes on qdisc - or device to reduce artificial RTT/cwnd and reduce bufferbloat. + result in a large amount of packets queued on the local machine + (e.g.: qdiscs, CPU backlog, or device) hurting latency of other + flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes + limits the number of bytes on qdisc or device to reduce artificial + RTT/cwnd and reduce bufferbloat. Default: 262144 tcp_challenge_ack_limit - INTEGER diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b1f274f22d85..f59e98ca72c5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4911,7 +4911,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) return; ipvs_reset(skb); - skb_orphan(skb); skb->mark = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); -- cgit v1.2.3 From f7a2ba5ab9c5e7cf9036ec68d3528ccdf9e81b0a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Jun 2018 14:38:59 -0700 Subject: ila: Fix use of rhashtable walk in ila_xlat.c Perform better EAGAIN handling, handle case where ila_dump_info fails and we missed objects in the dump, and add a skip index to skip over ila entries in a list on a rhashtable node that have already been visited (by a previous call to ila_nl_dump). 
Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila_xlat.c | 70 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 10ae13560b40..40f3f640e856 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -475,24 +475,31 @@ out_free: struct ila_dump_iter { struct rhashtable_iter rhiter; + int skip; }; static int ila_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); - struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; + struct ila_dump_iter *iter; + int ret; - if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; - cb->args[0] = (long)iter; + ret = rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, + GFP_KERNEL); + if (ret) { + kfree(iter); + return ret; } - return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, - GFP_KERNEL); + iter->skip = 0; + cb->args[0] = (long)iter; + + return ret; } static int ila_nl_dump_done(struct netlink_callback *cb) @@ -510,20 +517,45 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; + int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); - for (;;) { - ila = rhashtable_walk_next(rhiter); + /* Get first entry */ + ila = rhashtable_walk_peek(rhiter); + + if (ila && !IS_ERR(ila) && skip) { + /* Skip over visited entries */ + + while (ila && skip) { + /* Skip over any ila entries in this list that we + * have already dumped. 
+ */ + ila = rcu_access_pointer(ila->next); + skip--; + } + } + skip = 0; + + for (;;) { if (IS_ERR(ila)) { - if (PTR_ERR(ila) == -EAGAIN) - continue; ret = PTR_ERR(ila); - goto done; + if (ret == -EAGAIN) { + /* Table has changed and iter has reset. Return + * -EAGAIN to the application even if we have + * written data to the skb. The application + * needs to deal with this. + */ + + goto out_ret; + } else { + break; + } } else if (!ila) { + ret = 0; break; } @@ -532,15 +564,21 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) - goto done; + goto out; + skip++; ila = rcu_access_pointer(ila->next); } + + skip = 0; + ila = rhashtable_walk_next(rhiter); } - ret = skb->len; +out: + iter->skip = skip; + ret = (skb->len ? : ret); -done: +out_ret: rhashtable_walk_stop(rhiter); return ret; } -- cgit v1.2.3 From b893281715ab4ea0e63034165b4fa11d1bb984c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Jun 2018 14:39:00 -0700 Subject: ila: Call library function alloc_bucket_locks To allocate the array of bucket locks for the hash table we now call library function alloc_bucket_spinlocks. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/ipv6/ila/ila_xlat.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 40f3f640e856..9cc8beedc2ca 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -31,27 +31,14 @@ struct ila_net { bool hooks_registered; }; +#define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { - unsigned int i, size; - unsigned int nr_pcpus = num_possible_cpus(); - - nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); - size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); - - if (sizeof(spinlock_t) != 0) { - ilan->locks = kvmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); - if (!ilan->locks) - return -ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&ilan->locks[i]); - } - ilan->locks_mask = size - 1; - - return 0; + return alloc_bucket_spinlocks(&ilan->locks, &ilan->locks_mask, + MAX_LOCKS, LOCKS_PER_CPU, + GFP_KERNEL); } static u32 hashrnd __read_mostly; @@ -640,7 +627,7 @@ static __net_exit void ila_exit_net(struct net *net) rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); - kvfree(ilan->locks); + free_bucket_spinlocks(ilan->locks); if (ilan->hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, -- cgit v1.2.3 From ad68147ef2878cad0cb9aba2a682c4bb8832cca7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Jun 2018 14:39:01 -0700 Subject: ila: Create main ila source file Create a main ila file that contains the module initialization functions as well as netlink definitions. Previously these were defined in ila_xlat and ila_common. This approach allows better extensibility. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/ipv6/ila/Makefile | 2 +- net/ipv6/ila/ila.h | 26 ++++++++- net/ipv6/ila/ila_common.c | 30 ---------- net/ipv6/ila/ila_main.c | 115 +++++++++++++++++++++++++++++++++++++ net/ipv6/ila/ila_xlat.c | 142 +++++++++------------------------------------- 5 files changed, 168 insertions(+), 147 deletions(-) create mode 100644 net/ipv6/ila/ila_main.c (limited to 'net') diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile index 4b32e5921e5c..b7739aba6e68 100644 --- a/net/ipv6/ila/Makefile +++ b/net/ipv6/ila/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_IPV6_ILA) += ila.o -ila-objs := ila_common.o ila_lwt.o ila_xlat.o +ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index 3c7a11b62334..faba7824ea56 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -104,9 +105,30 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, void ila_init_saved_csum(struct ila_params *p); +struct ila_net { + struct { + struct rhashtable rhash_table; + spinlock_t *locks; /* Bucket locks for entry manipulation */ + unsigned int locks_mask; + bool hooks_registered; + } xlat; +}; + int ila_lwt_init(void); void ila_lwt_fini(void); -int ila_xlat_init(void); -void ila_xlat_fini(void); + +int ila_xlat_init_net(struct net *net); +void ila_xlat_exit_net(struct net *net); + +int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_dump_start(struct netlink_callback *cb); +int ila_xlat_nl_dump_done(struct netlink_callback *cb); +int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb); + +extern unsigned int ila_net_id; + +extern struct genl_family ila_nl_family; #endif /* __ILA_H */ diff --git a/net/ipv6/ila/ila_common.c 
b/net/ipv6/ila/ila_common.c index 8c88ecf29b93..579310466eac 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -154,33 +154,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, iaddr->loc = p->locator; } -static int __init ila_init(void) -{ - int ret; - - ret = ila_lwt_init(); - - if (ret) - goto fail_lwt; - - ret = ila_xlat_init(); - if (ret) - goto fail_xlat; - - return 0; -fail_xlat: - ila_lwt_fini(); -fail_lwt: - return ret; -} - -static void __exit ila_fini(void) -{ - ila_xlat_fini(); - ila_lwt_fini(); -} - -module_init(ila_init); -module_exit(ila_fini); -MODULE_AUTHOR("Tom Herbert "); -MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c new file mode 100644 index 000000000000..f6ac6b14577e --- /dev/null +++ b/net/ipv6/ila/ila_main.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "ila.h" + +static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, + [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, +}; + +static const struct genl_ops ila_nl_ops[] = { + { + .cmd = ILA_CMD_ADD, + .doit = ila_xlat_nl_cmd_add_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_DEL, + .doit = ila_xlat_nl_cmd_del_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_GET, + .doit = ila_xlat_nl_cmd_get_mapping, + .start = ila_xlat_nl_dump_start, + .dumpit = ila_xlat_nl_dump, + .done = ila_xlat_nl_dump_done, + .policy = ila_nl_policy, + }, +}; + +unsigned int ila_net_id; + +struct genl_family ila_nl_family __ro_after_init = { + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .module = 
THIS_MODULE, + .ops = ila_nl_ops, + .n_ops = ARRAY_SIZE(ila_nl_ops), +}; + +static __net_init int ila_init_net(struct net *net) +{ + int err; + + err = ila_xlat_init_net(net); + if (err) + goto ila_xlat_init_fail; + + return 0; + +ila_xlat_init_fail: + return err; +} + +static __net_exit void ila_exit_net(struct net *net) +{ + ila_xlat_exit_net(net); +} + +static struct pernet_operations ila_net_ops = { + .init = ila_init_net, + .exit = ila_exit_net, + .id = &ila_net_id, + .size = sizeof(struct ila_net), +}; + +static int __init ila_init(void) +{ + int ret; + + ret = register_pernet_device(&ila_net_ops); + if (ret) + goto register_device_fail; + + ret = genl_register_family(&ila_nl_family); + if (ret) + goto register_family_fail; + + ret = ila_lwt_init(); + if (ret) + goto fail_lwt; + + return 0; + +fail_lwt: + genl_unregister_family(&ila_nl_family); +register_family_fail: + unregister_pernet_device(&ila_net_ops); +register_device_fail: + return ret; +} + +static void __exit ila_fini(void) +{ + ila_lwt_fini(); + genl_unregister_family(&ila_nl_family); + unregister_pernet_device(&ila_net_ops); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 9cc8beedc2ca..d05de891dfb6 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -22,21 +22,12 @@ struct ila_map { struct rcu_head rcu; }; -static unsigned int ila_net_id; - -struct ila_net { - struct rhashtable rhash_table; - spinlock_t *locks; /* Bucket locks for entry manipulation */ - unsigned int locks_mask; - bool hooks_registered; -}; - #define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { - return alloc_bucket_spinlocks(&ilan->locks, &ilan->locks_mask, + return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, MAX_LOCKS, LOCKS_PER_CPU, GFP_KERNEL); } @@ -58,7 +49,7 @@ static inline u32 ila_locator_hash(struct 
ila_locator loc) static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { - return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask]; + return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, @@ -102,16 +93,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = ila_cmpfn, }; -static struct genl_family ila_nl_family; - -static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { - [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, - [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, - [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, - [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, - [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, -}; - static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { @@ -149,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc, + ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) @@ -166,7 +147,7 @@ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, + ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { @@ -222,7 +203,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->hooks_registered) { + if (!ilan->xlat.hooks_registered) { /* We defer registering net hooks in the namespace until the * first mapping is added. 
*/ @@ -231,7 +212,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) if (err) return err; - ilan->hooks_registered = true; + ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); @@ -246,12 +227,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, + head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ - err = rhashtable_lookup_insert_fast(&ilan->rhash_table, + err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; @@ -277,7 +258,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); - err = rhashtable_replace_fast(&ilan->rhash_table, + err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) @@ -303,7 +284,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, + head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; @@ -333,15 +314,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) * table */ err = rhashtable_replace_fast( - &ilan->rhash_table, &ila->node, + &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ - err = rhashtable_remove_fast(&ilan->rhash_table, - &ila->node, - rht_params); + err = rhashtable_remove_fast( + &ilan->xlat.rhash_table, + &ila->node, rht_params); } } @@ -356,7 +337,7 @@ out: return err; } -static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = 
genl_info_net(info); struct ila_xlat_params p; @@ -369,7 +350,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) return ila_add_mapping(net, &p); } -static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; @@ -421,7 +402,7 @@ nla_put_failure: return -EMSGSIZE; } -static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); @@ -465,7 +446,7 @@ struct ila_dump_iter { int skip; }; -static int ila_nl_dump_start(struct netlink_callback *cb) +int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); @@ -476,7 +457,7 @@ static int ila_nl_dump_start(struct netlink_callback *cb) if (!iter) return -ENOMEM; - ret = rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, + ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter, GFP_KERNEL); if (ret) { kfree(iter); @@ -489,7 +470,7 @@ static int ila_nl_dump_start(struct netlink_callback *cb) return ret; } -static int ila_nl_dump_done(struct netlink_callback *cb) +int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; @@ -500,7 +481,7 @@ static int ila_nl_dump_done(struct netlink_callback *cb) return 0; } -static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; @@ -570,77 +551,35 @@ out_ret: return ret; } -static const struct genl_ops ila_nl_ops[] = { - { - .cmd = 
ILA_CMD_ADD, - .doit = ila_nl_cmd_add_mapping, - .policy = ila_nl_policy, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = ILA_CMD_DEL, - .doit = ila_nl_cmd_del_mapping, - .policy = ila_nl_policy, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = ILA_CMD_GET, - .doit = ila_nl_cmd_get_mapping, - .start = ila_nl_dump_start, - .dumpit = ila_nl_dump, - .done = ila_nl_dump_done, - .policy = ila_nl_policy, - }, -}; - -static struct genl_family ila_nl_family __ro_after_init = { - .hdrsize = 0, - .name = ILA_GENL_NAME, - .version = ILA_GENL_VERSION, - .maxattr = ILA_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, - .module = THIS_MODULE, - .ops = ila_nl_ops, - .n_ops = ARRAY_SIZE(ila_nl_ops), -}; - #define ILA_HASH_TABLE_SIZE 1024 -static __net_init int ila_init_net(struct net *net) +int ila_xlat_init_net(struct net *net) { - int err; struct ila_net *ilan = net_generic(net, ila_net_id); + int err; err = alloc_ila_locks(ilan); if (err) return err; - rhashtable_init(&ilan->rhash_table, &rht_params); + rhashtable_init(&ilan->xlat.rhash_table, &rht_params); return 0; } -static __net_exit void ila_exit_net(struct net *net) +void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); - rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); + rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); - free_bucket_spinlocks(ilan->locks); + free_bucket_spinlocks(ilan->xlat.locks); - if (ilan->hooks_registered) + if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } -static struct pernet_operations ila_net_ops = { - .init = ila_init_net, - .exit = ila_exit_net, - .id = &ila_net_id, - .size = sizeof(struct ila_net), -}; - static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; @@ -667,28 +606,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) return 0; } -int __init ila_xlat_init(void) -{ - int ret; - - ret = 
register_pernet_device(&ila_net_ops); - if (ret) - goto exit; - - ret = genl_register_family(&ila_nl_family); - if (ret < 0) - goto unregister; - - return 0; - -unregister: - unregister_pernet_device(&ila_net_ops); -exit: - return ret; -} - -void ila_xlat_fini(void) -{ - genl_unregister_family(&ila_nl_family); - unregister_pernet_device(&ila_net_ops); -} -- cgit v1.2.3 From b6e71bdebb12cb79f931db358066a33f5f526b6a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Jun 2018 14:39:02 -0700 Subject: ila: Flush netlink command to clear xlat table Add ILA_CMD_FLUSH netlink command to clear the ILA translation table. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 1 + net/ipv6/ila/ila.h | 1 + net/ipv6/ila/ila_main.c | 6 +++++ net/ipv6/ila/ila_xlat.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 68 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 483b77af4eb8..db45d3e49a12 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -30,6 +30,7 @@ enum { ILA_CMD_ADD, ILA_CMD_DEL, ILA_CMD_GET, + ILA_CMD_FLUSH, __ILA_CMD_MAX, }; diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index faba7824ea56..1f747bcbec29 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -123,6 +123,7 @@ void ila_xlat_exit_net(struct net *net); int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info); int ila_xlat_nl_dump_start(struct netlink_callback *cb); int ila_xlat_nl_dump_done(struct netlink_callback *cb); int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index f6ac6b14577e..18fac76b9520 100644 --- 
a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -26,6 +26,12 @@ static const struct genl_ops ila_nl_ops[] = { .policy = ila_nl_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = ILA_CMD_FLUSH, + .doit = ila_xlat_nl_cmd_flush, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, { .cmd = ILA_CMD_GET, .doit = ila_xlat_nl_cmd_get_mapping, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index d05de891dfb6..51a15ce50a64 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -164,9 +164,9 @@ static inline void ila_release(struct ila_map *ila) kfree_rcu(ila, rcu); } -static void ila_free_cb(void *ptr, void *arg) +static void ila_free_node(struct ila_map *ila) { - struct ila_map *ila = (struct ila_map *)ptr, *next; + struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { @@ -176,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg) } } +static void ila_free_cb(void *ptr, void *arg) +{ + ila_free_node((struct ila_map *)ptr); +} + static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int @@ -365,6 +370,59 @@ int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) return 0; } +static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, + struct ila_map *ila) +{ + return ila_get_lock(ilan, ila->xp.ip.locator_match); +} + +int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct rhashtable_iter iter; + struct ila_map *ila; + spinlock_t *lock; + int ret; + + ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL); + if (ret) + goto done; + + rhashtable_walk_start(&iter); + + for (;;) { + ila = rhashtable_walk_next(&iter); + + if (IS_ERR(ila)) { + if (PTR_ERR(ila) == -EAGAIN) + continue; + ret = PTR_ERR(ila); + goto done; + } else if (!ila) { + break; + } + + lock = lock_from_ila_map(ilan, ila); + + spin_lock(lock); + + ret = 
rhashtable_remove_fast(&ilan->xlat.rhash_table, + &ila->node, rht_params); + if (!ret) + ila_free_node(ila); + + spin_unlock(lock); + + if (ret) + break; + } + +done: + rhashtable_walk_stop(&iter); + return ret; +} + static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, -- cgit v1.2.3 From fe0984d38938249f3f11fc558a8845fc6f8a0105 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 17:11:14 +0200 Subject: cfg80211: track time using boottime The cfg80211 layer uses get_seconds() to read the current time in its suspend handling. This function is deprecated because of the 32-bit time_t overflow, and it can cause unexpected behavior when the time changes due to settimeofday() calls or leap second updates. In many cases, we want to use monotonic time instead; however, cfg80211 explicitly tracks the time spent in suspend, so this changes the driver over to use ktime_get_boottime_seconds(), which is slightly slower, but not used in a fastpath here.
Signed-off-by: Arnd Bergmann Signed-off-by: Johannes Berg --- net/wireless/core.h | 2 +- net/wireless/sysfs.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index 63eb1b5fdd04..7f52ef569320 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -76,7 +76,7 @@ struct cfg80211_registered_device { struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct sk_buff *scan_msg; struct list_head sched_scan_req_list; - unsigned long suspend_at; + time64_t suspend_at; struct work_struct scan_done_wk; struct genl_info *cur_cmd_info; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 570a2b67ca10..6ab32f6a1961 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -102,7 +102,7 @@ static int wiphy_suspend(struct device *dev) struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; - rdev->suspend_at = get_seconds(); + rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); if (rdev->wiphy.registered) { @@ -130,7 +130,7 @@ static int wiphy_resume(struct device *dev) int ret = 0; /* Age scan results with time spent in suspend */ - cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); + cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); if (rdev->wiphy.registered && rdev->ops->resume) -- cgit v1.2.3 From 47aa7861b9bf8e8a540f3b11971e4a3f631e8ff4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 18 Jun 2018 07:41:34 -0500 Subject: mac80211: fix potential null pointer dereference he_op is being dereferenced before it is null checked, hence there is a potential null pointer dereference. Fix this by moving the pointer dereference after he_op has been properly null checked. 
Notice that, currently, he_op is already being null checked before calling this function at 4593: 4593 if (!he_oper || 4594 !ieee80211_verify_sta_he_mcs_support(sband, he_oper)) 4595 ifmgd->flags |= IEEE80211_STA_DISABLE_HE; but in case ieee80211_verify_sta_he_mcs_support is ever called without verifying he_oper is not null, we will end up having a null pointer dereference. So we had better not take any chances. Addresses-Coverity-ID: 1470068 ("Dereference before null check") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0322d78007ad..f4513031f1bc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4458,12 +4458,14 @@ ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_sta_cap(sband); - u16 ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + u16 ap_min_req_set; int i; if (!sta_he_cap || !he_op) return false; + ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + /* Need to go over for 80MHz, 160MHz and for 80+80 */ for (i = 0; i < 3; i++) { const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = -- cgit v1.2.3 From f0c0407d2a9fc3b2be33ec6c67ebc1f73595d2cb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 Jun 2018 09:51:39 +0200 Subject: mac80211: remove unnecessary NULL check We don't need to check if he_oper is NULL before calling ieee80211_verify_sta_he_mcs_support() as it - now - will correctly check this itself. Remove the redundant check.
Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f4513031f1bc..7fb9957359a3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4592,8 +4592,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, else he_oper = NULL; - if (!he_oper || - !ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } -- cgit v1.2.3 From 397c657a0644e7607c6aebea84d2b0f08ab59dfc Mon Sep 17 00:00:00 2001 From: Omer Efrat Date: Sun, 17 Jun 2018 13:06:14 +0300 Subject: cfg80211: use BIT_ULL for NL80211_STA_INFO_* attribute types The BIT macro uses unsigned long which some architectures handle as 32 bit and therefore might cause the macro's shift to overflow when used on a value equal to or larger than 32 (NL80211_STA_INFO_RX_DURATION and afterwards). Since the 'filled' member in station_info changed to u64, the BIT_ULL macro should be used with all NL80211_STA_INFO_* attribute types instead of BIT to prevent possible future bugs when someone uses the BIT macro for higher attributes by mistake. This commit cleans up all usages of the BIT macro with the above field in cfg80211 by changing it to BIT_ULL instead. In addition, there are some places which use neither the BIT nor the BIT_ULL macro, so align those as well.
Signed-off-by: Omer Efrat Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 26 +++++++++++++------------- net/wireless/wext-compat.c | 10 +++++----- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0ccce338a66e..350d2962524c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4619,13 +4619,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, #define PUT_SINFO(attr, memb, type) do { \ BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \ - if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb)) \ goto nla_put_failure; \ } while (0) #define PUT_SINFO_U64(attr, memb) do { \ - if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb, NL80211_STA_INFO_PAD)) \ goto nla_put_failure; \ @@ -4634,14 +4634,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(CONNECTED_TIME, connected_time, u32); PUT_SINFO(INACTIVE_TIME, inactive_time, u32); - if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) | - BIT(NL80211_STA_INFO_RX_BYTES64)) && + if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES, (u32)sinfo->rx_bytes)) goto nla_put_failure; - if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) | - BIT(NL80211_STA_INFO_TX_BYTES64)) && + if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES, (u32)sinfo->tx_bytes)) goto nla_put_failure; @@ -4661,24 +4661,24 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, default: break; } - if (sinfo->filled & 
BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) { if (!nl80211_put_signal(msg, sinfo->chains, sinfo->chain_signal, NL80211_STA_INFO_CHAIN_SIGNAL)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { if (!nl80211_put_signal(msg, sinfo->chains, sinfo->chain_signal_avg, NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) { if (!nl80211_put_sta_rate(msg, &sinfo->txrate, NL80211_STA_INFO_TX_BITRATE)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) { if (!nl80211_put_sta_rate(msg, &sinfo->rxrate, NL80211_STA_INFO_RX_BITRATE)) goto nla_put_failure; @@ -4694,7 +4694,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); - if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); if (!bss_param) goto nla_put_failure; @@ -4713,7 +4713,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, nla_nest_end(msg, bss_param); } - if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) && + if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) && nla_put(msg, NL80211_STA_INFO_STA_FLAGS, sizeof(struct nl80211_sta_flag_update), &sinfo->sta_flags)) @@ -10266,7 +10266,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (err) return err; - if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) wdev->cqm_config->last_rssi_event_value = (s8) sinfo.rx_beacon_signal_avg; } diff --git 
a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 05186a47878f..167f7025ac98 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,7 +1278,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))) + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) return -EOPNOTSUPP; rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); @@ -1320,7 +1320,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { int sig = sinfo.signal; wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; @@ -1334,7 +1334,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) break; } case CFG80211_SIGNAL_TYPE_UNSPEC: - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.level = sinfo.signal; @@ -1347,9 +1347,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; - if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) wstats.discard.misc = sinfo.rx_dropped_misc; - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; return &wstats; -- cgit v1.2.3 From a4217750586975dee7d6dd8829a1be24a7678b3d Mon Sep 17 00:00:00 2001 From: Omer Efrat Date: Sun, 17 Jun 2018 13:06:25 +0300 Subject: mac80211: use BIT_ULL for NL80211_STA_INFO_* attribute types The BIT macro uses unsigned long which some architectures handle as 32 
bit and therefore might cause macro's shift to overflow when used on a value equals or larger than 32 (NL80211_STA_INFO_RX_DURATION and afterwards). Since 'filled' member in station_info changed to u64, BIT_ULL macro should be used with all NL80211_STA_INFO_* attribute types instead of BIT to prevent future possible bugs when one will use BIT macro for higher attributes by mistake. This commit cleans up all usages of BIT macro with the above field in mac80211 by changing it to BIT_ULL instead. Signed-off-by: Omer Efrat Signed-off-by: Johannes Berg --- net/mac80211/ethtool.c | 6 ++-- net/mac80211/sta_info.c | 84 ++++++++++++++++++++++++------------------------- 2 files changed, 45 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 690c142a7a44..5ac743816b59 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -116,16 +116,16 @@ static void ieee80211_get_stats(struct net_device *dev, data[i++] = sta->sta_state; - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; - if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index aa8fe771a8db..f34202242d24 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2114,38 +2114,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, drv_sta_statistics(local, sdata, &sta->sta, sinfo); - sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) | - BIT(NL80211_STA_INFO_STA_FLAGS) | - BIT(NL80211_STA_INFO_BSS_PARAM) | - 
BIT(NL80211_STA_INFO_CONNECTED_TIME) | - BIT(NL80211_STA_INFO_RX_DROP_MISC); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | + BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | + BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count; - sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); - if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | - BIT(NL80211_STA_INFO_TX_BYTES)))) { + if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->tx_stats.bytes[ac]; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->tx_stats.packets[ac]; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } - if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) | - BIT(NL80211_STA_INFO_RX_BYTES)))) { + if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { @@ -2157,10 +2157,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } } - sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } - if (!(sinfo->filled & 
BIT(NL80211_STA_INFO_RX_PACKETS))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->rx_stats.packets; if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { @@ -2170,17 +2170,17 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->rx_packets += cpurxs->packets; } } - sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->status_stats.retry_count; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->status_stats.retry_failed; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } sinfo->rx_dropped_misc = sta->rx_stats.dropped; @@ -2195,23 +2195,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { - sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) | - BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | + BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { - if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; - sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->pcpu_rx_stats && - !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { + 
!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->rx_stats_avg.signal); - sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } @@ -2220,11 +2220,11 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, * pcpu statistics */ if (last_rxstats->chains && - !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | - BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL); + !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->pcpu_rx_stats) - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; @@ -2236,15 +2236,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &sinfo->txrate); - sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) - sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { @@ -2257,18 +2257,18 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH - sinfo->filled |= BIT(NL80211_STA_INFO_LLID) | - BIT(NL80211_STA_INFO_PLID) | - BIT(NL80211_STA_INFO_PLINK_STATE) | - BIT(NL80211_STA_INFO_LOCAL_PM) | - 
BIT(NL80211_STA_INFO_PEER_PM) | - BIT(NL80211_STA_INFO_NONPEER_PM); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | + BIT_ULL(NL80211_STA_INFO_PLID) | + BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | + BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | + BIT_ULL(NL80211_STA_INFO_PEER_PM) | + BIT_ULL(NL80211_STA_INFO_NONPEER_PM); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; @@ -2313,7 +2313,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, thr = sta_get_expected_throughput(sta); if (thr != 0) { - sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } -- cgit v1.2.3 From b0e9a2fe3ff971950833bc0ffc383babd9443bc4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Jun 2018 15:31:00 +0800 Subject: sctp: add support for SCTP_REUSE_PORT sockopt This feature is actually already supported by sk->sk_reuse which can be set by socket level opt SO_REUSEADDR. But it's not working exactly as RFC6458 demands in section 8.1.27, like: - This option only supports one-to-one style SCTP sockets - This socket option must not be used after calling bind() or sctp_bindx(). Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs. Otherwise, the programs with SCTP_REUSE_PORT from other systems will not work in linux. To separate it from the socket level version, this patch adds 'reuse' in sctp_sock and it works pretty much as sk->sk_reuse, but with some extra setup limitations that are needed when it is being enabled. 
"It should be noted that the behavior of the socket-level socket option to reuse ports and/or addresses for SCTP sockets is unspecified", so it leaves SO_REUSEADDR as is for the compatibility. Note that the name SCTP_REUSE_PORT is somewhat confusing, as its functionality is nearly identical to SO_REUSEADDR, but with some extra restrictions. Here it uses 'reuse' in sctp_sock instead of 'reuseport'. As for sk->sk_reuseport support for SCTP, it will be added in another patch. Thanks to Neil to make this clear. v1->v2: - add sctp_sk->reuse to separate it from the socket level version. v2->v3: - improve changelog according to Marcelo's suggestion. Acked-by: Neil Horman Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 62 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 57 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index e0f962d27386..701a51736fa5 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -220,6 +220,7 @@ struct sctp_sock { __u32 adaptation_ind; __u32 pd_point; __u16 nodelay:1, + reuse:1, disable_fragments:1, v4mapped:1, frag_interleave:1, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index b64d583bf053..c02986a284db 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -100,6 +100,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RECVNXTINFO 33 #define SCTP_DEFAULT_SNDINFO 34 #define SCTP_AUTH_DEACTIVATE_KEY 35 +#define SCTP_REUSE_PORT 36 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0e91e83eea5a..bf11f9cacb63 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4170,6 +4170,28 @@ out: return retval; } +static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + int val; + + if (!sctp_style(sk, TCP)) + return -EOPNOTSUPP; + + if (sctp_sk(sk)->ep->base.bind_addr.port) + return -EFAULT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->reuse = !!val; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4364,6 +4386,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_interleaving_supported(sk, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_setsockopt_reuse_port(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7197,6 +7222,26 @@ out: return retval; } +static int sctp_getsockopt_reuse_port(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = sctp_sk(sk)->reuse; + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7392,6 +7437,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_interleaving_supported(sk, len, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7429,6 +7477,7 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { + bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); struct 
sctp_bind_hashbucket *head; /* hash list */ struct sctp_bind_bucket *pp; unsigned short snum; @@ -7501,13 +7550,11 @@ pp_found: * used by other socket (pp->owner not empty); that other * socket is going to be sk2. */ - int reuse = sk->sk_reuse; struct sock *sk2; pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && sk->sk_reuse && - sk->sk_state != SCTP_SS_LISTENING) + if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) goto success; /* Run through the list of sockets bound to the port @@ -7525,7 +7572,7 @@ pp_found: ep2 = sctp_sk(sk2)->ep; if (sk == sk2 || - (reuse && sk2->sk_reuse && + (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && sk2->sk_state != SCTP_SS_LISTENING)) continue; @@ -7549,12 +7596,12 @@ pp_not_found: * SO_REUSEADDR on this socket -sk-). */ if (hlist_empty(&pp->owner)) { - if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) + if (reuse && sk->sk_state != SCTP_SS_LISTENING) pp->fastreuse = 1; else pp->fastreuse = 0; } else if (pp->fastreuse && - (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; /* We are set, so fill up all the data in the hash table @@ -7685,7 +7732,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) err = 0; sctp_unhash_endpoint(ep); sk->sk_state = SCTP_SS_CLOSED; - if (sk->sk_reuse) + if (sk->sk_reuse || sctp_sk(sk)->reuse) sctp_sk(sk)->bind_hash->fastreuse = 1; goto out; } @@ -8550,6 +8597,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; + sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; newsk->sk_destruct = sctp_destruct_sock; -- cgit v1.2.3 From a1165b591925551d7c8f1ed45484ebc2847ec8fa Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 26 Jun 2018 21:39:34 -0700 Subject: net/sched: act_tunnel_key: disambiguate metadata dst error cases Metadata may be 
NULL for one of two reasons: * Missing user input * Failure to allocate the metadata dst Disambiguate these cases by returning -EINVAL for the former and -ENOMEM for the latter rather than -EINVAL for both cases. This is in preparation for using extended ack to provide more information to users when parsing their input. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 626dac81a48a..2edd389e7c92 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -143,10 +143,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, 0, flags, key_id, 0); + } else { + ret = -EINVAL; + goto err_out; } if (!metadata) { - ret = -EINVAL; + ret = -ENOMEM; goto err_out; } -- cgit v1.2.3 From 9d7298cd1dc55ebe053686f9bce74bfdcc812399 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 26 Jun 2018 21:39:35 -0700 Subject: net/sched: act_tunnel_key: add extended ack support Add extended ack support for the tunnel key action by using NL_SET_ERR_MSG during validation of user input. Cc: Alexander Aring Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- net/sched/act_tunnel_key.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2edd389e7c92..20e98ed8d498 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -86,16 +86,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, int ret = 0; int err; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, - NULL); - if (err < 0) + extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes"); return err; + } - if (!tb[TCA_TUNNEL_KEY_PARMS]) + if (!tb[TCA_TUNNEL_KEY_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key parameters"); return -EINVAL; + } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -107,6 +113,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, break; case TCA_TUNNEL_KEY_ACT_SET: if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key id"); ret = -EINVAL; goto err_out; } @@ -144,11 +151,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, 0, flags, key_id, 0); } else { + NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL; goto err_out; } if (!metadata) { + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst"); ret = -ENOMEM; goto err_out; } @@ -156,6 +165,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: + NL_SET_ERR_MSG(extack, "Unknown tunnel key action"); ret = -EINVAL; goto err_out; } @@ -163,14 +173,18 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_tunnel_key_ops, bind, true); - if (ret) + if (ret) { + 
NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + NL_SET_ERR_MSG(extack, "TC IDR already exists"); return -EEXIST; + } } t = to_tunnel_key(*a); @@ -180,6 +194,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (unlikely(!params_new)) { if (ret == ACT_P_CREATED) tcf_idr_release(*a, bind); + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } -- cgit v1.2.3 From 256c87c17c53e60882a43dcf3e98f3bf859eaf6f Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 26 Jun 2018 21:39:36 -0700 Subject: net: check tunnel option type in tunnel flags Check the tunnel option type stored in tunnel flags when creating options for tunnels. Thereby ensuring we do not set geneve, vxlan or erspan tunnel options on interfaces that are not associated with them. Make sure all users of the infrastructure set correct flags, for the BPF helper we have to set all bits to keep backward compatibility. Signed-off-by: Pieter Jansen van Vuuren Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 ++++-- drivers/net/vxlan.c | 3 ++- include/net/ip_tunnels.h | 8 ++++++-- net/core/filter.c | 2 +- net/ipv4/ip_gre.c | 2 ++ net/ipv6/ip6_gre.c | 2 ++ net/openvswitch/flow_netlink.c | 7 ++++++- 7 files changed, 23 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3e94375b9b01..471edd76ff55 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -236,7 +236,8 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } /* Update tunnel dst according to Geneve options. */ ip_tunnel_info_opts_set(&tun_dst->u.tun_info, - gnvh->options, gnvh->opt_len * 4); + gnvh->options, gnvh->opt_len * 4, + TUNNEL_GENEVE_OPT); } else { /* Drop packets w/ critical options, * since we don't support any... 
@@ -675,7 +676,8 @@ static void geneve_build_header(struct genevehdr *geneveh, geneveh->proto_type = htons(ETH_P_TEB); geneveh->rsvd2 = 0; - ip_tunnel_info_opts_get(geneveh->options, info); + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) + ip_tunnel_info_opts_get(geneveh->options, info); } static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index cc14e0cd5647..7eb30d7c8bd7 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2122,7 +2122,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; - if (info->options_len) + if (info->options_len && + info->key.tun_flags & TUNNEL_VXLAN_OPT) md = ip_tunnel_info_opts(info); ttl = info->key.ttl; tos = info->key.tos; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 90ff430f5e9d..b0d022ff6ea1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -466,10 +466,12 @@ static inline void ip_tunnel_info_opts_get(void *to, } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len) + const void *from, int len, + __be16 flags) { memcpy(ip_tunnel_info_opts(info), from, len); info->options_len = len; + info->key.tun_flags |= flags; } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) @@ -511,9 +513,11 @@ static inline void ip_tunnel_info_opts_get(void *to, } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len) + const void *from, int len, + __be16 flags) { info->options_len = 0; + info->key.tun_flags |= flags; } #endif /* CONFIG_INET */ diff --git a/net/core/filter.c b/net/core/filter.c index e7f12e9f598c..dade922678f6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3582,7 +3582,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, if (unlikely(size > 
IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size); + ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); return 0; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2d8efeecf619..c8ca5d8f0f75 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c8cf2fdbb13b..367177786e34 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -990,6 +990,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto tx_err; md = ip_tunnel_info_opts(tun_info); if (!md) goto tx_err; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 492ab0c36f7c..391c4073a6dc 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2516,7 +2516,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; + __be16 dst_opt_type; + dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2528,10 +2530,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; + dst_opt_type = TUNNEL_GENEVE_OPT; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + dst_opt_type = TUNNEL_VXLAN_OPT; break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + dst_opt_type = TUNNEL_ERSPAN_OPT; break; } } @@ -2574,7 +2579,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, */ 
ip_tunnel_info_opts_set(tun_info, TUN_METADATA_OPTS(&key, key.tun_opts_len), - key.tun_opts_len); + key.tun_opts_len, dst_opt_type); add_nested_action_end(*sfa, start); return err; -- cgit v1.2.3 From 0ed5269f9e41f495c8e9020c85f5e1644c1afc57 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 26 Jun 2018 21:39:37 -0700 Subject: net/sched: add tunnel option support to act_tunnel_key Allow setting tunnel options using the act_tunnel_key action. Options are expressed as class:type:data and multiple options may be listed using a comma delimiter. # ip link add name geneve0 type geneve dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ geneve_opts 0102:80:00800022,0102:80:00800022 \ action mirred egress redirect dev geneve0 Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 26 ++++ net/sched/act_tunnel_key.c | 214 +++++++++++++++++++++++++++++- 2 files changed, 236 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 72bbefe5d1d1..e284fec8c467 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -36,9 +36,35 @@ enum { TCA_TUNNEL_KEY_PAD, TCA_TUNNEL_KEY_ENC_DST_PORT, /* be16 */ TCA_TUNNEL_KEY_NO_CSUM, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPTS, /* Nested TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_MAX, }; #define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ + __TCA_TUNNEL_KEY_ENC_OPTS_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPTS_MAX (__TCA_TUNNEL_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, /* be16 */ + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) + #endif diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 20e98ed8d498..ea203e386a92 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,135 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, return action; } +static const struct nla_policy +enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + 
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, + .len = 128 }, +}; + +static int +tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; + int err, data_len, opt_len; + u8 *data; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data"); + return -EINVAL; + } + + data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + if (data_len < 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); + return -ERANGE; + } + if (data_len % 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); + return -ERANGE; + } + + opt_len = sizeof(struct geneve_opt) + data_len; + if (dst) { + struct geneve_opt *opt = dst; + + WARN_ON(dst_len < opt_len); + + opt->opt_class = + nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]); + opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]); + opt->length = data_len / 4; /* length is in units of 4 bytes */ + opt->r1 = 0; + opt->r2 = 0; + opt->r3 = 0; + + memcpy(opt + 1, data, data_len); + } + + return opt_len; +} + +static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, + int dst_len, struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + const struct nlattr *attr, *head = nla_data(nla); + + err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(attr, head, len, rem) { + switch 
(nla_type(attr)) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + opt_len = tunnel_key_copy_geneve_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (dst) { + dst_len -= opt_len; + dst += opt_len; + } + break; + } + } + + if (!opts_len) { + NL_SET_ERR_MSG(extack, "Empty list of tunnel options"); + return -EINVAL; + } + + if (rem > 0) { + NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes"); + return -EINVAL; + } + + return opts_len; +} + +static int tunnel_key_get_opts_len(struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + return tunnel_key_copy_opts(nla, NULL, 0, extack); +} + +static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, + int opts_len, struct netlink_ext_ack *extack) +{ + info->options_len = opts_len; + switch (nla_type(nla_data(nla))) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + default: + NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); + return -EINVAL; + } +} + static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, @@ -66,6 +196,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, @@ -81,6 +212,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + int opts_len = 0; __be64 key_id; __be16 flags; int ret = 0; @@ 
-128,6 +260,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); + if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) { + opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS], + extack); + if (opts_len < 0) { + ret = opts_len; + goto err_out; + } + } + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -138,7 +279,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, dst_port, flags, - key_id, 0); + key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; @@ -162,6 +303,14 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, goto err_out; } + if (opts_len) { + ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], + &metadata->u.tun_info, + opts_len, extack); + if (ret < 0) + goto err_out; + } + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: @@ -234,6 +383,61 @@ static void tunnel_key_release(struct tc_action *a) } } +static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + int len = info->options_len; + u8 *src = (u8 *)(info + 1); + struct nlattr *start; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); + if (!start) + return -EMSGSIZE; + + while (len > 0) { + struct geneve_opt *opt = (struct geneve_opt *)src; + + if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, + opt->type) || + nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, + opt->length * 4, opt + 1)) + return -EMSGSIZE; + + len -= sizeof(struct geneve_opt) + opt->length * 4; + src += sizeof(struct geneve_opt) + opt->length * 4; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) 
+{ + struct nlattr *start; + int err; + + if (!info->options_len) + return 0; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS); + if (!start) + return -EMSGSIZE; + + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + err = tunnel_key_geneve_opts_dump(skb, info); + if (err) + return err; + } else { + return -EINVAL; + } + + nla_nest_end(skb, start); + return 0; +} + static int tunnel_key_dump_addresses(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -284,8 +488,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { - struct ip_tunnel_key *key = - &params->tcft_enc_metadata->u.tun_info.key; + struct ip_tunnel_info *info = + &params->tcft_enc_metadata->u.tun_info; + struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || @@ -293,7 +498,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, &params->tcft_enc_metadata->u.tun_info) || nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM))) + !(key->tun_flags & TUNNEL_CSUM)) || + tunnel_key_opts_dump(skb, info)) goto nla_put_failure; } -- cgit v1.2.3 From ea5d0c32498e1a08ff5f3dbeafa4d74895851b0d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 28 Jun 2018 00:22:56 -0400 Subject: tcp: add new SNMP counter for drops when try to queue in rcv queue When sk_rmem_alloc is larger than the receive buffer and we can't schedule more memory for it, the skb will be dropped. In above situation, if this skb is put into the ofo queue, LINUX_MIB_TCPOFODROP is incremented to track it. While if this skb is put into the receive queue, there's no record. So a new SNMP counter is introduced to track this behavior. LINUX_MIB_TCPRCVQDROP: Number of packets meant to be queued in rcv queue but dropped because socket rcvbuf limit hit. 
Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 8 ++++++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 97517f36a5f9..e5ebc83827ab 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -280,6 +280,7 @@ enum LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */ LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */ + LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 225ef3433fe5..b46e4cf9a55a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), + SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c5b3415413f..eecd359595fc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4611,8 +4611,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) skb->data_len = data_len; skb->len = size; - if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; + } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) @@ -4677,8 +4679,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); 
goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); -- cgit v1.2.3 From be6a3f38ff2a2bfd2e591fdc566940a0d4d9428c Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 28 Jun 2018 19:05:04 +0200 Subject: net/smc: determine port attributes independent from pnet table For SMC it is important to know the current port state of RoCE devices. Monitoring port states has been triggered, when a RoCE device was added to the pnet table. To support future alternatives to the pnet table the monitoring of ports is made independent of the existence of a pnet table. It starts once the smc_ib_device is established. Due to this change smc_ib_remember_port_attr() is now a local function and shuffling its location and the location of its used functions makes any forward references obsolete. And the duplicate SMC_MAX_PORTS definition is removed. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 2 - net/smc/smc_ib.c | 130 ++++++++++++++++++++++++++++------------------------- net/smc/smc_ib.h | 1 - net/smc/smc_pnet.c | 7 +-- 4 files changed, 72 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 51ae1f10d81a..7c86f716a92e 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,8 +21,6 @@ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ -#define SMC_MAX_PORTS 2 /* Max # of ports */ - extern struct proto smc_proto; extern struct proto smc_proto6; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0eed7ab9f28b..f8b159ced032 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -143,6 +143,62 @@ out: return rc; } +static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct ib_gid_attr gattr; + int rc; + + rc = ib_query_gid(smcibdev->ibdev, ibport, 0, + &smcibdev->gid[ibport - 1], &gattr); + if (rc || !gattr.ndev) + return -ENODEV; + + memcpy(smcibdev->mac[ibport - 1], 
gattr.ndev->dev_addr, ETH_ALEN); + dev_put(gattr.ndev); + return 0; +} + +/* Create an identifier unique for this instance of SMC-R. + * The MAC-address of the first active registered IB device + * plus a random 2-byte number is used to create this identifier. + * This name is delivered to the peer during connection initialization. + */ +static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, + u8 ibport) +{ + memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); + get_random_bytes(&local_systemid[0], 2); +} + +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) +{ + return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; +} + +static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) +{ + int rc; + + memset(&smcibdev->pattr[ibport - 1], 0, + sizeof(smcibdev->pattr[ibport - 1])); + rc = ib_query_port(smcibdev->ibdev, ibport, + &smcibdev->pattr[ibport - 1]); + if (rc) + goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ + rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); + if (rc) + goto out; + if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, + sizeof(local_systemid)) && + smc_ib_port_active(smcibdev, ibport)) + /* create unique system identifier */ + smc_ib_define_local_systemid(smcibdev, ibport); +out: + return rc; +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -370,62 +426,6 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; } -static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct ib_gid_attr gattr; - int rc; - - rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], &gattr); - if (rc || !gattr.ndev) - return -ENODEV; - - memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); - dev_put(gattr.ndev); - 
return 0; -} - -/* Create an identifier unique for this instance of SMC-R. - * The MAC-address of the first active registered IB device - * plus a random 2-byte number is used to create this identifier. - * This name is delivered to the peer during connection initialization. - */ -static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, - u8 ibport) -{ - memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], - sizeof(smcibdev->mac[ibport - 1])); - get_random_bytes(&local_systemid[0], 2); -} - -bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) -{ - return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; -} - -int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) -{ - int rc; - - memset(&smcibdev->pattr[ibport - 1], 0, - sizeof(smcibdev->pattr[ibport - 1])); - rc = ib_query_port(smcibdev->ibdev, ibport, - &smcibdev->pattr[ibport - 1]); - if (rc) - goto out; - /* the SMC protocol requires specification of the RoCE MAC address */ - rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); - if (rc) - goto out; - if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, - sizeof(local_systemid)) && - smc_ib_port_active(smcibdev, ibport)) - /* create unique system identifier */ - smc_ib_define_local_systemid(smcibdev, ibport); -out: - return rc; -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { @@ -454,9 +454,6 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smcibdev->roce_cq_recv = NULL; goto err; } - INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, - smc_ib_global_event_handler); - ib_register_event_handler(&smcibdev->event_handler); smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; return rc; @@ -472,7 +469,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) return; smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); - ib_unregister_event_handler(&smcibdev->event_handler); 
ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); } @@ -483,6 +479,8 @@ static struct ib_client smc_ib_client; static void smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; + u8 port_cnt; + int i; if (ibdev->node_type != RDMA_NODE_IB_CA) return; @@ -498,6 +496,17 @@ static void smc_ib_add_dev(struct ib_device *ibdev) list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); ib_set_client_data(ibdev, &smc_ib_client, smcibdev); + INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, + smc_ib_global_event_handler); + ib_register_event_handler(&smcibdev->event_handler); + + /* trigger reading of the port attributes */ + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; + i < min_t(size_t, port_cnt, SMC_MAX_PORTS); + i++) + set_bit(i, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); } /* callback function for ib_register_client() */ @@ -512,6 +521,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_unlock(&smc_ib_devices.lock); smc_pnet_remove_by_ibdev(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); + ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index e90630dadf8e..2c480b352928 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -51,7 +51,6 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index d7b88b2d1b22..a82a5cad0282 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -358,9 +358,6 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) 
kfree(pnetelem); return rc; } - rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); - if (rc) - smc_pnet_remove_by_pnetid(pnetelem->pnet_name); return rc; } @@ -485,10 +482,10 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); + return NOTIFY_OK; default: - break; + return NOTIFY_DONE; } - return NOTIFY_DONE; } static struct notifier_block smc_netdev_notifier = { -- cgit v1.2.3 From 0afff91c6f5ecef27715ea71e34dc2baacba1060 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 28 Jun 2018 19:05:05 +0200 Subject: net/smc: add pnetid support s390 hardware supports the definition of a so-called Physical NETwork IDentifier (short PNETID) per network device port. These PNETIDS can be used to identify network devices that are attached to the same physical network (broadcast domain). On s390 try to use the PNETID of the ethernet device port used for initial connecting, and derive the IB device port used for SMC RDMA traffic. On platforms without PNETID support fall back to the existing solution of a configured pnet table. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/smc.h | 2 + net/smc/smc_ib.c | 6 ++- net/smc/smc_ib.h | 3 ++ net/smc/smc_pnet.c | 109 +++++++++++++++++++++++++++++++++++++++++++---------- net/smc/smc_pnet.h | 14 +++++++ 5 files changed, 114 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/smc.h b/include/net/smc.h index 8381d163fefa..2173932fab9d 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -11,6 +11,8 @@ #ifndef _SMC_H #define _SMC_H +#define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ + struct smc_hashinfo { rwlock_t lock; struct hlist_head ht; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f8b159ced032..36de2fd76170 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -504,8 +504,12 @@ static void smc_ib_add_dev(struct ib_device *ibdev) port_cnt = smcibdev->ibdev->phys_port_cnt; for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); - i++) + i++) { set_bit(i, &smcibdev->port_event_mask); + /* determine pnetids of the port */ + smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i]); + } schedule_work(&smcibdev->port_event_work); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 2c480b352928..7c1223c91229 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -15,6 +15,7 @@ #include #include #include +#include #define SMC_MAX_PORTS 2 /* Max # of ports */ #define SMC_GID_SIZE sizeof(union ib_gid) @@ -40,6 +41,8 @@ struct smc_ib_device { /* ib-device infos for smc */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ + u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; + /* pnetid per port */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index a82a5cad0282..cdc6e23b6ce1 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -23,12 +23,10 @@ #include "smc_pnet.h" #include "smc_ib.h" -#define SMC_MAX_PNET_ID_LEN 16 /* Max. 
length of PNET id */ - static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, - .len = SMC_MAX_PNET_ID_LEN - 1 + .len = SMC_MAX_PNETID_LEN - 1 }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, @@ -65,7 +63,7 @@ static struct smc_pnettable { */ struct smc_pnetentry { struct list_head list; - char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct net_device *ndev; struct smc_ib_device *smcibdev; u8 ib_port; @@ -209,7 +207,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) return false; while (--end >= bf && isspace(*end)) ; - if (end - bf >= SMC_MAX_PNET_ID_LEN) + if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) @@ -512,26 +510,70 @@ void smc_pnet_exit(void) genl_unregister_family(&smc_pnet_nl_family); } -/* PNET table analysis for a given sock: - * determine ib_device and port belonging to used internal TCP socket - * ethernet interface. +/* Determine one base device for stacked net devices. + * If the lower device level contains more than one devices + * (for instance with bonding slaves), just the first device + * is used to reach a base device. */ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport) +static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { - struct dst_entry *dst = sk_dst_get(sk); - struct smc_pnetentry *pnetelem; + int i, nest_lvl; - *smcibdev = NULL; - *ibport = 0; + rtnl_lock(); + nest_lvl = dev_get_nest_level(ndev); + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = netdev_lower_get_next(ndev, &lower); + } + rtnl_unlock(); + return ndev; +} + +/* Determine the corresponding IB device port based on the hardware PNETID. + * Searching stops at the first matching active IB device port. 
+ */ +static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, + struct smc_ib_device **smcibdev, + u8 *ibport) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smc_ib_device *ibdev; + int i; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid, + SMC_MAX_PNETID_LEN) && + smc_ib_port_active(ibdev, i)) { + *smcibdev = ibdev; + *ibport = i; + break; + } + } + } + spin_unlock(&smc_ib_devices.lock); +} + +/* Lookup of coupled ib_device via SMC pnet table */ +static void smc_pnet_find_roce_by_table(struct net_device *netdev, + struct smc_ib_device **smcibdev, + u8 *ibport) +{ + struct smc_pnetentry *pnetelem; - if (!dst) - return; - if (!dst->dev) - goto out_rel; read_lock(&smc_pnettable.lock); list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { - if (dst->dev == pnetelem->ndev) { + if (netdev == pnetelem->ndev) { if (smc_ib_port_active(pnetelem->smcibdev, pnetelem->ib_port)) { *smcibdev = pnetelem->smcibdev; @@ -541,6 +583,35 @@ void smc_pnet_find_roce_resource(struct sock *sk, } } read_unlock(&smc_pnettable.lock); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. 
+ */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport); + if (*smcibdev) + goto out_rel; + + /* lookup via SMC PNET table */ + smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport); + out_rel: dst_release(dst); +out: + return; } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5a29519db976..ad4455cde9e7 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -12,8 +12,22 @@ #ifndef _SMC_PNET_H #define _SMC_PNET_H +#if IS_ENABLED(CONFIG_HAVE_PNETID) +#include +#endif + struct smc_ib_device; +static inline int smc_pnetid_by_dev_port(struct device *dev, + unsigned short port, u8 *pnetid) +{ +#if IS_ENABLED(CONFIG_HAVE_PNETID) + return pnet_id_by_dev_port(dev, port, pnetid); +#else + return -ENOENT; +#endif +} + int smc_pnet_init(void) __init; void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); -- cgit v1.2.3 From e82f2e31f5597a3de44bd27b7427f577f637c552 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 28 Jun 2018 19:05:06 +0200 Subject: net/smc: optimize consumer cursor updates The SMC protocol requires to send a separate consumer cursor update, if it cannot be piggybacked to updates of the producer cursor. Currently the decision to send a separate consumer cursor update just considers the amount of data already received by the socket program. It does not consider the amount of data already arrived, but not yet consumed by the receiver. 
Basing the decision on the difference between already confirmed and already arrived data (instead of difference between already confirmed and already consumed data), may lead to a somewhat earlier consumer cursor update send in fast unidirectional traffic scenarios, and thus to better throughput. Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index cee666400752..f82886b7d1d8 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -495,7 +495,8 @@ out: void smc_tx_consumer_update(struct smc_connection *conn, bool force) { - union smc_host_cursor cfed, cons; + union smc_host_cursor cfed, cons, prod; + int sender_free = conn->rmb_desc->len; int to_confirm; smc_curs_write(&cons, @@ -505,11 +506,18 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) smc_curs_read(&conn->rx_curs_confirmed, conn), conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); + if (to_confirm > conn->rmbe_update_limit) { + smc_curs_write(&prod, + smc_curs_read(&conn->local_rx_ctrl.prod, conn), + conn); + sender_free = conn->rmb_desc->len - + smc_curs_diff(conn->rmb_desc->len, &prod, &cfed); + } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || force || ((to_confirm > conn->rmbe_update_limit) && - ((to_confirm > (conn->rmb_desc->len / 2)) || + ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && conn->alert_token_local) { /* connection healthy */ -- cgit v1.2.3 From c6ba7c9ba43de1b57e9a53946e7ff988554c84ed Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:07 +0200 Subject: net/smc: add base infrastructure for SMC-D and ISM SMC supports two variants: SMC-R and SMC-D. 
For data transport, SMC-R uses RDMA devices, SMC-D uses so-called Internal Shared Memory (ISM) devices. An ISM device only allows shared memory communication between SMC instances on the same machine. For example, this allows virtual machines on the same host to communicate via SMC without RDMA devices. This patch adds the base infrastructure for SMC-D and ISM devices to the existing SMC code. It contains the following: * ISM driver interface: This interface allows an ISM driver to register ISM devices in SMC. In the process, the driver provides a set of device ops for each device. SMC uses these ops to execute SMC specific operations on or transfer data over the device. * Core SMC-D link group, connection, and buffer support: Link groups, SMC connections and SMC buffers (in smc_core) are extended to support SMC-D. * SMC type checks: Some type checks are added to prevent using SMC-R specific code for SMC-D and vice versa. To actually use SMC-D, additional changes to pnetid, CLC, CDC, etc. are required. These are added in follow-up patches. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. 
Miller --- include/net/smc.h | 62 +++++++++++ net/smc/Makefile | 2 +- net/smc/af_smc.c | 11 +- net/smc/smc_core.c | 270 +++++++++++++++++++++++++++++++++++------------ net/smc/smc_core.h | 71 +++++++++---- net/smc/smc_diag.c | 3 +- net/smc/smc_ism.c | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_ism.h | 48 +++++++++ 8 files changed, 679 insertions(+), 92 deletions(-) create mode 100644 net/smc/smc_ism.c create mode 100644 net/smc/smc_ism.h (limited to 'net') diff --git a/include/net/smc.h b/include/net/smc.h index 2173932fab9d..824a7af8d654 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -20,4 +20,66 @@ struct smc_hashinfo { int smc_hash_sk(struct sock *sk); void smc_unhash_sk(struct sock *sk); + +/* SMCD/ISM device driver interface */ +struct smcd_dmb { + u64 dmb_tok; + u64 rgid; + u32 dmb_len; + u32 sba_idx; + u32 vlan_valid; + u32 vlan_id; + void *cpu_addr; + dma_addr_t dma_addr; +}; + +#define ISM_EVENT_DMB 0 +#define ISM_EVENT_GID 1 +#define ISM_EVENT_SWR 2 + +struct smcd_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + +struct smcd_dev; + +struct smcd_ops { + int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, + u32 vid); + int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*set_vlan_required)(struct smcd_dev *dev); + int (*reset_vlan_required)(struct smcd_dev *dev); + int (*signal_event)(struct smcd_dev *dev, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info); + int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size); +}; + +struct smcd_dev { + const struct smcd_ops *ops; + struct device dev; + void *priv; + u64 local_gid; + struct list_head list; + spinlock_t lock; + struct smc_connection **conn; + 
struct list_head vlan; + struct workqueue_struct *event_wq; +}; + +struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs); +int smcd_register_dev(struct smcd_dev *smcd); +void smcd_unregister_dev(struct smcd_dev *smcd); +void smcd_free_dev(struct smcd_dev *smcd); +void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event); +void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit); #endif /* _SMC_H */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 188104654b54..4df96b4b8130 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..8ce48799cf68 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -475,8 +475,8 @@ static int smc_connect_rdma(struct smc_sock *smc, int reason_code = 0; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, - aclc->hdr.flag); + local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, + ibport, &aclc->lcl, NULL, 0); if (local_contact < 0) { if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -491,7 +491,7 @@ static int smc_connect_rdma(struct smc_sock *smc, smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - if (smc_buf_create(smc)) + if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) @@ -894,7 +894,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, int *local_contact) { /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + *local_contact = 
smc_conn_create(new_smc, false, 0, ibdev, ibport, + &pclc->lcl, NULL, 0); if (*local_contact < 0) { if (*local_contact == -ENOMEM) return SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -902,7 +903,7 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, } /* create send buffer and rmb */ - if (smc_buf_create(new_smc)) + if (smc_buf_create(new_smc, false)) return SMC_CLC_DECL_MEM; return 0; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index add82b0266f3..daa88db1841a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,6 +25,7 @@ #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" +#include "smc_ism.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -46,8 +47,8 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ mod_delayed_work(system_wq, &lgr->free_work, - lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : - SMC_LGR_FREE_DELAY_SERV); + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } /* Register connection's alert token in our lookup structure. 
@@ -153,16 +154,18 @@ static void smc_lgr_free_work(struct work_struct *work) free: spin_unlock_bh(&smc_lgr_list.lock); if (!delayed_work_pending(&lgr->free_work)) { - if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); smc_lgr_free(lgr); } } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, +static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id) + char *peer_systemid, unsigned short vlan_id, + struct smcd_dev *smcismdev, u64 peer_gid) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -170,17 +173,23 @@ static int smc_lgr_create(struct smc_sock *smc, int rc = 0; int i; + if (is_smcd && vlan_id) { + rc = smc_ism_get_vlan(smcismdev, vlan_id); + if (rc) + goto out; + } + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = -ENOMEM; goto out; } - lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + lgr->is_smcd = is_smcd; lgr->sync_err = 0; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); + rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); @@ -189,36 +198,44 @@ static int smc_lgr_create(struct smc_sock *smc, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; - + if (is_smcd) { + /* SMC-D specific settings */ + lgr->peer_gid = peer_gid; + lgr->smcd = smcismdev; + } else { + /* SMC-R specific settings */ + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* initialize link */ + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = SMC_SINGLE_LINK; + lnk->smcibdev = smcibdev; + lnk->ibport = ibport; + lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; + if (!smcibdev->initialized) + smc_ib_setup_per_ibdev(smcibdev); + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_llc_link_init(lnk); + if (rc) + goto free_lgr; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + } smc->conn.lgr = lgr; - rwlock_init(&lgr->conns_lock); spin_lock_bh(&smc_lgr_list.lock); list_add(&lgr->list, &smc_lgr_list.list); spin_unlock_bh(&smc_lgr_list.lock); @@ -264,7 +281,10 @@ void smc_conn_free(struct smc_connection *conn) { if (!conn->lgr) return; - smc_cdc_tx_dismiss_slots(conn); + if (conn->lgr->is_smcd) + smc_ism_unset_conn(conn); + else + smc_cdc_tx_dismiss_slots(conn); smc_lgr_unregister_conn(conn); smc_buf_unuse(conn); } @@ -280,8 +300,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, - struct smc_buf_desc *buf_desc) +static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; @@ -301,6 +321,25 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, kfree(buf_desc); } +static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, + struct smc_buf_desc *buf_desc) +{ + if (is_dmb) + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + else + kfree(buf_desc->cpu_addr); + kfree(buf_desc); +} + +static void 
smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + if (lgr->is_smcd) + smcd_buf_free(lgr, is_rmb, buf_desc); + else + smcr_buf_free(lgr, is_rmb, buf_desc); +} + static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; @@ -332,7 +371,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + if (lgr->is_smcd) + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + else + smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } @@ -357,7 +399,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) lgr->terminating = 1; if (!list_empty(&lgr->list)) /* forget lgr */ list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -374,7 +417,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); + if (!lgr->is_smcd) + wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); smc_lgr_schedule_free_work(lgr); } @@ -392,13 +436,40 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) __smc_lgr_terminate(lgr); } spin_unlock_bh(&smc_lgr_list.lock); } +/* Called when SMC-D device is terminated or peer is lost */ +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) +{ + struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); + + /* run common cleanup function and build free list */ + 
spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->is_smcd && lgr->smcd == dev && + (!peer_gid || lgr->peer_gid == peer_gid) && + !list_empty(&lgr->list)) { + __smc_lgr_terminate(lgr); + list_move(&lgr->list, &lgr_free_list); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + /* cancel the regular free workers and actually free lgrs */ + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); + } +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ @@ -477,10 +548,30 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) return -ENODEV; } +static bool smcr_lgr_match(struct smc_link_group *lgr, + struct smc_clc_msg_local *lcl, + enum smc_lgr_role role) +{ + return !memcmp(lgr->peer_systemid, lcl->id_for_peer, + SMC_SYSTEMID_LEN) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, + SMC_GID_SIZE) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, + sizeof(lcl->mac)) && + lgr->role == role; +} + +static bool smcd_lgr_match(struct smc_link_group *lgr, + struct smcd_dev *smcismdev, u64 peer_gid) +{ + return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; +} + /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact) + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid) { struct smc_connection *conn = &smc->conn; int local_contact = SMC_FIRST_CONTACT; @@ -502,17 +593,12 @@ int smc_conn_create(struct smc_sock *smc, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, - 
SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && + if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : + smcr_lgr_match(lgr, lcl, role)) && !lgr->sync_err && - (lgr->role == role) && - (lgr->vlan_id == vlan_id) && - ((role == SMC_CLNT) || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { + lgr->vlan_id == vlan_id && + (role == SMC_CLNT || + lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; @@ -535,12 +621,13 @@ int smc_conn_create(struct smc_sock *smc, create: if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, smcibdev, ibport, - lcl->id_for_peer, vlan_id); + rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, + lcl->id_for_peer, vlan_id, smcd, peer_gid); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ - rc = smc_link_determine_gid(conn->lgr); + if (!is_smcd) + rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; @@ -609,8 +696,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) +static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; struct smc_link *lnk; @@ -668,7 +755,43 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, return buf_desc; } -static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) +#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 
6 -> 1MB */ + +static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, + bool is_dmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + int rc; + + if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) + return ERR_PTR(-EAGAIN); + + /* try to alloc a new DMB */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + if (is_dmb) { + rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); + if (rc) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + memset(buf_desc->cpu_addr, 0, bufsize); + buf_desc->len = bufsize; + } else { + buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!buf_desc->cpu_addr) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->len = bufsize; + } + return buf_desc; +} + +static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; @@ -706,7 +829,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) break; /* found reusable slot */ } - buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize); + if (is_smcd) + buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); + else + buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) @@ -728,6 +855,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); + if (is_smcd) + smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; @@ -740,6 +869,8 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); 
} @@ -748,6 +879,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -756,6 +889,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -764,6 +899,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -774,16 +911,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ -int smc_buf_create(struct smc_sock *smc) +int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ - rc = __smc_buf_create(smc, false); + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; /* create rmb */ - rc = __smc_buf_create(smc, true); + rc = __smc_buf_create(smc, is_smcd, true); if (rc) smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; @@ -865,7 +1002,8 @@ void smc_core_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 93cb3523bf50..cd9268a9570e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -124,15 +124,28 @@ struct 
smc_buf_desc { void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ u32 used; /* currently used / unused */ u8 reused : 1; /* new created / reused */ u8 regerr : 1; /* err during registration */ + union { + struct { /* SMC-R */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + }; + struct { /* SMC-D */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ + }; + }; }; struct smc_rtoken { /* address/key of remote RMB */ @@ -148,12 +161,10 @@ struct smc_rtoken { /* address/key of remote RMB */ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ +struct smcd_dev; + struct smc_link_group { struct list_head list; - enum smc_lgr_role role; /* client or server */ - struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ - char peer_systemid[SMC_SYSTEMID_LEN]; - /* unique system_id of peer */ struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ @@ -163,17 +174,35 @@ struct smc_link_group { rwlock_t sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ rwlock_t rmbs_lock; /* protects rx buffers */ - struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] - [SMC_LINKS_PER_LGR_MAX]; - /* remote addr/key pairs */ - unsigned long rtokens_used_mask[BITS_TO_LONGS( - SMC_RMBS_PER_LGR_MAX)]; - /* used rtoken elements */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an 
lgr */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + + bool is_smcd; /* SMC-R or SMC-D */ + union { + struct { /* SMC-R */ + enum smc_lgr_role role; + /* client or server */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; + /* smc link */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + unsigned long rtokens_used_mask[BITS_TO_LONGS + (SMC_RMBS_PER_LGR_MAX)]; + /* used rtoken elements */ + }; + struct { /* SMC-D */ + u64 peer_gid; + /* Peer GID (remote) */ + struct smcd_dev *smcd; + /* ISM device for VLAN reg. */ + }; + }; }; /* Find the connection associated with the given alert token in the link group. @@ -217,7 +246,8 @@ void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); -int smc_buf_create(struct smc_sock *smc); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid); +int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); @@ -227,9 +257,12 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); + void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid); +void smcd_conn_free(struct smc_connection *conn); 
void smc_core_exit(void); #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 839354402215..64ce107c24d9 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,7 +136,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr && + if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c new file mode 100644 index 000000000000..ca1ce42fd49f --- /dev/null +++ b/net/smc/smc_ism.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * Functions for ISM device. + * + * Copyright IBM Corp. 2018 + */ + +#include +#include +#include + +#include "smc.h" +#include "smc_core.h" +#include "smc_ism.h" + +struct smcd_dev_list smcd_dev_list = { + .list = LIST_HEAD_INIT(smcd_dev_list.list), + .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock) +}; + +/* Test if an ISM communication is possible. */ +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +{ + return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, + vlan_id); +} + +int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal, + pos->offset, data, len); + + return rc < 0 ? rc : 0; +} + +/* Set a connection using this DMBE. */ +void smc_ism_set_conn(struct smc_connection *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Unset a connection using this DMBE. 
*/ +void smc_ism_unset_conn(struct smc_connection *conn) +{ + unsigned long flags; + + if (!conn->rmb_desc) + return; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Register a VLAN identifier with the ISM device. Use a reference count + * and add a VLAN identifier only when the first DMB using this VLAN is + * registered. + */ +int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *new_vlan, *vlan; + unsigned long flags; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + /* create new vlan entry, in case we need it */ + new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + new_vlan->vlanid = vlanid; + refcount_set(&new_vlan->refcnt, 1); + + /* if there is an existing entry, increase count and return */ + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + refcount_inc(&vlan->refcnt); + kfree(new_vlan); + goto out; + } + } + + /* no existing entry found. + * add new entry to device; might fail, e.g., if HW limit reached + */ + if (smcd->ops->add_vlan_id(smcd, vlanid)) { + kfree(new_vlan); + rc = -EIO; + goto out; + } + list_add_tail(&new_vlan->list, &smcd->vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +/* Unregister a VLAN identifier with the ISM device. Use a reference count + * and remove a VLAN identifier only when the last DMB using this VLAN is + * unregistered. 
+ */ +int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *vlan; + unsigned long flags; + bool found = false; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + if (!refcount_dec_and_test(&vlan->refcnt)) + goto out; + found = true; + break; + } + } + if (!found) { + rc = -ENOENT; + goto out; /* VLAN id not in table */ + } + + /* Found and the last reference just gone */ + if (smcd->ops->del_vlan_id(smcd, vlanid)) + rc = -EIO; + list_del(&vlan->list); + kfree(vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = dmb_desc->token; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.cpu_addr = dmb_desc->cpu_addr; + dmb.dma_addr = dmb_desc->dma_addr; + dmb.dmb_len = dmb_desc->len; + return smcd->ops->unregister_dmb(smcd, &dmb); +} + +int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_len = dmb_len; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.vlan_id = lgr->vlan_id; + dmb.rgid = lgr->peer_gid; + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +struct smc_ism_event_work { + struct work_struct work; + struct smcd_dev *smcd; + struct smcd_event event; +}; + +/* worker for SMC-D events */ +static void smc_ism_event_work(struct work_struct *work) +{ + struct smc_ism_event_work *wrk = + container_of(work, struct smc_ism_event_work, work); + + switch (wrk->event.type) { + case ISM_EVENT_GID: /* 
GID event, token is peer GID */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok); + break; + case ISM_EVENT_DMB: + break; + } + kfree(wrk); +} + +static void smcd_release(struct device *dev) +{ + struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); + + kfree(smcd->conn); + kfree(smcd); +} + +struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) { + kfree(smcd); + return NULL; + } + + smcd->dev.parent = parent; + smcd->dev.release = smcd_release; + device_initialize(&smcd->dev); + dev_set_name(&smcd->dev, name); + smcd->ops = ops; + + spin_lock_init(&smcd->lock); + INIT_LIST_HEAD(&smcd->vlan); + smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", + WQ_MEM_RECLAIM, name); + return smcd; +} +EXPORT_SYMBOL_GPL(smcd_alloc_dev); + +int smcd_register_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_add_tail(&smcd->list, &smcd_dev_list.list); + spin_unlock(&smcd_dev_list.lock); + + return device_add(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_register_dev); + +void smcd_unregister_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_del(&smcd->list); + spin_unlock(&smcd_dev_list.lock); + flush_workqueue(smcd->event_wq); + destroy_workqueue(smcd->event_wq); + smc_smcd_terminate(smcd, 0); + + device_del(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_unregister_dev); + +void smcd_free_dev(struct smcd_dev *smcd) +{ + put_device(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_free_dev); + +/* SMCD Device event handler. Called from ISM device interrupt handler. 
+ * Parameters are smcd device pointer, + * - event->type (0 --> DMB, 1 --> GID), + * - event->code (event code), + * - event->tok (either DMB token when event type 0, or GID when event type 1) + * - event->time (time of day) + * - event->info (debug info). + * + * Context: + * - Function called in IRQ context from ISM device driver event handler. + */ +void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +{ + struct smc_ism_event_work *wrk; + + /* copy event to event work queue, and let it be handled there */ + wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + if (!wrk) + return; + INIT_WORK(&wrk->work, smc_ism_event_work); + wrk->smcd = smcd; + wrk->event = *event; + queue_work(smcd->event_wq, &wrk->work); +} +EXPORT_SYMBOL_GPL(smcd_handle_event); + +/* SMCD Device interrupt handler. Called from ISM device interrupt handler. + * Parameters are smcd device pointer and DMB number. Find the connection and + * schedule the tasklet for this connection. + * + * Context: + * - Function called in IRQ context from ISM device driver IRQ handler. + */ +void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) +{ +} +EXPORT_SYMBOL_GPL(smcd_handle_irq); diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h new file mode 100644 index 000000000000..aee45b860b79 --- /dev/null +++ b/net/smc/smc_ism.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * SMC-D ISM device structure definitions. + * + * Copyright IBM Corp. 
2018 + */ + +#ifndef SMCD_ISM_H +#define SMCD_ISM_H + +#include + +#include "smc.h" + +struct smcd_dev_list { /* List of SMCD devices */ + struct list_head list; + spinlock_t lock; /* Protects list of devices */ +}; + +extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ + +struct smc_ism_vlanid { /* VLAN id set on ISM device */ + struct list_head list; + unsigned short vlanid; /* Vlan id */ + refcount_t refcnt; /* Reference count */ +}; + +struct smc_ism_position { /* ISM device position to write to */ + u64 token; /* Token of DMB */ + u32 offset; /* Offset into DMBE */ + u8 index; /* Index of DMBE */ + u8 signal; /* Generate interrupt on owner side */ +}; + +struct smcd_dev; + +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +void smc_ism_set_conn(struct smc_connection *conn); +void smc_ism_unset_conn(struct smc_connection *conn); +int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, + struct smc_buf_desc *dmb_desc); +int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, + void *data, size_t len); +#endif -- cgit v1.2.3 From 1619f770589a183af56f248de261534b255122de Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:08 +0200 Subject: net/smc: add pnetid support for SMC-D and ISM SMC-D relies on PNETIDs to find usable SMC-D/ISM devices for a SMC connection. This patch adds SMC-D/ISM support to the current PNETID implementation. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. 
Miller --- include/net/smc.h | 1 + net/smc/smc_ism.c | 2 ++ net/smc/smc_pnet.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/smc/smc_pnet.h | 2 ++ 4 files changed, 46 insertions(+) (limited to 'net') diff --git a/include/net/smc.h b/include/net/smc.h index 824a7af8d654..9ef49f8b1002 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -73,6 +73,7 @@ struct smcd_dev { struct smc_connection **conn; struct list_head vlan; struct workqueue_struct *event_wq; + u8 pnetid[SMC_MAX_PNETID_LEN]; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ca1ce42fd49f..f44e4dff244a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -13,6 +13,7 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" +#include "smc_pnet.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -227,6 +228,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; + smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); INIT_LIST_HEAD(&smcd->vlan); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index cdc6e23b6ce1..1b6c066d3495 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -22,6 +22,7 @@ #include "smc_pnet.h" #include "smc_ib.h" +#include "smc_ism.h" static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { @@ -564,6 +565,27 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, spin_unlock(&smc_ib_devices.lock); } +static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, + struct smcd_dev **smcismdev) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smcd_dev *ismdev; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smcd_dev_list.lock); + 
list_for_each_entry(ismdev, &smcd_dev_list.list, list) { + if (!memcmp(ismdev->pnetid, ndev_pnetid, SMC_MAX_PNETID_LEN)) { + *smcismdev = ismdev; + break; + } + } + spin_unlock(&smcd_dev_list.lock); +} + /* Lookup of coupled ib_device via SMC pnet table */ static void smc_pnet_find_roce_by_table(struct net_device *netdev, struct smc_ib_device **smcibdev, @@ -615,3 +637,22 @@ out_rel: out: return; } + +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcismdev = NULL; + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + +out_rel: + dst_release(dst); +out: + return; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index ad4455cde9e7..1e94fd4df7bc 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -17,6 +17,7 @@ #endif struct smc_ib_device; +struct smcd_dev; static inline int smc_pnetid_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid) @@ -33,5 +34,6 @@ void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_ib_device **smcibdev, u8 *ibport); +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); #endif -- cgit v1.2.3 From c758dfddc1b5b1c9b8c64e5e4bb9bf24b74f4a59 Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:09 +0200 Subject: net/smc: add SMC-D support in CLC messages There are two types of SMC: SMC-R and SMC-D. These types are signaled within the CLC messages during the CLC handshake. This patch adds support for and checks of the SMC type. Also, SMC-R and SMC-D need to exchange different information during the CLC handshake. So, this patch extends the current message formats to support the SMC-D header fields. The Proposal message can contain both SMC-R and SMC-D information. 
The Accept and Confirm messages contain either SMC-R or SMC-D information. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. Miller --- net/smc/af_smc.c | 9 +-- net/smc/smc_clc.c | 193 ++++++++++++++++++++++++++++++++++++++---------------- net/smc/smc_clc.h | 81 ++++++++++++++++++----- 3 files changed, 205 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8ce48799cf68..20afa94be8bb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -451,14 +451,14 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, } /* CLC handshake during connect */ -static int smc_connect_clc(struct smc_sock *smc, +static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, struct smc_ib_device *ibdev, u8 ibport) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, ibdev, ibport); + rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, NULL); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -564,7 +564,7 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); /* perform CLC handshake */ - rc = smc_connect_clc(smc, &aclc, ibdev, ibport); + rc = smc_connect_clc(smc, SMC_TYPE_R, &aclc, ibdev, ibport); if (rc) return smc_connect_decline_fallback(smc, rc); @@ -1008,7 +1008,8 @@ static void smc_listen_work(struct work_struct *work) smc_tx_init(new_smc); /* check if RDMA is available */ - if (smc_check_rdma(new_smc, &ibdev, &ibport) || + if ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || + smc_check_rdma(new_smc, &ibdev, &ibport) || smc_listen_rdma_check(new_smc, pclc) || smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, &local_contact) || diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 717449b1da0b..038d70ef7892 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -23,9 +23,15 @@ 
#include "smc_core.h" #include "smc_clc.h" #include "smc_ib.h" +#include "smc_ism.h" + +#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 +#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; +/* eye catcher "SMCD" EBCDIC for CLC messages */ +static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers @@ -38,10 +44,14 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; - if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: + if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && + clcm->path != SMC_TYPE_B) + return false; pclc = (struct smc_clc_msg_proposal *)clcm; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (ntohs(pclc->hdr.length) != @@ -56,10 +66,16 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: + if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D) + return false; clc = (struct smc_clc_msg_accept_confirm *)clcm; - if (ntohs(clc->hdr.length) != sizeof(*clc)) + if ((clcm->path == SMC_TYPE_R && + ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || + (clcm->path == SMC_TYPE_D && + ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) return false; - trl = &clc->trl; + trl = (struct smc_clc_msg_trail *) + ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; @@ -70,7 +86,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) default: return false; } - if (memcmp(trl->eyecatcher, 
SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; } @@ -295,6 +312,9 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || (datlen > buflen) || + (clcm->version != SMC_CLC_V1) || + (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && + clcm->path != SMC_TYPE_B) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -356,17 +376,18 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) } /* send CLC PROPOSAL message across internal TCP socket */ -int smc_clc_send_proposal(struct smc_sock *smc, - struct smc_ib_device *smcibdev, - u8 ibport) +int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, + struct smc_ib_device *ibdev, u8 ibport, + struct smcd_dev *ismdev) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; + struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal pclc; struct smc_clc_msg_trail trl; int len, i, plen, rc; int reason_code = 0; - struct kvec vec[4]; + struct kvec vec[5]; struct msghdr msg; /* retrieve ip prefixes for CLC proposal msg */ @@ -381,18 +402,34 @@ int smc_clc_send_proposal(struct smc_sock *smc, memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.length = htons(plen); pclc.hdr.version = SMC_CLC_V1; /* SMC version */ - memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); - pclc.iparea_offset = htons(0); + pclc.hdr.path = smc_type; + if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) { + /* add SMC-R specifics */ + 
memcpy(pclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&pclc.lcl.gid, &ibdev->gid[ibport - 1], SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + pclc.iparea_offset = htons(0); + } + if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + /* add SMC-D specifics */ + memset(&pclc_smcd, 0, sizeof(pclc_smcd)); + plen += sizeof(pclc_smcd); + pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); + pclc_smcd.gid = ismdev->local_gid; + } + pclc.hdr.length = htons(plen); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = &pclc; vec[i++].iov_len = sizeof(pclc); + if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + vec[i].iov_base = &pclc_smcd; + vec[i++].iov_len = sizeof(pclc_smcd); + } vec[i].iov_base = &pclc_prfx; vec[i++].iov_len = sizeof(pclc_prfx); if (pclc_prfx.ipv6_prefixes_cnt > 0) { @@ -428,35 +465,56 @@ int smc_clc_send_confirm(struct smc_sock *smc) struct kvec vec; int len; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; /* send SMC Confirm CLC msg */ memset(&cclc, 0, sizeof(cclc)); - memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.type = SMC_CLC_CONFIRM; - cclc.hdr.length = htons(sizeof(cclc)); cclc.hdr.version = SMC_CLC_V1; /* SMC version */ - memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); - memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(cclc.qpn, link->roce_qp->qp_num); - cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - cclc.rmbe_alert_token = htonl(conn->alert_token_local); - cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); - cclc.rmbe_size = conn->rmbe_size_short; - cclc.rmb_dma_addr = cpu_to_be64( - (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(cclc.psn, 
link->psn_initial); - - memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + if (smc->conn.lgr->is_smcd) { + /* SMC-D specific settings */ + memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + cclc.hdr.path = SMC_TYPE_D; + cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + cclc.gid = conn->lgr->smcd->local_gid; + cclc.token = conn->rmb_desc->token; + cclc.dmbe_size = conn->rmbe_size_short; + cclc.dmbe_idx = 0; + memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + /* SMC-R specific settings */ + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + cclc.hdr.path = SMC_TYPE_R; + cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(cclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(cclc.qpn, link->roce_qp->qp_num); + cclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_alert_token = htonl(conn->alert_token_local); + cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); + cclc.rmbe_size = conn->rmbe_size_short; + cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + hton24(cclc.psn, link->psn_initial); + memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + } memset(&msg, 0, sizeof(msg)); vec.iov_base = &cclc; - vec.iov_len = sizeof(cclc); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); - if (len < sizeof(cclc)) { + vec.iov_len = ntohs(cclc.hdr.length); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + ntohs(cclc.hdr.length)); + if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; 
smc->sk.sk_err = -reason_code; @@ -479,35 +537,58 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) int rc = 0; int len; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; memset(&aclc, 0, sizeof(aclc)); - memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.type = SMC_CLC_ACCEPT; - aclc.hdr.length = htons(sizeof(aclc)); aclc.hdr.version = SMC_CLC_V1; /* SMC version */ if (srv_first_contact) aclc.hdr.flag = 1; - memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); - memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(aclc.qpn, link->roce_qp->qp_num); - aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ - aclc.rmbe_alert_token = htonl(conn->alert_token_local); - aclc.qp_mtu = link->path_mtu; - aclc.rmbe_size = conn->rmbe_size_short, - aclc.rmb_dma_addr = cpu_to_be64( - (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(aclc.psn, link->psn_initial); - memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + if (new_smc->conn.lgr->is_smcd) { + /* SMC-D specific settings */ + aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + aclc.hdr.path = SMC_TYPE_D; + aclc.gid = conn->lgr->smcd->local_gid; + aclc.token = conn->rmb_desc->token; + aclc.dmbe_size = conn->rmbe_size_short; + aclc.dmbe_idx = 0; + memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + /* SMC-R specific settings */ + aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + aclc.hdr.path = SMC_TYPE_R; + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memcpy(aclc.lcl.id_for_peer, local_systemid, + 
sizeof(local_systemid)); + memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(aclc.qpn, link->roce_qp->qp_num); + aclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_alert_token = htonl(conn->alert_token_local); + aclc.qp_mtu = link->path_mtu; + aclc.rmbe_size = conn->rmbe_size_short, + aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + hton24(aclc.psn, link->psn_initial); + memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + } memset(&msg, 0, sizeof(msg)); vec.iov_base = &aclc; - vec.iov_len = sizeof(aclc); - len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); - if (len < sizeof(aclc)) { + vec.iov_len = ntohs(aclc.hdr.length); + len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, + ntohs(aclc.hdr.length)); + if (len < ntohs(aclc.hdr.length)) { if (len >= 0) new_smc->sk.sk_err = EPROTO; else diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 41ff9ea96139..100e988ad1a8 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -23,6 +23,9 @@ #define SMC_CLC_DECLINE 0x04 #define SMC_CLC_V1 0x1 /* SMC version */ +#define SMC_TYPE_R 0 /* SMC-R only */ +#define SMC_TYPE_D 1 /* SMC-D only */ +#define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. 
wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ #define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ @@ -42,9 +45,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */ #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, flag : 1, - rsvd : 3; + rsvd : 1, + path : 2; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 rsvd : 3, + u8 path : 2, + rsvd : 1, flag : 1, version : 4; #endif @@ -77,6 +82,11 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ } __aligned(4); +struct smc_clc_msg_smcd { /* SMC-D GID information */ + u64 gid; /* ISM GID of requestor */ + u8 res[32]; +}; + struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_hdr hdr; struct smc_clc_msg_local lcl; @@ -94,23 +104,45 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; - struct smc_clc_msg_local lcl; - u8 qpn[3]; /* QP number */ - __be32 rmb_rkey; /* RMB rkey */ - u8 rmbe_idx; /* Index of RMBE in RMB */ - __be32 rmbe_alert_token;/* unique connection id */ + union { + struct { /* SMC-R */ + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 rmbe_idx; /* Index of RMBE in RMB */ + __be32 rmbe_alert_token;/* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) - u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ - qp_mtu : 4; /* QP mtu */ + u8 rmbe_size : 4, /* buf size (compressed) */ + qp_mtu : 4; /* QP mtu */ #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 qp_mtu : 4, - rmbe_size : 4; + u8 qp_mtu : 4, + rmbe_size : 4; #endif - u8 reserved; - __be64 rmb_dma_addr; /* RMB virtual address */ - u8 reserved2; - u8 psn[3]; /* initial packet sequence number */ - struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ + u8 reserved; + __be64 
rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* packet sequence number */ + struct smc_clc_msg_trail smcr_trl; + /* eye catcher "SMCR" EBCDIC */ + } __packed; + struct { /* SMC-D */ + u64 gid; /* Sender GID */ + u64 token; /* DMB token */ + u8 dmbe_idx; /* DMBE index */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 dmbe_size : 4, /* buf size (compressed) */ + reserved3 : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved3 : 4, + dmbe_size : 4; +#endif + u16 reserved4; + u32 linkid; /* Link identifier */ + u32 reserved5[3]; + struct smc_clc_msg_trail smcd_trl; + /* eye catcher "SMCD" EBCDIC */ + } __packed; + }; } __packed; /* format defined in RFC7609 */ struct smc_clc_msg_decline { /* clc decline message */ @@ -129,13 +161,26 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } +/* get SMC-D info from proposal message */ +static inline struct smc_clc_msg_smcd * +smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) +{ + if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) + return NULL; + + return (struct smc_clc_msg_smcd *)(prop + 1); +} + +struct smcd_dev; + int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); -int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, - u8 ibport); +int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, + struct smc_ib_device *smcibdev, u8 ibport, + struct smcd_dev *ismdev); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); -- cgit v1.2.3 From be244f28d22f77d939ba2b973c102ad2b49d3496 Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:10 +0200 Subject: net/smc: add SMC-D support in data transfer The data transfer 
and CDC message headers differ in SMC-R and SMC-D. This patch adds support for the SMC-D data transfer to the existing SMC code. It consists of the following: * SMC-D CDC support * SMC-D tx support * SMC-D rx support The CDC header is stored at the beginning of the receive buffer. Thus, a rx_offset variable is added for the CDC header offset within the buffer (0 for SMC-R). Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. Miller --- net/smc/smc.h | 5 ++ net/smc/smc_cdc.c | 86 +++++++++++++++++++++++- net/smc/smc_cdc.h | 43 +++++++++++- net/smc/smc_core.c | 25 +++++-- net/smc/smc_ism.c | 8 +++ net/smc/smc_rx.c | 2 +- net/smc/smc_tx.c | 193 +++++++++++++++++++++++++++++++++++++++++------------ net/smc/smc_tx.h | 2 + 8 files changed, 308 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 7c86f716a92e..8c6231011779 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -183,6 +183,11 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ + u8 rx_off; /* receive offset: + * 0 for SMC-R, 32 for SMC-D + */ + u64 peer_token; /* SMC-D token of peer */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a7e8d63fc8ae..621d8cca570b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -117,7 +117,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, return rc; } -int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; @@ -130,6 +130,21 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) return smc_cdc_msg_send(conn, wr_buf, pend); } +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + int rc; + + if 
(conn->lgr->is_smcd) { + spin_lock_bh(&conn->send_lock); + rc = smcd_cdc_msg_send(conn); + spin_unlock_bh(&conn->send_lock); + } else { + rc = smcr_cdc_get_slot_and_msg_send(conn); + } + + return rc; +} + static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, unsigned long data) { @@ -157,6 +172,45 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) (unsigned long)conn); } +/* Send a SMC-D CDC header. + * This increments the free space available in our send buffer. + * Also update the confirmed receive buffer with what was sent to the peer. + */ +int smcd_cdc_msg_send(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smcd_cdc_msg cdc; + int rc, diff; + + memset(&cdc, 0, sizeof(cdc)); + cdc.common.type = SMC_CDC_MSG_TYPE; + cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap; + cdc.prod_count = conn->local_tx_ctrl.prod.count; + + cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap; + cdc.cons_count = conn->local_tx_ctrl.cons.count; + cdc.prod_flags = conn->local_tx_ctrl.prod_flags; + cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; + rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); + if (rc) + return rc; + smc_curs_write(&conn->rx_curs_confirmed, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn); + /* Calculate transmitted data and increment free send buffer space */ + diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, + &conn->tx_curs_sent); + /* increased by confirmed number of bytes */ + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_write(&conn->tx_curs_fin, + smc_curs_read(&conn->tx_curs_sent, conn), conn); + + smc_tx_sndbuf_nonfull(smc); + return rc; +} + /********************************* receive ***********************************/ static inline bool smc_cdc_before(u16 seq1, u16 seq2) @@ -178,7 +232,7 @@ static void 
smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, if (!sock_flag(&smc->sk, SOCK_URGINLINE)) /* we'll skip the urgent byte, so don't account for it */ (*diff_prod)--; - base = (char *)conn->rmb_desc->cpu_addr; + base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off; if (conn->urg_curs.count) conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); else @@ -276,6 +330,34 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) sock_put(&smc->sk); /* no free sk in softirq-context */ } +/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ + * handler to indicate update in the DMBE. + * + * Context: + * - tasklet context + */ +static void smcd_cdc_rx_tsklet(unsigned long data) +{ + struct smc_connection *conn = (struct smc_connection *)data; + struct smcd_cdc_msg cdc; + struct smc_sock *smc; + + if (!conn) + return; + + memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc)); + smc = container_of(conn, struct smc_sock, conn); + smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); +} + +/* Initialize receive tasklet. Called from ISM device IRQ handler to start + * receiver side. 
+ */ +void smcd_cdc_rx_init(struct smc_connection *conn) +{ + tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn); +} + /***************************** init, exit, misc ******************************/ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index f60082fee5b8..8fbce4fee3e4 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -50,6 +50,20 @@ struct smc_cdc_msg { u8 reserved[18]; } __packed; /* format defined in RFC7609 */ +/* CDC message for SMC-D */ +struct smcd_cdc_msg { + struct smc_wr_rx_hdr common; /* Type = 0xFE */ + u8 res1[7]; + u16 prod_wrap; + u32 prod_count; + u8 res2[2]; + u16 cons_wrap; + u32 cons_count; + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + u8 res3[8]; +} __packed; + static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || @@ -204,9 +218,9 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, smc_curs_write(local, smc_curs_read(&temp, conn), conn); } -static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, - struct smc_cdc_msg *peer, - struct smc_connection *conn) +static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) { local->common.type = peer->common.type; local->len = peer->len; @@ -218,6 +232,27 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, local->conn_state_flags = peer->conn_state_flags; } +static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smcd_cdc_msg *peer) +{ + local->prod.wrap = peer->prod_wrap; + local->prod.count = peer->prod_count; + local->cons.wrap = peer->cons_wrap; + local->cons.count = peer->cons_count; + local->prod_flags = peer->prod_flags; + local->conn_state_flags = peer->conn_state_flags; +} + +static inline void 
smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + if (conn->lgr->is_smcd) + smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer); + else + smcr_cdc_msg_to_host(local, peer, conn); +} + struct smc_cdc_tx_pend; int smc_cdc_get_free_slot(struct smc_connection *conn, @@ -227,6 +262,8 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); +int smcd_cdc_msg_send(struct smc_connection *conn); int smc_cdc_init(void) __init; +void smcd_cdc_rx_init(struct smc_connection *conn); #endif /* SMC_CDC_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index daa88db1841a..434c028162a4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -281,10 +281,12 @@ void smc_conn_free(struct smc_connection *conn) { if (!conn->lgr) return; - if (conn->lgr->is_smcd) + if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); - else + tasklet_kill(&conn->rx_tsklet); + } else { smc_cdc_tx_dismiss_slots(conn); + } smc_lgr_unregister_conn(conn); smc_buf_unuse(conn); } @@ -324,10 +326,13 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, struct smc_buf_desc *buf_desc) { - if (is_dmb) + if (is_dmb) { + /* restore original buf len */ + buf_desc->len += sizeof(struct smcd_cdc_msg); smc_ism_unregister_dmb(lgr->smcd, buf_desc); - else + } else { kfree(buf_desc->cpu_addr); + } kfree(buf_desc); } @@ -632,6 +637,10 @@ create: conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + if (is_smcd) { + conn->rx_off = sizeof(struct smcd_cdc_msg); + smcd_cdc_rx_init(conn); /* init tasklet for this conn */ + } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif @@ -776,8 +785,9 @@ static 
struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, kfree(buf_desc); return ERR_PTR(-EAGAIN); } - memset(buf_desc->cpu_addr, 0, bufsize); - buf_desc->len = bufsize; + buf_desc->pages = virt_to_page(buf_desc->cpu_addr); + /* CDC header stored in buf. So, pretend it was smaller */ + buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); } else { buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | @@ -854,7 +864,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); - conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); + conn->rmbe_update_limit = + smc_rmb_wnd_update_limit(buf_desc->len); if (is_smcd) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index f44e4dff244a..cfade7fdcc6d 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -302,5 +302,13 @@ EXPORT_SYMBOL_GPL(smcd_handle_event); */ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) { + struct smc_connection *conn = NULL; + unsigned long flags; + + spin_lock_irqsave(&smcd->lock, flags); + conn = smcd->conn[dmbno]; + if (conn) + tasklet_schedule(&conn->rx_tsklet); + spin_unlock_irqrestore(&smcd->lock, flags); } EXPORT_SYMBOL_GPL(smcd_handle_irq); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 3d77b383cccd..b329803c8339 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -305,7 +305,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ - rcvbuf_base = conn->rmb_desc->cpu_addr; + rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; do { /* while (read_remaining) */ if (read_done >= target || (pipe && read_done)) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f82886b7d1d8..142bcb134dd6 100644 
--- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,7 @@ #include "smc.h" #include "smc_wr.h" #include "smc_cdc.h" +#include "smc_ism.h" #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ @@ -250,6 +251,24 @@ out_err: /***************************** sndbuf consumer *******************************/ +/* sndbuf consumer: actual data transfer of one target chunk with ISM write */ +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal) +{ + struct smc_ism_position pos; + int rc; + + memset(&pos, 0, sizeof(pos)); + pos.token = conn->peer_token; + pos.index = conn->peer_rmbe_idx; + pos.offset = conn->tx_off + offset; + pos.signal = signal; + rc = smc_ism_write(conn->lgr->smcd, &pos, data, len); + if (rc) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + return rc; +} + /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_sge sges[]) @@ -297,21 +316,104 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, smc_curs_add(conn->sndbuf_desc->len, sent, len); } +/* SMC-R helper for smc_tx_rdma_writes() */ +static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + dma_addr_t dma_addr = + sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + int src_len_sum = src_len, dst_len_sum = dst_len; + struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; + int sent_count = src_off; + int srcchunk, dstchunk; + int num_sges; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + num_sges = 0; + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + sges[srcchunk].addr = dma_addr + src_off; + sges[srcchunk].length = src_len; + sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; + num_sges++; + + src_off += src_len; + if (src_off >= 
conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); + if (rc) + return rc; + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - + sent_count); + src_len_sum = src_len; + } + return 0; +} + +/* SMC-D helper for smc_tx_rdma_writes() */ +static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + int src_len_sum = src_len, dst_len_sum = dst_len; + int srcchunk, dstchunk; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + void *data = conn->sndbuf_desc->cpu_addr + src_off; + + rc = smcd_tx_ism_write(conn, data, src_len, dst_off + + sizeof(struct smcd_cdc_msg), 0); + if (rc) + return rc; + dst_off += src_len; + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); + src_len_sum = src_len; + } + return 0; +} + /* sndbuf consumer: prepare all necessary 
(src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ static int smc_tx_rdma_writes(struct smc_connection *conn) { - size_t src_off, src_len, dst_off, dst_len; /* current chunk values */ - size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk; + size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; - struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; - struct smc_link_group *lgr = conn->lgr; struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; - struct smc_link *link; - dma_addr_t dma_addr; - int num_sges; int rc; /* source: sndbuf */ @@ -341,7 +443,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) len = min(to_send, rmbespace); /* initialize variables for first iteration of subsequent nested loop */ - link = &lgr->lnk[SMC_SINGLE_LINK]; dst_off = prod.count; if (prod.wrap == cons.wrap) { /* the filled destination area is unwrapped, @@ -358,8 +459,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) */ dst_len = len; } - dst_len_sum = dst_len; - src_off = sent.count; /* dst_len determines the maximum src_len */ if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ @@ -368,38 +467,15 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ src_len = conn->sndbuf_desc->len - sent.count; } - src_len_sum = src_len; - dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); - for (dstchunk = 0; dstchunk < 2; dstchunk++) { - num_sges = 0; - for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sges[srcchunk].addr = dma_addr + src_off; - sges[srcchunk].length = src_len; - sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; - num_sges++; - src_off += src_len; - if (src_off >= conn->sndbuf_desc->len) - src_off -= conn->sndbuf_desc->len; - /* modulo in send ring */ - if (src_len_sum == dst_len) - break; /* either on 1st or 2nd iteration */ 
- /* prepare next (== 2nd) iteration */ - src_len = dst_len - src_len; /* remainder */ - src_len_sum += src_len; - } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); - if (rc) - return rc; - if (dst_len_sum == len) - break; /* either on 1st or 2nd iteration */ - /* prepare next (== 2nd) iteration */ - dst_off = 0; /* modulo offset in RMBE ring buffer */ - dst_len = len - dst_len; /* remainder */ - dst_len_sum += dst_len; - src_len = min_t(int, - dst_len, conn->sndbuf_desc->len - sent.count); - src_len_sum = src_len; - } + + if (conn->lgr->is_smcd) + rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + else + rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + if (rc) + return rc; if (conn->urg_tx_pend && len == to_send) pflags->urg_data_present = 1; @@ -420,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags; struct smc_cdc_tx_pend *pend; @@ -467,6 +543,37 @@ out_unlock: return rc; } +static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + int rc = 0; + + spin_lock_bh(&conn->send_lock); + if (!pflags->urg_data_present) + rc = smc_tx_rdma_writes(conn); + if (!rc) + rc = smcd_cdc_msg_send(conn); + + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + pflags->urg_data_present = 0; + } + spin_unlock_bh(&conn->send_lock); + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + if (conn->lgr->is_smcd) + rc = smcd_tx_sndbuf_nonempty(conn); + else + rc = smcr_tx_sndbuf_nonempty(conn); + + return rc; +} + /* Wakeup sndbuf consumers from process context * since 
there is more data to transmit */ diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 9d2238909fa0..b22bdc5694c4 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -33,5 +33,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal); #endif /* SMC_TX_H */ -- cgit v1.2.3 From 413498440e30bfe381ac99dfc31628a3d8d4382a Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:11 +0200 Subject: net/smc: add SMC-D support in af_smc This patch ties together the previous SMC-D patches. It adds support for SMC-D to the listen and connect functions and, thus, enables SMC-D support in the SMC code. If a connection supports both SMC-R and SMC-D, SMC-D is preferred. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++----- net/smc/smc_core.c | 2 +- net/smc/smc_core.h | 1 + 3 files changed, 200 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 20afa94be8bb..cbbb947dbfcf 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include "smc_cdc.h" #include "smc_core.h" #include "smc_ib.h" +#include "smc_ism.h" #include "smc_pnet.h" #include "smc_tx.h" #include "smc_rx.h" @@ -372,8 +374,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) return 0; } -static void smc_conn_save_peer_info(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm *clc) +static void smcr_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->rmbe_size); @@ -384,6 +386,28 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } +static void smcd_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + int bufsize = smc_uncompress_bufsize(clc->dmbe_size); + + smc->conn.peer_rmbe_idx = clc->dmbe_idx; + smc->conn.peer_token = clc->token; + /* msg header takes up space in the buffer */ + smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; +} + +static void smc_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + if (smc->conn.lgr->is_smcd) + smcd_conn_save_peer_info(smc, clc); + else + smcr_conn_save_peer_info(smc, clc); +} + static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc) { @@ -450,15 +474,51 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, return 
reason_code; } +/* check if there is an ISM device available for this connection. */ +/* called for connect and listen */ +static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +{ + /* Find ISM device with same PNETID as connecting interface */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); + if (!(*ismdev)) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + return 0; +} + +/* Check for VLAN ID and register it on ISM device just for CLC handshake */ +static int smc_connect_ism_vlan_setup(struct smc_sock *smc, + struct smcd_dev *ismdev, + unsigned short vlan_id) +{ + if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + +/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is + * used, the VLAN ID will be registered again during the connection setup. + */ +static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, + struct smcd_dev *ismdev, + unsigned short vlan_id) +{ + if (!is_smcd) + return 0; + if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_ib_device *ibdev, u8 ibport, + struct smcd_dev *ismdev) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, NULL); + rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, ismdev); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -538,11 +598,50 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; } +/* setup for ISM connection of client */ +static int smc_connect_ism(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smcd_dev *ismdev) +{ + int local_contact = SMC_FIRST_CONTACT; + int rc = 0; + + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(smc, true, 
aclc->hdr.flag, NULL, 0, + NULL, ismdev, aclc->gid); + if (local_contact < 0) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0); + + /* Create send and receive buffers */ + if (smc_buf_create(smc, true)) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + + smc_conn_save_peer_info(smc, aclc); + smc_close_init(smc); + smc_rx_init(smc); + smc_tx_init(smc); + + rc = smc_clc_send_confirm(smc); + if (rc) + return smc_connect_abort(smc, rc, local_contact); + mutex_unlock(&smc_create_lgr_pending); + + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return 0; +} + /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { + bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; struct smc_ib_device *ibdev; + struct smcd_dev *ismdev; + unsigned short vlan; + int smc_type; int rc = 0; u8 ibport; @@ -559,20 +658,52 @@ static int __smc_connect(struct smc_sock *smc) if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check if a RDMA device is available; if not, fall back */ - if (smc_check_rdma(smc, &ibdev, &ibport)) + /* check for VLAN ID */ + if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + + /* check if there is an ism device available */ + if (!smc_check_ism(smc, &ismdev) && + !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + /* ISM is supported for this connection */ + ism_supported = true; + smc_type = SMC_TYPE_D; + } + + /* check if there is a rdma device available */ + if (!smc_check_rdma(smc, &ibdev, &ibport)) { + /* RDMA is supported for this connection */ + rdma_supported = true; + if (ism_supported) + smc_type = SMC_TYPE_B; /* both */ + else + smc_type = SMC_TYPE_R; /* only RDMA */ + } + + /* if neither ISM nor RDMA are supported, fallback */ + if (!rdma_supported && !ism_supported) return 
smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); /* perform CLC handshake */ - rc = smc_connect_clc(smc, SMC_TYPE_R, &aclc, ibdev, ibport); - if (rc) + rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, ismdev); + if (rc) { + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); + } - /* connect using rdma */ - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); - if (rc) + /* depending on previous steps, connect using rdma or ism */ + if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) + rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) + rc = smc_connect_ism(smc, &aclc, ismdev); + else + rc = SMC_CLC_DECL_CNFERR; + if (rc) { + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); + } + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return 0; } @@ -909,6 +1040,44 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, return 0; } +/* listen worker: initialize connection and buffers for SMC-D */ +static int smc_listen_ism_init(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smcd_dev *ismdev, + int *local_contact) +{ + struct smc_clc_msg_smcd *pclc_smcd; + + pclc_smcd = smc_get_clc_msg_smcd(pclc); + *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL, + ismdev, pclc_smcd->gid); + if (*local_contact < 0) { + if (*local_contact == -ENOMEM) + return SMC_CLC_DECL_MEM;/* insufficient memory*/ + return SMC_CLC_DECL_INTERR; /* other error */ + } + + /* Check if peer can be reached via ISM device */ + if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, + new_smc->conn.lgr->vlan_id, + new_smc->conn.lgr->smcd)) { + if (*local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + smc_conn_free(&new_smc->conn); + return SMC_CLC_DECL_CNFERR; + } + + /* Create send and receive buffers */ + if (smc_buf_create(new_smc, true)) { + if 
(*local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + smc_conn_free(&new_smc->conn); + return SMC_CLC_DECL_MEM; + } + + return 0; +} + /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { @@ -967,6 +1136,8 @@ static void smc_listen_work(struct work_struct *work) struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; struct smc_ib_device *ibdev; + bool ism_supported = false; + struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; int local_contact = 0; int reason_code = 0; @@ -1007,13 +1178,21 @@ static void smc_listen_work(struct work_struct *work) smc_rx_init(new_smc); smc_tx_init(new_smc); + /* check if ISM is available */ + if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && + !smc_check_ism(new_smc, &ismdev) && + !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { + ism_supported = true; + } + /* check if RDMA is available */ - if ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_check_rdma(new_smc, &ibdev, &ibport) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact)) { + if (!ism_supported && + ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || + smc_check_rdma(new_smc, &ibdev, &ibport) || + smc_listen_rdma_check(new_smc, pclc) || + smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, + &local_contact) || + smc_listen_rdma_reg(new_smc, local_contact))) { /* SMC not supported, decline */ mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); @@ -1038,7 +1217,8 @@ static void smc_listen_work(struct work_struct *work) } /* finish worker */ - smc_listen_rdma_finish(new_smc, &cclc, local_contact); + if (!ism_supported) + smc_listen_rdma_finish(new_smc, &cclc, local_contact); smc_conn_save_peer_info(new_smc, &cclc); 
mutex_unlock(&smc_create_lgr_pending); smc_listen_out_connected(new_smc); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 434c028162a4..66741e61a3b0 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -478,7 +478,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ -static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index cd9268a9570e..8b47e0168fc3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -257,6 +257,7 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); +int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, -- cgit v1.2.3 From 4b1b7d3b30a6d32ac1a1dcede284e76ef8a8542d Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:12 +0200 Subject: net/smc: add SMC-D diag support This patch adds diag support for SMC-D. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. 
Miller --- include/uapi/linux/smc_diag.h | 10 ++++++++++ net/smc/smc_diag.c | 15 +++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 0ae5d4685ba3..92be255e534c 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -35,6 +35,7 @@ enum { SMC_DIAG_CONNINFO, SMC_DIAG_LGRINFO, SMC_DIAG_SHUTDOWN, + SMC_DIAG_DMBINFO, __SMC_DIAG_MAX, }; @@ -83,4 +84,13 @@ struct smc_diag_lgrinfo { struct smc_diag_linkinfo lnk[1]; __u8 role; }; + +struct smcd_diag_dmbinfo { /* SMC-D Socket internals */ + __u32 linkid; /* Link identifier */ + __u64 peer_gid; /* Peer GID */ + __u64 my_gid; /* My GID */ + __u64 token; /* Token of DMB */ + __u64 peer_token; /* Token of remote DMBE */ +}; + #endif /* _UAPI_SMC_DIAG_H_ */ diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 64ce107c24d9..6d83eef1b743 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -156,6 +156,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } + if (smc->conn.lgr && smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && + !list_empty(&smc->conn.lgr->list)) { + struct smc_connection *conn = &smc->conn; + struct smcd_diag_dmbinfo dinfo = { + .linkid = *((u32 *)conn->lgr->id), + .peer_gid = conn->lgr->peer_gid, + .my_gid = conn->lgr->smcd->local_gid, + .token = conn->rmb_desc->token, + .peer_token = conn->peer_token + }; + + if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) + goto errout; + } nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From ef9be755697f1b841c2a219a05df1a72ccd6f471 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Jun 2018 22:25:04 +0200 Subject: tipc: eliminate buffer cloning in function tipc_msg_extract() The function tipc_msg_extract() is using skb_clone() to clone inner messages from a message bundle buffer. 
Although this method is safe, it has an undesired effect that each buffer clone inherits the true-size of the bundling buffer. As a result, the buffer clone almost always ends up with being copied anyway by the message validation function. This makes the cloning into a sub-optimization. In this commit we take the consequence of this realization, and copy each inner message to a separately allocated buffer up front in the extraction function. As a bonus we can now eliminate the two cases where we had to copy re-routed packets that may potentially go out on the wire again. Signed-off-by: Tung Nguyen Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b6c45dccba3d..b61891054709 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -416,26 +416,31 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { - struct tipc_msg *msg; - int imsz, offset; + struct tipc_msg *hdr, *ihdr; + int imsz; *iskb = NULL; if (unlikely(skb_linearize(skb))) goto none; - msg = buf_msg(skb); - offset = msg_hdr_sz(msg) + *pos; - if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE))) + hdr = buf_msg(skb); + if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE))) goto none; - *iskb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!*iskb)) + ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos); + imsz = msg_size(ihdr); + + if ((*pos + imsz) > msg_data_sz(hdr)) goto none; - skb_pull(*iskb, offset); - imsz = msg_size(buf_msg(*iskb)); - skb_trim(*iskb, imsz); + + *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC); + if (!*iskb) + goto none; + + skb_copy_to_linear_data(*iskb, ihdr, imsz); if (unlikely(!tipc_msg_validate(iskb))) goto none; + *pos += align(imsz); return true; none: @@ -531,12 +536,6 @@ bool tipc_msg_reverse(u32 own_node, struct 
sk_buff **skb, int err) msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - if (skb_cloned(_skb) && - pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) - goto exit; - - /* reassign after skb header modifications */ - hdr = buf_msg(_skb); /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); @@ -595,10 +594,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) if (!skb_cloned(skb)) return true; - /* Unclone buffer in case it was bundled */ - if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) - return false; - return true; } -- cgit v1.2.3 From 759f29b62fb9af5274e7f761f9f4cdfa7bb5a1f2 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Jun 2018 22:39:25 +0200 Subject: tipc: optimize function tipc_node_timeout() In single-link usage, the function tipc_node_timeout() still iterates over the whole link array to handle each link. Given that the maximum number of bearers are 3, there are 2 redundant iterations with lock grab/release. Since this function is executing very frequently it makes sense to optimize it. This commit adds conditional checking to exit from the loop if the known number of configured links has already been accessed. Acked-by: Ying Xue Signed-off-by: Tung Nguyen Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/node.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 6a44eb812baf..8972ca1c654c 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -551,21 +551,23 @@ static void tipc_node_timeout(struct timer_list *t) struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; + int remains = n->link_cnt; int bearer_id; int rc = 0; __skb_queue_head_init(&xmitq); - for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); if (le->link) { + spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); + spin_unlock_bh(&le->lock); + remains--; } - spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); if (rc & TIPC_LINK_DOWN_EVT) -- cgit v1.2.3 From 6a939f365bdb03a74b4617bdb4402fc08da088b9 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Fri, 29 Jun 2018 13:23:41 +0200 Subject: tipc: Auto removal of peer down node instance A peer node is considered down if there are no active links (or) lost contact to the node. In current implementation, a peer node instance is deleted either if a) TIPC module is removed (or) b) Application can use a netlink/iproute2 interface to delete a specific down node. Thus, a down node instance lives in the system forever, unless the application explicitly removes it. We fix this by deleting the nodes which are down for a specified amount of time (5 minutes). Existing node supervision timer is used to achieve this. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. 
Miller --- net/tipc/node.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 8972ca1c654c..cfdbaf479fd1 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -45,6 +45,7 @@ #include "netlink.h" #define INVALID_NODE_SIG 0x10000 +#define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down @@ -96,6 +97,7 @@ struct tipc_bclink_entry { * @link_id: local and remote bearer ids of changing link, if any * @publ_list: list of publications * @rcu: rcu struct for tipc_node + * @delete_at: indicates the time for deleting a down node */ struct tipc_node { u32 addr; @@ -121,6 +123,7 @@ struct tipc_node { unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; + unsigned long delete_at; }; /* Node FSM states and events: @@ -160,6 +163,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); +static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; @@ -390,6 +394,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; + n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; @@ -433,11 +438,16 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } -static void tipc_node_delete(struct tipc_node *node) +static void tipc_node_delete_from_list(struct tipc_node *node) { list_del_rcu(&node->list); 
hlist_del_rcu(&node->hash); tipc_node_put(node); +} + +static void tipc_node_delete(struct tipc_node *node) +{ + tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); @@ -544,6 +554,42 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) tipc_node_put(node); } +static void tipc_node_clear_links(struct tipc_node *node) +{ + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + struct tipc_link_entry *le = &node->links[i]; + + if (le->link) { + kfree(le->link); + le->link = NULL; + node->link_cnt--; + } + } +} + +/* tipc_node_cleanup - delete nodes that does not + * have active links for NODE_CLEANUP_AFTER time + */ +static int tipc_node_cleanup(struct tipc_node *peer) +{ + struct tipc_net *tn = tipc_net(peer->net); + bool deleted = false; + + spin_lock_bh(&tn->node_list_lock); + tipc_node_write_lock(peer); + + if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { + tipc_node_clear_links(peer); + tipc_node_delete_from_list(peer); + deleted = true; + } + tipc_node_write_unlock(peer); + spin_unlock_bh(&tn->node_list_lock); + return deleted; +} + /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) @@ -555,6 +601,12 @@ static void tipc_node_timeout(struct timer_list *t) int bearer_id; int rc = 0; + if (!node_is_up(n) && tipc_node_cleanup(n)) { + /*Removing the reference of Timer*/ + tipc_node_put(n); + return; + } + __skb_queue_head_init(&xmitq); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { @@ -1173,6 +1225,7 @@ static void node_lost_contact(struct tipc_node *n, uint i; pr_debug("Lost contact with %x\n", n->addr); + n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); @@ -1742,7 +1795,6 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct tipc_node *peer; u32 addr; int err; - int i; /* We identify the peer by its net */ if 
(!info->attrs[TIPC_NLA_NET]) @@ -1777,15 +1829,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) goto err_out; } - for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link_entry *le = &peer->links[i]; - - if (le->link) { - kfree(le->link); - le->link = NULL; - peer->link_cnt--; - } - } + tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); -- cgit v1.2.3 From a1be5a20f137bdf436bab86c18998229908ce951 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Fri, 29 Jun 2018 13:26:18 +0200 Subject: tipc: extend sock diag for group communication This commit extends the existing TIPC socket diagnostics framework for information related to TIPC group communication. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 14 ++++++++++++++ net/tipc/group.c | 32 ++++++++++++++++++++++++++++++++ net/tipc/group.h | 1 + net/tipc/socket.c | 5 +++++ 4 files changed, 52 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 85c11982c89b..0ebe02ef1a86 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -121,6 +121,7 @@ enum { TIPC_NLA_SOCK_TIPC_STATE, /* u32 */ TIPC_NLA_SOCK_COOKIE, /* u64 */ TIPC_NLA_SOCK_PAD, /* flag */ + TIPC_NLA_SOCK_GROUP, /* nest */ __TIPC_NLA_SOCK_MAX, TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1 @@ -233,6 +234,19 @@ enum { TIPC_NLA_MON_PEER_MAX = __TIPC_NLA_MON_PEER_MAX - 1 }; +/* Nest, socket group info */ +enum { + TIPC_NLA_SOCK_GROUP_ID, /* u32 */ + TIPC_NLA_SOCK_GROUP_OPEN, /* flag */ + TIPC_NLA_SOCK_GROUP_NODE_SCOPE, /* flag */ + TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE, /* flag */ + TIPC_NLA_SOCK_GROUP_INSTANCE, /* u32 */ + TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, /* u32 */ + + __TIPC_NLA_SOCK_GROUP_MAX, + TIPC_NLA_SOCK_GROUP_MAX = __TIPC_NLA_SOCK_GROUP_MAX - 1 +}; + /* Nest, connection info */ enum { 
TIPC_NLA_CON_UNSPEC, diff --git a/net/tipc/group.c b/net/tipc/group.c index d7a7befeddd4..cbe39e8db39c 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -918,3 +918,35 @@ void tipc_group_member_evt(struct tipc_group *grp, } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } + +int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) +{ + struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP); + + if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, + grp->type) || + nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, + grp->instance) || + nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, + grp->bc_snd_nxt)) + goto group_msg_cancel; + + if (grp->scope == TIPC_NODE_SCOPE) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) + goto group_msg_cancel; + + if (grp->scope == TIPC_CLUSTER_SCOPE) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) + goto group_msg_cancel; + + if (*grp->open) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) + goto group_msg_cancel; + + nla_nest_end(skb, group); + return 0; + +group_msg_cancel: + nla_nest_cancel(skb, group); + return -1; +} diff --git a/net/tipc/group.h b/net/tipc/group.h index 5996af6e9f1d..76b4e5a7b39d 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); +int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 14a5d055717d..840dd995f631 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3316,6 +3316,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, goto stat_msg_cancel; nla_nest_end(skb, stat); + + if (tsk->group) + if (tipc_group_fill_sock_diag(tsk->group, skb)) + goto stat_msg_cancel; + nla_nest_end(skb, attrs); return 0; -- cgit 
v1.2.3 From 4e485d06bb8c7811a0d69a811c77befd54b9ab0c Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Sat, 30 Jun 2018 00:45:55 +0530 Subject: strparser: Call skb_unclone conditionally Calling skb_unclone() is expensive as it triggers a memcpy operation. Instead of calling skb_unclone() unconditionally, call it only when skb has a shared frag_list. This improves tls rx throughput significantly. Signed-off-by: Vakul Garg Suggested-by: Boris Pismenny Signed-off-by: David S. Miller --- net/strparser/strparser.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 373836615c57..4f40a90ca016 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -155,11 +155,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* We are going to append to the frags_list of head. * Need to unshare the frag_list. */ - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - return 0; + if (skb_has_frag_list(head)) { + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + return 0; + } } if (unlikely(skb_shinfo(head)->frag_list)) { @@ -216,14 +218,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, memset(stm, 0, sizeof(*stm)); stm->strp.offset = orig_offset + eaten; } else { - /* Unclone since we may be appending to an skb that we + /* Unclone if we are appending to an skb that we * already share a frag_list with.
*/ - err = skb_unclone(skb, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - break; + if (skb_has_frag_list(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + break; + } } stm = _strp_msg(head); -- cgit v1.2.3 From 6d8e85ffe17895d7bc632dfbaa9e2e33b22fe873 Mon Sep 17 00:00:00 2001 From: Nathan Harold Date: Fri, 29 Jun 2018 15:07:10 -0700 Subject: xfrm: Allow Set Mark to be Updated Using UPDSA Allow UPDSA to change "set mark" to permit policy separation of packet routing decisions from SA keying in systems that use mark-based routing. The set mark, used as a routing and firewall mark for outbound packets, is made update-able which allows routing decisions to be handled independently of keying/SA creation. To maintain consistency with other optional attributes, the set mark is only updated if sent with a non-zero value. The per-SA lock and the xfrm_state_lock are taken in that order to avoid a deadlock with xfrm_timer_handler(), which also takes the locks in that order. 
Signed-off-by: Nathan Harold Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index e04a510ec992..c9ffcdfa89f6 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1562,6 +1562,15 @@ out: if (x1->curlft.use_time) xfrm_state_check_expire(x1); + if (x->props.smark.m || x->props.smark.v) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + + x1->props.smark = x->props.smark; + + __xfrm_state_bump_genids(x1); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + err = 0; x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); -- cgit v1.2.3 From 80d19669ecd34423e85ca04f2210b0e42a47cb16 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:41 -0700 Subject: net: Refactor XPS for CPUs and Rx queues Refactor XPS code to support Tx queue selection based on CPU(s) map or Rx queue(s) map. Signed-off-by: Amritha Nambiar Signed-off-by: David S. 
Miller --- include/linux/cpumask.h | 11 ++- include/linux/netdevice.h | 98 ++++++++++++++++++++- net/core/dev.c | 211 ++++++++++++++++++++++++++++++---------------- net/core/net-sysfs.c | 4 +- 4 files changed, 244 insertions(+), 80 deletions(-) (limited to 'net') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bf53d893ad02..57f20a0a7794 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask; #define cpu_active(cpu) ((cpu) == 0) #endif -/* verify cpu argument to cpumask_* operators */ -static inline unsigned int cpumask_check(unsigned int cpu) +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS - WARN_ON_ONCE(cpu >= nr_cpumask_bits); + WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +} + +/* verify cpu argument to cpumask_* operators */ +static inline unsigned int cpumask_check(unsigned int cpu) +{ + cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6b377a15869..8bf8d6149f79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -731,10 +731,15 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map __rcu *cpu_map[0]; + struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */ }; -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ + +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) + +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ + (_rxqs * (_tcs) * sizeof(struct xps_map *))) + #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 @@ -1910,7 +1915,8 @@ struct net_device { int watchdog_timeo; #ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps; + struct xps_dev_maps __rcu *xps_cpus_map; + struct xps_dev_maps __rcu *xps_rxqs_map; #endif #ifdef 
CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; @@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map); + +/** + * netif_attr_test_mask - Test a CPU or Rx queue set in a mask + * @j: CPU/Rx queue index + * @mask: bitmask of all cpus/rx queues + * @nr_bits: number of bits in the bitmask + * + * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. + */ +static inline bool netif_attr_test_mask(unsigned long j, + const unsigned long *mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + return test_bit(j, mask); +} + +/** + * netif_attr_test_online - Test for online CPU/Rx queue + * @j: CPU/Rx queue index + * @online_mask: bitmask for CPUs/Rx queues that are online + * @nr_bits: number of bits in the bitmask + * + * Returns true if a CPU/Rx queue is online. + */ +static inline bool netif_attr_test_online(unsigned long j, + const unsigned long *online_mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + + if (online_mask) + return test_bit(j, online_mask); + + return (j < nr_bits); +} + +/** + * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask + * @n: CPU/Rx queue index + * @srcp: the cpumask/Rx queue mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set. + */ +static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. 
*/ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (srcp) + return find_next_bit(srcp, nr_bits, n + 1); + + return n + 1; +} + +/** + * netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p + * @n: CPU/Rx queue index + * @src1p: the first CPUs/Rx queues mask pointer + * @src2p: the second CPUs/Rx queues mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set in both. + */ +static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, + const unsigned long *src2p, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (src1p && src2p) + return find_next_and_bit(src1p, src2p, nr_bits, n + 1); + else if (src1p) + return find_next_bit(src1p, nr_bits, n + 1); + else if (src2p) + return find_next_bit(src2p, nr_bits, n + 1); + + return n + 1; +} #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/net/core/dev.c b/net/core/dev.c index dffed642e686..71059558dc39 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } - RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } @@ -2135,31 +2135,58 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, + struct xps_dev_maps *dev_maps, unsigned int nr_ids, + u16 offset, u16 count, bool is_rxqs_map) +{ + bool active = false; + int i, j; + + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), + j < nr_ids;) + active |= 
remove_xps_queue_cpu(dev, dev_maps, j, offset, + count); + if (!active) { + if (is_rxqs_map) { + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + kfree_rcu(dev_maps, rcu); + } +} + static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { + const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - int cpu, i; - bool active = false; + unsigned int nr_ids; mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - if (!dev_maps) - goto out_no_maps; - - for_each_possible_cpu(cpu) - active |= remove_xps_queue_cpu(dev, dev_maps, cpu, - offset, count); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + if (dev_maps) { + nr_ids = dev->num_rx_queues; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, + count, true); - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); } - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + dev_maps = xmap_dereference(dev->xps_cpus_map); + if (!dev_maps) + goto out_no_maps; + + if (num_possible_cpus() > 1) + possible_mask = cpumask_bits(cpu_possible_mask); + nr_ids = nr_cpu_ids; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, + false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2170,8 +2197,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -2183,7 +2210,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, 
return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; @@ -2191,9 +2218,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -2205,14 +2237,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, - u16 index) +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map) { + const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; - int i, cpu, tci, numa_node_id = -2; + int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; bool active = false; + unsigned int nr_ids; if (dev->num_tc) { num_tc = dev->num_tc; @@ -2221,16 +2255,27 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -EINVAL; } - maps_sz = XPS_DEV_MAPS_SIZE(num_tc); - if (maps_sz < L1_CACHE_BYTES) - maps_sz = L1_CACHE_BYTES; - mutex_lock(&xps_map_mutex); + if (is_rxqs_map) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) { + online_mask = cpumask_bits(cpu_online_mask); + possible_mask = cpumask_bits(cpu_possible_mask); + } + dev_maps = 
xmap_dereference(dev->xps_cpus_map); + nr_ids = nr_cpu_ids; + } - dev_maps = xmap_dereference(dev->xps_maps); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; /* allocate memory for queue storage */ - for_each_cpu_and(cpu, cpu_online_mask, mask) { + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2238,73 +2283,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - tci = cpu * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : + tci = j * num_tc + tc; + map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* We need to explicitly update tci as prevous loop * could break out early if dev_maps is NULL. 
*/ - tci = cpu * num_tc + tc; + tci = j * num_tc + tc; - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (!is_rxqs_map) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + if (is_rxqs_map) + rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); + else + rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); - map = xmap_dereference(dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j 
< nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } @@ -2317,19 +2370,23 @@ out_no_old_maps: active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (!is_rxqs_map) { + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); + } if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - for (i = tc, tci = cpu * num_tc; i--; tci++) + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = tc, tci = j * num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + if (!netif_attr_test_mask(j, mask, nr_ids) || + !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -2337,7 +2394,10 @@ out_no_new_maps: /* free map if not active */ if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); + if (is_rxqs_map) + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + else + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); kfree_rcu(dev_maps, rcu); } @@ -2347,11 +2407,12 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = 
xmap_dereference(new_dev_maps->attr_map[tci]); map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[tci]) : + xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); @@ -2363,6 +2424,12 @@ error: kfree(new_dev_maps); return -ENOMEM; } + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); +} EXPORT_SYMBOL(netif_set_xps_queue); #endif @@ -3384,7 +3451,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) int queue_index = -1; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; @@ -3393,7 +3460,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) tci += netdev_get_prio_tc_map(dev, skb->priority); } - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb7e80f4ced3..b39987c81d53 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -ENOMEM; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { for_each_possible_cpu(cpu) { int i, tci = cpu * num_tc + tc; struct xps_map *map; - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; -- cgit v1.2.3 From 04157469b7b848f4a9978b63b1ea2ce62ad3a0a3 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:46 -0700 Subject: net: Use static_key for XPS maps Use static_key for XPS maps to reduce the cost of extra map checks, similar to how it is used for RPS and RFS. 
This includes static_key 'xps_needed' for XPS and another for 'xps_rxqs_needed' for XPS using Rx queues map. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- net/core/dev.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 71059558dc39..43b5575e40c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS +struct static_key xps_needed __read_mostly; +EXPORT_SYMBOL(xps_needed); +struct static_key xps_rxqs_needed __read_mostly; +EXPORT_SYMBOL(xps_rxqs_needed); static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) @@ -2168,14 +2172,18 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, struct xps_dev_maps *dev_maps; unsigned int nr_ids; - mutex_lock(&xps_map_mutex); + if (!static_key_false(&xps_needed)) + return; - dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) { - nr_ids = dev->num_rx_queues; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, - count, true); + mutex_lock(&xps_map_mutex); + if (static_key_false(&xps_rxqs_needed)) { + dev_maps = xmap_dereference(dev->xps_rxqs_map); + if (dev_maps) { + nr_ids = dev->num_rx_queues; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, + offset, count, true); + } } dev_maps = xmap_dereference(dev->xps_cpus_map); @@ -2189,6 +2197,10 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, false); out_no_maps: + if (static_key_enabled(&xps_rxqs_needed)) + static_key_slow_dec(&xps_rxqs_needed); + + static_key_slow_dec(&xps_needed); mutex_unlock(&xps_map_mutex); } @@ -2297,6 +2309,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; + 
static_key_slow_inc(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc(&xps_rxqs_needed); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ @@ -3450,6 +3466,9 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) struct xps_map *map; int queue_index = -1; + if (!static_key_false(&xps_needed)) + return -1; + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { -- cgit v1.2.3 From c6345ce7d361dce1b5d02a2181ccb598c27fd7ae Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:57 -0700 Subject: net: Record receive queue number for a connection This patch adds a new field to sock_common 'skc_rx_queue_mapping' which holds the receive queue number for the connection. The Rx queue is marked in tcp_finish_connect() to allow a client app to do SO_INCOMING_NAPI_ID after a connect() call to get the right queue association for a socket. Rx queue is also marked in tcp_conn_request() to allow syn-ack to go on the right tx-queue associated with the queue on which syn is received. Signed-off-by: Amritha Nambiar Signed-off-by: Sridhar Samudrala Signed-off-by: David S. 
Miller --- include/net/busy_poll.h | 1 + include/net/sock.h | 28 ++++++++++++++++++++++++++++ net/core/sock.c | 2 ++ net/ipv4/tcp_input.c | 3 +++ 4 files changed, 34 insertions(+) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c5187438af38..9e36fda652b7 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = skb->napi_id; #endif + sk_rx_queue_set(sk, skb); } /* variant used for unconnected sockets */ diff --git a/include/net/sock.h b/include/net/sock.h index 37b09c84504b..2b097cc89727 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair; * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection + * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings @@ -215,6 +216,9 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; +#ifdef CONFIG_XPS + unsigned short skc_rx_queue_mapping; +#endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; @@ -326,6 +330,9 @@ struct sock { #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping +#ifdef CONFIG_XPS +#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping +#endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end @@ -1702,6 +1709,27 @@ static inline int sk_tx_queue_get(const struct sock *sk) return -1; } +static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +{ 
+#ifdef CONFIG_XPS + if (skb_rx_queue_recorded(skb)) { + u16 rx_queue = skb_get_rx_queue(skb); + + if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING)) + return; + + sk->sk_rx_queue_mapping = rx_queue; + } +#endif +} + +static inline void sk_rx_queue_clear(struct sock *sk) +{ +#ifdef CONFIG_XPS + sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; +#endif +} + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); diff --git a/net/core/sock.c b/net/core/sock.c index bcc41829a16d..dac6d785186b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2818,6 +2818,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0U; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eecd359595fc..a4731995e899 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include #include #include +#include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -5592,6 +5593,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6420,6 +6422,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); -- cgit v1.2.3 From fc9bab24e9c654f62f3d411fc0b041be9e487e9d Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:27:02 -0700 Subject: net: Enable Tx queue selection based on Rx queues This patch adds support to pick Tx queue based 
on the Rx queue(s) map configuration set by the admin through the sysfs attribute for each Tx queue. If the user configuration for receive queue(s) map does not apply, then the Tx queue selection falls back to CPU(s) map based selection and finally to hashing. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++++ net/core/dev.c | 62 +++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 55 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 2b097cc89727..2ed99bfa4595 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1730,6 +1730,16 @@ static inline void sk_rx_queue_clear(struct sock *sk) #endif } +#ifdef CONFIG_XPS +static inline int sk_rx_queue_get(const struct sock *sk) +{ + if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) + return sk->sk_rx_queue_mapping; + + return -1; +} +#endif + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); diff --git a/net/core/dev.c b/net/core/dev.c index 43b5575e40c5..08d58e0debe5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3459,35 +3459,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) } #endif /* CONFIG_NET_EGRESS */ -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + struct xps_map *map; + int queue_index = -1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int 
get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; - struct xps_map *map; + struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(dev->xps_rxqs_map); if (dev_maps) { - unsigned int tci = skb->sender_cpu - 1; + int tci = sk_rx_queue_get(sk); - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tci >= 0 && tci < dev->num_rx_queues) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } - map = rcu_dereference(dev_maps->attr_map[tci]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(dev->xps_cpus_map); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); } } rcu_read_unlock(); -- cgit v1.2.3 From 8af2c06ff4b144064b51b7f688194474123d9c9c Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:27:07 -0700 Subject: net-sysfs: Add interface for Rx queue(s) map per Tx queue Extend transmit queue sysfs attribute to configure Rx queue(s) map per Tx queue. By default no receive queues are configured for the Tx queue. - /sys/class/net/eth0/queues/tx-*/xps_rxqs Signed-off-by: Amritha Nambiar Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b39987c81d53..f25ac5ff48a6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1283,6 +1283,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + unsigned long *mask, index; + int j, len, num_tc = 1, tc = 0; + + index = get_netdev_queue_index(queue); + + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), + j < dev->num_rx_queues;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; + } + } + } +out_no_maps: + rcu_read_unlock(); + + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + kfree(mask); + + return len < PAGE_SIZE ? 
len : -EINVAL; +} + +static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask, index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + kfree(mask); + return err; + } + + err = __netif_set_xps_queue(dev, mask, index, true); + kfree(mask); + return err ? : len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { @@ -1290,6 +1372,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL -- cgit v1.2.3 From d6f19938eb031ee2158272757db33258153ae59c Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 1 Jul 2018 23:31:30 +0800 Subject: net: expose sk wmem in sock_exceed_buf_limit tracepoint Currently trace_sock_exceed_buf_limit() only shows rmem info, but the wmem limit may also be hit. So expose wmem info in this tracepoint as well. Regarding memcg, I think it is better to introduce a new tracepoint (if that is needed), i.e. trace_memcg_limit_hit, rather than showing memcg info in trace_sock_exceed_buf_limit. Signed-off-by: Yafang Shao Signed-off-by: David S.
Miller --- include/trace/events/sock.h | 30 +++++++++++++++++++++++++----- net/core/sock.c | 6 ++++-- 2 files changed, 29 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3176a3931107..a0c4b8a30966 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -35,6 +35,10 @@ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) +#define skmem_kind_names \ + EM(SK_MEM_SEND) \ + EMe(SK_MEM_RECV) + /* enums need to be exported to user space */ #undef EM #undef EMe @@ -44,6 +48,7 @@ family_names inet_protocol_names tcp_state_names +skmem_kind_names #undef EM #undef EMe @@ -59,6 +64,9 @@ tcp_state_names #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) +#define show_skmem_kind_names(val) \ + __print_symbolic(val, skmem_kind_names) + TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), @@ -83,9 +91,9 @@ TRACE_EVENT(sock_rcvqueue_full, TRACE_EVENT(sock_exceed_buf_limit, - TP_PROTO(struct sock *sk, struct proto *prot, long allocated), + TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), - TP_ARGS(sk, prot, allocated), + TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) @@ -93,6 +101,10 @@ TRACE_EVENT(sock_exceed_buf_limit, __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) + __field(int, sysctl_wmem) + __field(int, wmem_alloc) + __field(int, wmem_queued) + __field(int, kind) ), TP_fast_assign( @@ -101,17 +113,25 @@ TRACE_EVENT(sock_exceed_buf_limit, __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); + __entry->sysctl_wmem = sk_get_wmem0(sk, prot); + __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); + __entry->wmem_queued = sk->sk_wmem_queued; + __entry->kind = kind; ), - TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld " - "sysctl_rmem=%d rmem_alloc=%d", + TP_printk("proto:%s 
sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, - __entry->rmem_alloc) + __entry->rmem_alloc, + __entry->sysctl_wmem, + __entry->wmem_alloc, + __entry->wmem_queued, + show_skmem_kind_names(__entry->kind) + ) ); TRACE_EVENT(inet_sock_set_state, diff --git a/net/core/sock.c b/net/core/sock.c index dac6d785186b..8b69ac96a850 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2401,9 +2401,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; long allocated = sk_memory_allocated_add(sk, amt); + bool charged = true; if (mem_cgroup_sockets_enabled && sk->sk_memcg && - !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) goto suppress_allocation; /* Under limit. */ @@ -2461,7 +2462,8 @@ suppress_allocation: return 1; } - trace_sock_exceed_buf_limit(sk, prot, allocated); + if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); -- cgit v1.2.3 From 9868c0b2eb18470a91d6f0f0df318738a50554e2 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Mon, 2 Jul 2018 00:02:02 -0400 Subject: net sched actions: add extack messages in pedit action Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index ab151346d3d4..55bc96b610e8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -144,8 +144,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, int ret = 0, err; int ksize; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) @@ -154,21 +156,27 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; - if (!pattr) + if (!pattr) { + NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; + } parm = nla_data(pattr); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(pattr) < sizeof(*parm) + ksize) + if (nla_len(pattr) < sizeof(*parm) + ksize) { + NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); return -EINVAL; + } keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); if (!tcf_idr_check(tn, parm->index, a, bind)) { - if (!parm->nkeys) + if (!parm->nkeys) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); return -EINVAL; + } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); if (ret) -- cgit v1.2.3 From 69b9e1e07d98b57b972df3c44647ca8795284d39 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:11 +0800 Subject: ipv4: add __ip_queue_xmit() that supports tos param This patch introduces __ip_queue_xmit(), through which the callers can pass tos param into it without having to set inet->tos. For ipv6, ip6_xmit() already allows passing tclass parameter. 
It's needed when some transport protocol doesn't use inet->tos, like sctp's per transport dscp, which will be added in next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/ip.h | 9 ++++++++- net/ipv4/ip_output.c | 9 +++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 0d2281b4b27a..09da79d8ceea 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -148,7 +148,8 @@ void ip_send_check(struct iphdr *ip); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, @@ -174,6 +175,12 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags); +static inline int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, + struct flowi *fl) +{ + return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); +} + static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b3308e9d9762..188cc586e7ff 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = 
sock_net(sk); @@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), + RT_CONN_FLAGS_TOS(sk, tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -478,7 +479,7 @@ packet_routed: skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); - *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else @@ -511,7 +512,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } -EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(__ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { -- cgit v1.2.3 From 8a9c58d28d0f66569737a3295116710ed24573cd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:12 +0800 Subject: sctp: add support for dscp and flowlabel per transport Like some other per transport params, flowlabel and dscp are added in transport, asoc and sctp_sock. By default, transport sets its value from asoc's, and asoc does it from sctp_sock. flowlabel only works for ipv6 transport. Other than that they need to be passed down in sctp_xmit, flow4/6 also needs to set them before looking up route in get_dst. Note that it uses '& 0x100000' to check if flowlabel is set and '& 0x1' (tos 1st bit is unused) to check if dscp is set by users, so that they could be set to 0 by sockopt in next patch. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 7 +++++++ include/net/sctp/structs.h | 9 +++++++++ net/sctp/associola.c | 7 +++++++ net/sctp/ipv6.c | 11 +++++++++-- net/sctp/protocol.c | 16 ++++++++++++---- 5 files changed, 44 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b36c76635f18..83d94341e003 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -801,4 +801,11 @@ struct sctp_strreset_resptsn { __be32 receivers_next_tsn; }; +enum { + SCTP_DSCP_SET_MASK = 0x1, + SCTP_DSCP_VAL_MASK = 0xfc, + SCTP_FLOWLABEL_SET_MASK = 0x100000, + SCTP_FLOWLABEL_VAL_MASK = 0xfffff +}; + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 701a51736fa5..ab869e0d8326 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -193,6 +193,9 @@ struct sctp_sock { /* This is the max_retrans value for new associations. */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* The initial Path MTU to use for new associations. */ __u32 pathmtu; @@ -895,6 +898,9 @@ struct sctp_transport { */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option @@ -1772,6 +1778,9 @@ struct sctp_association { */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* Flag that path mtu update is pending */ __u8 pmtu_pending; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5d5a16204d50..16ecfbc95614 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init( /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; + asoc->flowlabel = sp->flowlabel; + asoc->dscp = sp->dscp; + /* Initialize default path MTU. 
*/ asoc->pathmtu = sp->pathmtu; @@ -647,6 +650,10 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; + if (addr->sa.sa_family == AF_INET6) + peer->flowlabel = asoc->flowlabel; + peer->dscp = asoc->dscp; + /* Enable/disable heartbeat, SACK delay, and path MTU discovery * based on association setting. */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0cd2e764f47f..38102bf7f13e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + __u8 tclass = np->tclass; int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - IP6_ECN_flow_xmit(sk, fl6->flowlabel); + if (transport->dscp & SCTP_DSCP_SET_MASK) + tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + + if (INET_ECN_is_capable(tclass)) + IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; @@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + tclass); rcu_read_unlock(); return res; } @@ -254,6 +259,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; + if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) + fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 67f73d3a1356..e948db29ab53 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, 
union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; + __u8 tos = inet_sk(sk)->tos; + if (t->dscp & SCTP_DSCP_SET_MASK) + tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(fl4, 0x0, sizeof(struct flowi4)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } @@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), + RT_CONN_FLAGS_TOS(asoc->base.sk, tos), daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); @@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct inet_sock *inet = inet_sk(skb->sk); + __u8 dscp = inet->tos; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); + skb->len, &transport->fl.u.ip4.saddr, + &transport->fl.u.ip4.daddr); + + if (transport->dscp & SCTP_DSCP_SET_MASK) + dscp = transport->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? 
IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(&inet->sk, skb, &transport->fl); + return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); } static struct sctp_af sctp_af_inet; -- cgit v1.2.3 From 0b0dce7a36fb9f1a9dd8245ea82d3a268c6943fe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:13 +0800 Subject: sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams spp_ipv6_flowlabel and spp_dscp are added in sctp_paddrparams in this patch so that users could set sctp_sock/asoc/transport dscp and flowlabel with spp_flags SPP_IPV6_FLOWLABEL or SPP_DSCP by SCTP_PEER_ADDR_PARAMS, as described in section 8.1.12 of RFC6458. As said in the last patch, it uses '| 0x100000' or '| 0x1' to mark that flowlabel or dscp is set, so that their values could be set to 0. Note that to guarantee that an old app built with old kernel headers could work on the newer kernel, the param's check in sctp_g/setsockopt_peer_addr_params() is also improved, which follows the way that sctp_g/setsockopt_delayed_ack() or some other sockopts' process that accept two types of params does. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/uapi/linux/sctp.h | 4 ++ net/sctp/socket.c | 177 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 175 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index c02986a284db..b479db5c71d9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -763,6 +763,8 @@ enum sctp_spp_flags { SPP_SACKDELAY_DISABLE = 1<<6, /*Disable SACK*/ SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE, SPP_HB_TIME_IS_ZERO = 1<<7, /* Set HB delay to 0 */ + SPP_IPV6_FLOWLABEL = 1<<8, + SPP_DSCP = 1<<9, }; struct sctp_paddrparams { @@ -773,6 +775,8 @@ struct sctp_paddrparams { __u32 spp_pathmtu; __u32 spp_sackdelay; __u32 spp_flags; + __u32 spp_ipv6_flowlabel; + __u8 spp_dscp; } __attribute__((packed, aligned(4))); /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0e4c8332771a..50b7ef975b42 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2393,6 +2393,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; + * uint32_t spp_ipv6_flowlabel; + * uint8_t spp_dscp; * }; * * spp_assoc_id - (one-to-many style socket) This is filled in the @@ -2472,6 +2474,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. + * + * SPP_IPV6_FLOWLABEL: Setting this flag enables the + * setting of the IPV6 flow label value. The value is + * contained in the spp_ipv6_flowlabel field. + * Upon retrieval, this flag will be set to indicate that + * the spp_ipv6_flowlabel field has a valid value returned. + * If a specific destination address is set (in the + * spp_address field), then the value returned is that of + * the address. 
If just an association is specified (and + * no address), then the association's default flow label + * is returned. If neither an association nor a destination + * is specified, then the socket's default flow label is + * returned. For non-IPv6 sockets, this flag will be left + * cleared. + * + * SPP_DSCP: Setting this flag enables the setting of the + * Differentiated Services Code Point (DSCP) value + * associated with either the association or a specific + * address. The value is obtained in the spp_dscp field. + * Upon retrieval, this flag will be set to indicate that + * the spp_dscp field has a valid value returned. If a + * specific destination address is set when called (in the + * spp_address field), then that specific destination + * address's DSCP value is returned. If just an association + * is specified, then the association's default DSCP is + * returned. If neither an association nor a destination is + * specified, then the socket's default DSCP is returned. + * + * spp_ipv6_flowlabel + * - This field is used in conjunction with the + * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. + * The 20 least significant bits are used for the flow + * label. This setting has precedence over any IPv6-layer + * setting. + * + * spp_dscp - This field is used in conjunction with the SPP_DSCP flag + * and contains the DSCP. The 6 most significant bits are + * used for the DSCP. This setting has precedence over any + * IPv4- or IPv6- layer setting. 
*/ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, struct sctp_transport *trans, @@ -2611,6 +2652,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } } + if (params->spp_flags & SPP_IPV6_FLOWLABEL) { + if (trans && trans->ipaddr.sa.sa_family == AF_INET6) { + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else if (asoc) { + list_for_each_entry(trans, + &asoc->peer.transport_addr_list, + transports) { + if (trans->ipaddr.sa.sa_family != AF_INET6) + continue; + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } + asoc->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) { + sp->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } + } + + if (params->spp_flags & SPP_DSCP) { + if (trans) { + trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + trans->dscp |= SCTP_DSCP_SET_MASK; + } else if (asoc) { + list_for_each_entry(trans, + &asoc->peer.transport_addr_list, + transports) { + trans->dscp = params->spp_dscp & + SCTP_DSCP_VAL_MASK; + trans->dscp |= SCTP_DSCP_SET_MASK; + } + asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + asoc->dscp |= SCTP_DSCP_SET_MASK; + } else { + sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + sp->dscp |= SCTP_DSCP_SET_MASK; + } + } + return 0; } @@ -2625,11 +2711,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, int error; int hb_change, pmtud_change, sackdelay_change; - if (optlen != sizeof(struct sctp_paddrparams)) + if (optlen == sizeof(params)) { + if (copy_from_user(&params, optval, optlen)) + return -EFAULT; + } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4)) { + if (copy_from_user(&params, optval, optlen))
+ return -EFAULT; + if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) + return -EINVAL; + } else { return -EINVAL; - - if (copy_from_user(&params, optval, optlen)) - return -EFAULT; + } /* Validate flags and value parameters. */ hb_change = params.spp_flags & SPP_HB; @@ -5453,6 +5546,45 @@ out: * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. + * + * SPP_IPV6_FLOWLABEL: Setting this flag enables the + * setting of the IPV6 flow label value. The value is + * contained in the spp_ipv6_flowlabel field. + * Upon retrieval, this flag will be set to indicate that + * the spp_ipv6_flowlabel field has a valid value returned. + * If a specific destination address is set (in the + * spp_address field), then the value returned is that of + * the address. If just an association is specified (and + * no address), then the association's default flow label + * is returned. If neither an association nor a destination + * is specified, then the socket's default flow label is + * returned. For non-IPv6 sockets, this flag will be left + * cleared. + * + * SPP_DSCP: Setting this flag enables the setting of the + * Differentiated Services Code Point (DSCP) value + * associated with either the association or a specific + * address. The value is obtained in the spp_dscp field. + * Upon retrieval, this flag will be set to indicate that + * the spp_dscp field has a valid value returned. If a + * specific destination address is set when called (in the + * spp_address field), then that specific destination + * address's DSCP value is returned. If just an association + * is specified, then the association's default DSCP is + * returned. If neither an association nor a destination is + * specified, then the socket's default DSCP is returned. + * + * spp_ipv6_flowlabel + * - This field is used in conjunction with the + * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow + * label. This setting has precedence over any IPv6-layer + * setting. + * + * spp_dscp - This field is used in conjunction with the SPP_DSCP flag + * and contains the DSCP. The 6 most significant bits are + * used for the DSCP. This setting has precedence over any + * IPv4- or IPv6- layer setting. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -5462,9 +5594,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); - if (len < sizeof(struct sctp_paddrparams)) + if (len >= sizeof(params)) + len = sizeof(params); + else if (len >= ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4)) + len = ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4); + else return -EINVAL; - len = sizeof(struct sctp_paddrparams); + if (copy_from_user(&params, optval, len)) return -EFAULT; @@ -5499,6 +5637,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = trans->param_flags; + if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = trans->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (trans->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } else if (asoc) { /* Fetch association values.
*/ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); @@ -5508,6 +5655,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = asoc->param_flags; + if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = asoc->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (asoc->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } else { /* Fetch socket values. */ params.spp_hbinterval = sp->hbinterval; @@ -5517,6 +5673,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = sp->param_flags; + if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = sp->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (sp->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } if (copy_to_user(optval, &params, len)) -- cgit v1.2.3 From 4be4139f7d0dc74e5a0932c7c7ddf0eb65da9e3a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:14 +0800 Subject: sctp: add support for setting flowlabel when adding a transport Struct sockaddr_in6 has the member sin6_flowinfo that includes the ipv6 flowlabel, it should also support for setting flowlabel when adding a transport whose ipaddr is from userspace. Note that addrinfo in sctp_sendmsg is using struct in6_addr for the secondary addrs, which doesn't contain sin6_flowinfo, and it needs to copy sin6_flowinfo from the primary addr. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- net/sctp/associola.c | 12 ++++++++++-- net/sctp/socket.c | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 16ecfbc95614..297d9cf960b9 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -650,8 +650,16 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; - if (addr->sa.sa_family == AF_INET6) - peer->flowlabel = asoc->flowlabel; + if (addr->sa.sa_family == AF_INET6) { + __be32 info = addr->v6.sin6_flowinfo; + + if (info) { + peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK); + peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else { + peer->flowlabel = asoc->flowlabel; + } + } peer->dscp = asoc->dscp; /* Enable/disable heartbeat, SACK delay, and path MTU discovery diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 50b7ef975b42..502c0d7cb105 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1697,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_association *asoc; enum sctp_scope scope; struct cmsghdr *cmsg; + __be32 flowinfo = 0; struct sctp_af *af; int err; @@ -1781,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, if (!cmsgs->addrs_msg) return 0; + if (daddr->sa.sa_family == AF_INET6) + flowinfo = daddr->v6.sin6_flowinfo; + /* sendv addr list parse */ for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { struct sctp_transport *transport; @@ -1813,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, } dlen = sizeof(struct in6_addr); + daddr->v6.sin6_flowinfo = flowinfo; daddr->v6.sin6_family = AF_INET6; daddr->v6.sin6_port = htons(asoc->peer.port); memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); -- cgit v1.2.3 From 0999f021c988770a37edfb266027db9c413901fd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:15 +0800 Subject: sctp: check for ipv6_pinfo 
legal sndflow with flowlabel in sctp_v6_get_dst The transport with illegal flowlabel should not be allowed to send packets. Other transport protocols already denies this. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 38102bf7f13e..fc6c5e4bffa5 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -262,6 +262,15 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); + if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { + struct ip6_flowlabel *flowlabel; + + flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); + if (!flowlabel) + goto out; + fl6_sock_release(flowlabel); + } + pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); if (asoc) -- cgit v1.2.3 From f6ad8c1bcdf014272d08c55b9469536952a0a771 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:12:45 +0100 Subject: net: core: trivial netif_receive_skb_list() entry point Just calls netif_receive_skb() in a loop. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64480a0f2c16..f67258f057ca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3477,6 +3477,7 @@ int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); diff --git a/net/core/dev.c b/net/core/dev.c index 08d58e0debe5..85c456a4b551 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4906,6 +4906,25 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * For now, just calls netif_receive_skb() in a loop, ignoring the + * return value. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + netif_receive_skb(skb); +} +EXPORT_SYMBOL(netif_receive_skb_list); + DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ -- cgit v1.2.3 From 920572b73280a29e3a9f58807a8b90051b19ee60 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:13:11 +0100 Subject: net: core: unwrap skb list receive slightly further Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- include/trace/events/net.h | 7 +++++++ net/core/dev.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 9c886739246a..00aa72ce0e7c 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -223,6 +223,13 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_ARGS(skb) ); +DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, + + TP_PROTO(const struct sk_buff *skb), + + TP_ARGS(skb) +); + DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), diff --git a/net/core/dev.c b/net/core/dev.c index 85c456a4b551..308acfd48139 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4920,8 +4920,10 @@ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb, *next; + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); list_for_each_entry_safe(skb, next, head, list) - netif_receive_skb(skb); + netif_receive_skb_internal(skb); } EXPORT_SYMBOL(netif_receive_skb_list); -- cgit v1.2.3 From 7da517a3bc529dc5399e742688b32cafa2ca5ca0 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:13:24 +0100 Subject: net: core: Another step of skb receive list processing netif_receive_skb_list_internal() now processes a list and hands it on to the next function. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- net/core/dev.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 308acfd48139..1e87361df2ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4843,6 +4843,14 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + __netif_receive_skb(skb); +} + static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4883,6 +4891,50 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } +static void netif_receive_skb_list_internal(struct list_head *head) +{ + struct bpf_prog *xdp_prog = NULL; + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(netdev_tstamp_prequeue, skb); + if (skb_defer_rx_timestamp(skb)) + /* Handled, remove from list */ + list_del(&skb->list); + } + + if (static_branch_unlikely(&generic_xdp_needed_key)) { + preempt_disable(); + rcu_read_lock(); + list_for_each_entry_safe(skb, next, head, list) { + xdp_prog = rcu_dereference(skb->dev->xdp_prog); + if (do_xdp_generic(xdp_prog, skb) != XDP_PASS) + /* Dropped, remove from list */ + list_del(&skb->list); + } + rcu_read_unlock(); + preempt_enable(); + } + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + /* Handled, remove from list */ + list_del(&skb->list); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process @@ -4910,20 +4962,19 @@ 
EXPORT_SYMBOL(netif_receive_skb); * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * - * For now, just calls netif_receive_skb() in a loop, ignoring the - * return value. + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { - struct sk_buff *skb, *next; + struct sk_buff *skb; list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); - list_for_each_entry_safe(skb, next, head, list) - netif_receive_skb_internal(skb); + netif_receive_skb_list_internal(head); } EXPORT_SYMBOL(netif_receive_skb_list); -- cgit v1.2.3 From 4ce0017a373afaaa9ef17614d8fa4f6fde261d18 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:13:40 +0100 Subject: net: core: another layer of lists, around PF_MEMALLOC skb handling First example of a layer splitting the list (rather than merely taking individual packets off it). Involves new list.h function, list_cut_before(), like list_cut_position() but cuts on the other side of the given entry. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- include/linux/list.h | 30 ++++++++++++++++++++++++++++++ net/core/dev.c | 44 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/list.h b/include/linux/list.h index 4b129df4d46b..de04cc5ed536 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head *list, __list_cut_position(list, head, entry); } +/** + * list_cut_before - cut a list into two, before given entry + * @list: a new list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * + * This helper moves the initial part of @head, up to but + * excluding @entry, from @head to @list. You should pass + * in @entry an element you know is on @head. @list should + * be an empty list or a list you do not care about losing + * its data. + * If @entry == @head, all entries on @head are moved to + * @list. 
+ */ +static inline void list_cut_before(struct list_head *list, + struct list_head *head, + struct list_head *entry) +{ + if (head->next == entry) { + INIT_LIST_HEAD(list); + return; + } + list->next = head->next; + list->next->prev = list; + list->prev = entry->prev; + list->prev->next = list; + head->next = entry; + entry->prev = head; +} + static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) diff --git a/net/core/dev.c b/net/core/dev.c index 1e87361df2ab..9aadef976e8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4784,6 +4784,14 @@ int netif_receive_skb_core(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb_core); +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + __netif_receive_skb_core(skb, pfmemalloc); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; @@ -4809,6 +4817,34 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? 
*/ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4843,14 +4879,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) return ret; } -static void __netif_receive_skb_list(struct list_head *head) -{ - struct sk_buff *skb, *next; - - list_for_each_entry_safe(skb, next, head, list) - __netif_receive_skb(skb); -} - static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; -- cgit v1.2.3 From 88eb1944e18c1ba61da538ae9d1732832eb79b9d Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:13:56 +0100 Subject: net: core: propagate SKB lists through packet_type lookup __netif_receive_skb_core() does a depressingly large amount of per-packet work that can't easily be listified, because the another_round looping makes it nontrivial to slice up into smaller functions. Fortunately, most of that work disappears in the fast path: * Hardware devices generally don't have an rx_handler * Unless you're tcpdumping or something, there is usually only one ptype * VLAN processing comes before the protocol ptype lookup, so doesn't force a pt_prev deliver so normally, __netif_receive_skb_core() will run straight through and pass back the one ptype found in ptype_base[hash of skb->protocol]. 
Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 9aadef976e8c..1bc485bb0678 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4608,7 +4608,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return 0; } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, + struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -4738,8 +4739,7 @@ skip_classify: if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; - else - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) @@ -4757,6 +4757,18 @@ out: return ret; } +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + return ret; +} + /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process @@ -4777,19 +4789,63 @@ int netif_receive_skb_core(struct sk_buff *skb) int ret; rcu_read_lock(); - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); -static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) { struct sk_buff *skb, *next; + if (!pt_prev) + return; + if (list_empty(head)) + return; + 
list_for_each_entry_safe(skb, next, head, list) - __netif_receive_skb_core(skb, pfmemalloc); + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. + */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct list_head sublist; + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + pt_curr = pt_prev; + od_curr = orig_dev; + } + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(head, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) @@ -4809,10 +4865,10 @@ static int __netif_receive_skb(struct sk_buff *skb) * context down to all allocation sites. 
*/ noreclaim_flag = memalloc_noreclaim_save(); - ret = __netif_receive_skb_core(skb, true); + ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); return ret; } -- cgit v1.2.3 From 17266ee939849cb095ed7dd9edbec4162172226b Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:14:12 +0100 Subject: net: ipv4: listified version of ip_rcv Also involved adding a way to run a netfilter hook over a list of packets. Rather than attempting to make netfilter know about lists (which would be a major project in itself) we just let it call the regular okfn (in this case ip_rcv_finish()) for any packets it steals, and have it give us back a list of packets it's synchronously accepted (which normally NF_HOOK would automatically call okfn() on, but we want to be able to potentially pass the list to a listified version of okfn().) The netfilter hooks themselves are indirect calls that still happen per- packet (see nf_hook_entry_hookfn()), but again, changing that can be left for future work. There is potential for out-of-order receives if the netfilter hook ends up synchronously stealing packets, as they will be processed before any accepts earlier in the list. However, it was already possible for an asynchronous accept to cause out-of-order receives, so presumably this is considered OK. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 22 +++++++++++++++ include/net/ip.h | 2 ++ net/core/dev.c | 8 +++--- net/ipv4/af_inet.c | 1 + net/ipv4/ip_input.c | 68 ++++++++++++++++++++++++++++++++++++++++++----- 6 files changed, 94 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f67258f057ca..c1ef749b6f9f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2297,6 +2297,9 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + void (*list_func) (struct list_head *, + struct packet_type *, + struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); void *af_packet_priv; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index dd2052f0efb7..5a5e0a2ab2a3 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct return ret; } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); + if (ret != 1) + list_del(&skb->list); + } +} + /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); @@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, return okfn(net, sk, skb); } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + /* nothing to do */ +} + static inline 
int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, diff --git a/include/net/ip.h b/include/net/ip.h index 09da79d8ceea..99d1b835d2aa 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, struct ip_options_rcu *opt); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1bc485bb0678..5e22719ce71d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4806,9 +4806,11 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head, return; if (list_empty(head)) return; - - list_for_each_entry_safe(skb, next, head, list) - pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + if (pt_prev->list_func != NULL) + pt_prev->list_func(head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9263a2c114e0..c716be13d58c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..914240830bdf 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -408,10 +408,9 @@ drop_error: /* * Main IP Receive routine. 
*/ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +420,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +487,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +496,63 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + struct sk_buff *skb, *next; + + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + list_for_each_entry_safe(skb, next, head, list) + ip_rcv_finish(net, NULL, skb); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + 
struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, dev, net); + /* start new sublist */ + curr_dev = dev; + curr_net = net; + } + } + /* dispatch final sublist */ + ip_sublist_rcv(head, curr_dev, curr_net); } -- cgit v1.2.3 From 5fa12739a53d0780265ed9d44d9ec9ba5f9ad00a Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:14:34 +0100 Subject: net: ipv4: listify ip_rcv_finish ip_rcv_finish_core(), if it does not drop, sets skb->dst by either early demux or route lookup. The last step, calling dst_input(skb), is left to the caller; in the listified case, we split to form sublists with a common dst, but then ip_sublist_rcv_finish() just calls dst_input(skb) in a loop. The next step in listification would thus be to add a list_input() method to struct dst_entry. Early demux is an indirect call based on iph->protocol; this is another opportunity for listification which is not taken here (it would require slicing up ip_rcv_finish_core() to allow splitting on protocol changes). Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 914240830bdf..24b9b0210aeb 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,7 +307,8 @@ drop: return true; } -static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); @@ -393,7 +394,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - return dst_input(skb); + return NET_RX_SUCCESS; drop: kfree_skb(skb); @@ -405,6 +406,15 @@ drop_error: goto drop; } +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret = ip_rcv_finish_core(net, sk, skb); + + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; +} + /* * Main IP Receive routine. 
*/ @@ -515,15 +525,47 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, ip_rcv_finish); } -static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, - struct net *net) +static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; + list_for_each_entry_safe(skb, next, head, list) + dst_input(skb); +} + +static void ip_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + curr_dst = dst; + } + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(head); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); - list_for_each_entry_safe(skb, next, head, list) - ip_rcv_finish(net, NULL, skb); + ip_list_rcv_finish(net, NULL, head); } /* Receive a list of IP packets */ -- cgit v1.2.3 From b9f463d6c9849230043123a6335d59ac7fea4d5a Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:14:44 +0100 Subject: net: don't bother calling list RX functions on empty lists Generally the check should be very cheap, as the sk_buff_head is in cache. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 5e22719ce71d..7e6a2f66db5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4887,7 +4887,8 @@ static void __netif_receive_skb_list(struct list_head *head) /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); - __netif_receive_skb_list_core(&sublist, pfmemalloc); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) @@ -4897,7 +4898,8 @@ static void __netif_receive_skb_list(struct list_head *head) } } /* Handle the remaining sublist */ - __netif_receive_skb_list_core(head, pfmemalloc); + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); @@ -5058,6 +5060,8 @@ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; + if (list_empty(head)) + return; list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); netif_receive_skb_list_internal(head); -- cgit v1.2.3 From e7e3728bd776d1d1450212ad266832f1003f833f Mon Sep 17 00:00:00 2001 From: Qiaobin Fu Date: Sun, 1 Jul 2018 15:16:27 -0400 Subject: net:sched: add action inheritdsfield to skbedit The new action inheritdsfield copies the field DS of IPv4 and IPv6 packets into skb->priority. This enables later classification of packets based on the DS field. v5: *Update the drop counter for TC_ACT_SHOT v4: *Not allow setting flags other than the expected ones. *Allow dumping the pure flags. v3: *Use optional flags, so that it won't break old versions of tc. *Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags. 
v2: *Fix the style issue *Move the code from skbmod to skbedit Original idea by Jamal Hadi Salim Signed-off-by: Qiaobin Fu Reviewed-by: Michel Machado Acked-by: Jamal Hadi Salim Reviewed-by: Marcelo Ricardo Leitner Acked-by: Davide Caratti Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_skbedit.h | 2 ++ net/sched/act_skbedit.c | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index fbcfe27a4e6c..6de6071ebed6 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -30,6 +30,7 @@ #define SKBEDIT_F_MARK 0x4 #define SKBEDIT_F_PTYPE 0x8 #define SKBEDIT_F_MASK 0x10 +#define SKBEDIT_F_INHERITDSFIELD 0x20 struct tc_skbedit { tc_gen; @@ -45,6 +46,7 @@ enum { TCA_SKBEDIT_PAD, TCA_SKBEDIT_PTYPE, TCA_SKBEDIT_MASK, + TCA_SKBEDIT_FLAGS, __TCA_SKBEDIT_MAX }; #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6138d1d71900..dfaf5d8028dd 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include #include @@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, if (d->flags & SKBEDIT_F_PRIORITY) skb->priority = d->priority; + if (d->flags & SKBEDIT_F_INHERITDSFIELD) { + int wlen = skb_network_offset(skb); + + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + break; + + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + break; + } + } if (d->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > d->queue_mapping) skb_set_queue_mapping(skb, 
d->queue_mapping); @@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, spin_unlock(&d->tcf_lock); return d->tcf_action; + +err: + d->tcf_qstats.drops++; + spin_unlock(&d->tcf_lock); + return TC_ACT_SHOT; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mask = nla_data(tb[TCA_SKBEDIT_MASK]); } + if (tb[TCA_SKBEDIT_FLAGS] != NULL) { + u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); + + if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) + flags |= SKBEDIT_F_INHERITDSFIELD; + } + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, .action = d->tcf_action, }; struct tcf_t t; + u64 pure_flags = 0; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, if ((d->flags & SKBEDIT_F_MASK) && nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) goto nla_put_failure; + if (d->flags & SKBEDIT_F_INHERITDSFIELD) + pure_flags |= SKBEDIT_F_INHERITDSFIELD; + if (pure_flags != 0 && + nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) + goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) -- cgit v1.2.3 From 30e99ed6dbdde68f5ad23db3a5872c3c247526b6 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 3 Jul 2018 13:45:12 +0000 Subject: net: sched: act_pedit: fix possible memory leak in 
tcf_pedit_init() 'keys_ex' is allocated by tcf_pedit_keys_ex_parse() in tcf_pedit_init(), but not all of the error handling paths free it, which may cause a memory leak. This patch fixes it. Fixes: 71d0ed7079df ("net/act_pedit: Support using offset relative to the conventional network headers") Signed-off-by: Wei Yongjun Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 55bc96b610e8..e43aef28fdac 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -175,32 +175,35 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!tcf_idr_check(tn, parm->index, a, bind)) { if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); - return -EINVAL; + ret = -EINVAL; + goto out_free; } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); if (ret) - return ret; + goto out_free; p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { tcf_idr_release(*a, bind); - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } ret = ACT_P_CREATED; } else { if (bind) - return 0; + goto out_free; tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + if (!ovr) { + ret = -EEXIST; + goto out_free; + } p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } } } @@ -222,6 +225,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +out_free: + kfree(keys_ex); + return ret; + } static void tcf_pedit_cleanup(struct tc_action *a) -- cgit v1.2.3 From c47d8c2f38f805ba541496ddd7d8c3aee59b49d5 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:47 -0700 Subject: net: Clear skb->tstamp
only on the forwarding path This is done in preparation for the upcoming time based transmission patchset. Now that skb->tstamp will be used to hold the packet's txtime, we must ensure that it is being cleared when traversing namespaces. Also, doing that from skb_scrub_packet() before the early return would break our feature when tunnels are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1357f36c8a5e..c4e24ac27464 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4898,7 +4898,6 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -4912,6 +4911,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) ipvs_reset(skb); skb->mark = 0; + skb->tstamp = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); -- cgit v1.2.3 From 80b14dee2bea128928537d61c333f24cb8cbb62f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 3 Jul 2018 15:42:48 -0700 Subject: net: Add a new socket option for a future transmit time. This patch introduces SO_TXTIME. User space enables this option in order to pass a desired future transmit time in a CMSG when calling sendmsg(2). The argument to this socket option is an 8-byte struct provided by the uapi header net_tstamp.h, defined as: struct sock_txtime { clockid_t clockid; u32 flags; }; Note that new fields were added to struct sock by filling a 2-byte hole found in the struct. For that reason, neither the struct size nor the number of cachelines was altered. Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S.
Miller --- arch/alpha/include/uapi/asm/socket.h | 3 +++ arch/ia64/include/uapi/asm/socket.h | 3 +++ arch/mips/include/uapi/asm/socket.h | 3 +++ arch/parisc/include/uapi/asm/socket.h | 3 +++ arch/s390/include/uapi/asm/socket.h | 3 +++ arch/sparc/include/uapi/asm/socket.h | 3 +++ arch/xtensa/include/uapi/asm/socket.h | 3 +++ include/net/sock.h | 10 ++++++++++ include/uapi/asm-generic/socket.h | 3 +++ include/uapi/linux/net_tstamp.h | 15 +++++++++++++++ net/core/sock.c | 35 +++++++++++++++++++++++++++++++++++ 11 files changed, 84 insertions(+) (limited to 'net') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index be14f16149d5..065fb372e355 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -112,4 +112,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 3efba40adc54..c872c4e6bafb 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -114,4 +114,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 49c3d4795963..71370fb3ceef 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -123,4 +123,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1d0fdc3b5d22..061b9cf2a779 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -104,4 +104,7 @@ #define SO_ZEROCOPY 0x4035 +#define SO_TXTIME 0x4036 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h 
b/arch/s390/include/uapi/asm/socket.h index 3510c0fd06f4..39d901476ee5 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -111,4 +111,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index d58520c2e6ff..7ea35e5601b6 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -101,6 +101,9 @@ #define SO_ZEROCOPY 0x003e +#define SO_TXTIME 0x003f +#define SCM_TXTIME SO_TXTIME + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 75a07b8119a9..1de07a7f7680 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -116,4 +116,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 2ed99bfa4595..68347b9821c6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -319,6 +319,9 @@ struct sock_common { * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_rcu: used during RCU grace period + * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) + * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME + * @sk_txtime_unused: unused txtime flags */ struct sock { /* @@ -475,6 +478,11 @@ struct sock { u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; + + u8 sk_clockid; + u8 sk_txtime_deadline_mode : 1, + sk_txtime_unused : 7; + struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -790,6 +798,7 @@ enum sock_flags { SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ + SOCK_TXTIME, }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1585,6 +1594,7 @@ void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { + u64 transmit_time; u32 mark; u16 tsflags; }; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 0ae758c90e54..a12692e5f7a8 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -107,4 +107,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 4fe104b2411f..c9a77c353b98 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -141,4 +141,19 @@ struct scm_ts_pktinfo { __u32 reserved[2]; }; +/* + * SO_TXTIME gets a struct sock_txtime with flags being an integer bit + * field comprised of these values. 
+ */ +enum txtime_flags { + SOF_TXTIME_DEADLINE_MODE = (1 << 0), + + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) +}; + +struct sock_txtime { + clockid_t clockid; /* reference clockid */ + u32 flags; /* flags defined by enum txtime_flags */ +}; + #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 6429982eb976..fe64b839f1b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -91,6 +91,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop); int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { + struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; @@ -1070,6 +1072,24 @@ set_rcvbuf: } break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + } else if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + } else if (copy_from_user(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + } else { + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + } + break; + default: ret = -ENOPROTOOPT; break; @@ -1115,6 +1135,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, u64 val64; struct linger ling; struct timeval tm; + struct sock_txtime txtime; } v; int lv = sizeof(int); @@ -1403,6 +1424,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
@@ -2137,6 +2165,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: -- cgit v1.2.3 From bc969a977880511057053642a81371196303ca01 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:49 -0700 Subject: net: ipv4: Hook into time based transmission Add a transmit_time field to struct inet_cork, then copy the timestamp from the CMSG cookie at ip_setup_cork() so we can safely copy it into the skb later during __ip_make_skb(). For the raw fast path, just perform the copy at raw_send_hdrinc(). Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 1 + net/ipv4/icmp.c | 2 ++ net/ipv4/ip_output.c | 3 +++ net/ipv4/ping.c | 1 + net/ipv4/raw.c | 2 ++ net/ipv4/udp.c | 1 + 6 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 83d5b3c2ac42..314be484c696 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -148,6 +148,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u64 transmit_time; }; struct inet_cork_full { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..937239afd68d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -437,6 +437,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -715,6 +716,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 188cc586e7ff..570e3ebc3974 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1154,6 +1154,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->tos = ipc->tos; cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; return 0; } @@ -1414,6 +1415,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? 
cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1551,6 +1553,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..b47492205507 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -746,6 +746,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..446af7be2b55 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -562,6 +563,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipc.sockc.tsflags = sk->sk_tsflags; + ipc.sockc.transmit_time = 0; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24e116ddae79..5c76ba0666ec 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -930,6 +930,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; getfrag = is_udplite ? 
udplite_getfrag : ip_generic_getfrag; -- cgit v1.2.3 From a818f75e311c23cdac528888c60ae6e43a8958d0 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:50 -0700 Subject: net: ipv6: Hook into time based transmission Add a struct sockcm_cookie parameter to ip6_setup_cork() so we can easily re-use the transmit_time field from struct inet_cork for most paths, by copying the timestamp from the CMSG cookie. This is later copied into the skb during __ip6_make_skb(). For the raw fast path, also pass the sockcm_cookie as a parameter so we can just perform the copy at rawv6_send_hdrinc() directly. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 11 ++++++++--- net/ipv6/raw.c | 7 +++++-- net/ipv6/udp.c | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a14fb4fcdf18..f48af7e62f12 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1158,7 +1158,8 @@ static void ip6_append_data_mtu(unsigned int *mtu, static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, - struct rt6_info *rt, struct flowi6 *fl6) + struct rt6_info *rt, struct flowi6 *fl6, + const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; @@ -1226,6 +1227,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; + cork->base.transmit_time = sockc->transmit_time; + return 0; } @@ -1575,7 +1578,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ err = ip6_setup_cork(sk, &inet->cork, &np->cork, - ipc6, rt, fl6); + ipc6, rt, fl6, sockc); if (err) return err; @@ -1673,6 +1676,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->base.transmit_time; + skb_dst_set(skb, 
dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { @@ -1765,7 +1770,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork->base.opt = NULL; cork->base.dst = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); + err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6, sockc); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index afc307c89d1a..5737c50f16eb 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -620,7 +620,7 @@ out: static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, - unsigned int flags) + unsigned int flags, const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *dstp = NULL; @@ -848,6 +849,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = sk->sk_bound_dev_if; sockc.tsflags = sk->sk_tsflags; + sockc.transmit_time = 0; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); @@ -921,7 +923,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: if (inet->hdrincl) - err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); + err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, + msg->msg_flags, &sockc); else { ipc6.opt = opt; lock_sock(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e6645cae403e..ac6fc6728903 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1148,6 +1148,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.dontfrag = -1; ipc6.gso_size = 
up->gso_size; sockc.tsflags = sk->sk_tsflags; + sockc.transmit_time = 0; /* destination address check */ if (sin6) { -- cgit v1.2.3 From 3d0ba8c03ca9c49ffcb79d989312f123dd1bdc7a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 3 Jul 2018 15:42:51 -0700 Subject: net: packet: Hook into time based transmission. For raw layer-2 packets, copy the desired future transmit time from the CMSG cookie into the skb. Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 57634bc3da74..3428f7739ae9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1951,6 +1951,7 @@ retry: goto out_unlock; } + sockc.transmit_time = 0; sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -1962,6 +1963,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); @@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; @@ -2633,6 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; + sockc.transmit_time = 0; sockc.tsflags = po->sk.sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); @@ -2829,6 +2833,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.transmit_time = 0; sockc.tsflags = sk->sk_tsflags; sockc.mark = sk->sk_mark; if 
(msg->msg_controllen) { @@ -2903,6 +2908,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sockc.mark; + skb->tstamp = sockc.transmit_time; if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); -- cgit v1.2.3 From 860b642b9c33ea4a6ae2f416607b0b98a9d11bb0 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:52 -0700 Subject: net/sched: Allow creating a Qdisc watchdog with other clocks This adds 'qdisc_watchdog_init_clockid()' that allows a clockid to be passed, this allows other time references to be used when scheduling the Qdisc to run. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 2 ++ net/sched/sch_api.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 815b92a23936..2466ea143d01 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -72,6 +72,8 @@ struct qdisc_watchdog { struct Qdisc *qdisc; }; +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 54eca685420f..98541c6399db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = 
qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) -- cgit v1.2.3 From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:53 -0700 Subject: net/sched: Introduce the ETF Qdisc The ETF (Earliest TxTime First) qdisc uses the information added earlier in this series (the socket option SO_TXTIME and the new role of sk_buff->tstamp) to schedule packets transmission based on absolute time. For some workloads, just bandwidth enforcement is not enough, and precise control of the transmission of packets is necessary. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \ clockid CLOCK_TAI In this example, the Qdisc will provide SW best-effort for the control of the transmission time to the network adapter, the time stamp in the socket will be in reference to the clockid CLOCK_TAI and packets will leave the qdisc "delta" (100000) nanoseconds before its transmission time. The ETF qdisc will buffer packets sorted by their txtime. It will drop packets on enqueue() if their skbuff clockid does not match the clock reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped if it expires while being enqueued. The qdisc also supports the SO_TXTIME deadline mode. For this mode, it will dequeue a packet as soon as possible and change the skb timestamp to 'now' during etf_dequeue(). Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid to be configured, but it's been decided that usage of CLOCK_TAI should be enforced until we decide to allow for other clockids to be used. 
The rationale here is that PTP times are usually in the TAI scale, thus no other clocks should be necessary. For now, the qdisc will return EINVAL if any clocks other than CLOCK_TAI are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/uapi/linux/pkt_sched.h | 17 ++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/sch_etf.c | 384 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+) create mode 100644 net/sched/sch_etf.c (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1ef749b6f9f..f06ee8f91e74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -798,6 +798,7 @@ enum tc_setup_type { TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, + TC_SETUP_QDISC_ETF, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bad3c03bcf43..d5e933ce1447 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -937,4 +937,21 @@ enum { #define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a01169fb5325..fcc89706745b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. + + See the top of <file:net/sched/sch_etf.c> for more details.
+ + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 8811d3804878..9a5a7077d217 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c new file mode 100644 index 000000000000..4b7f4903ac17 --- /dev/null +++ b/net/sched/sch_etf.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia + * Vinicius Costa Gomes + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) + +struct etf_sched_data { + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. + * + * * Delta must be a positive integer. 
+ */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. + */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = 
&parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + /* FIXME: Must return error on the socket's error queue */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). + */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. 
*/ + reset_watchdog(sch); + + return skb; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d deadline %s\n", + qopt->delta, qopt->clockid, + DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been 
initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 88cab77162e86e0f6a2b7e4f859c1435c4e24feb Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:54 -0700 Subject: net/sched: Add HW offloading capability to ETF Add infra so etf qdisc supports HW offload of time-based transmission. 
For hw offload, the time sorted list is still used, so packets are dequeued always in order of txtime. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf offload delta 100000 \ clockid CLOCK_REALTIME In this example, the Qdisc will use HW offload for the control of the transmission time through the network adapter. The hrtimer used for packets scheduling inside the qdisc will use the clockid CLOCK_REALTIME as reference and packets leave the Qdisc "delta" (100000) nanoseconds before their transmission time. Because this will be using HW offload and since dynamic clocks are not supported by the hrtimer, the system clock and the PHC clock must be synchronized for this mode to behave as expected. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 5 +++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_etf.c | 71 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2466ea143d01..7dc769e5452b 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -155,4 +155,9 @@ struct tc_cbs_qopt_offload { s32 sendslope; }; +struct tc_etf_qopt_offload { + u8 enable; + s32 queue; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d5e933ce1447..949118461009 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -944,6 +944,7 @@ struct tc_etf_qopt { __s32 clockid; __u32 flags; #define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) }; enum { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 4b7f4903ac17..932a136db568 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -20,8 +20,10 @@ #include #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) 
+#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) struct etf_sched_data { + bool offload; bool deadline_mode; int clockid; int queue; @@ -45,6 +47,9 @@ static inline int validate_input_params(struct tc_etf_qopt *qopt, * * Dynamic clockids are not supported. * * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); @@ -225,6 +230,56 @@ out: return skb; } +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -251,8 +306,9 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_ETF_PARMS]); - pr_debug("delta %d clockid %d deadline %s\n", + pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, + 
OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); err = validate_input_params(qopt, extack); @@ -261,9 +317,16 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. */ q->delta = qopt->delta; q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); switch (q->clockid) { @@ -326,10 +389,13 @@ static void etf_reset(struct Qdisc *sch) static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -344,6 +410,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.delta = q->delta; opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; -- cgit v1.2.3 From 4b15c7075352668d4467ced7594b676707d11cae Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:43:00 -0700 Subject: net/sched: Make etf report drops on error_queue Use the socket error queue for reporting dropped packets if the socket has enabled that feature through the SO_TXTIME API. Packets are dropped either on enqueue() if they aren't accepted by the qdisc or on dequeue() if the system misses their deadline. Those are reported as different errors so applications can react accordingly. Userspace can retrieve the errors through the socket error queue and the corresponding cmsg interfaces. 
A struct sock_extended_err* is used for returning the error data, and the packet's timestamp can be retrieved by adding both ee_data and ee_info fields as e.g.: ((__u64) serr->ee_data << 32) + serr->ee_info This feature is disabled by default and must be explicitly enabled by applications. Enabling it can bring some overhead for the Tx cycles of the application. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- include/uapi/linux/errqueue.h | 4 ++++ include/uapi/linux/net_tstamp.h | 5 ++++- net/core/sock.c | 4 ++++ net/sched/sch_etf.c | 35 +++++++++++++++++++++++++++++++++-- 5 files changed, 47 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 68347b9821c6..e0eac9ef44b5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,7 +481,8 @@ struct sock { u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, - sk_txtime_unused : 7; + sk_txtime_report_errors : 1, + sk_txtime_unused : 6; struct socket *sk_socket; void *sk_user_data; diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..c0151200f7d1 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,12 +20,16 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_TXTIME 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_CODE_TXTIME_INVALID_PARAM 1 +#define SO_EE_CODE_TXTIME_MISSED 2 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index c9a77c353b98..f8f4539f1135 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -147,8 +147,11 @@ struct scm_ts_pktinfo { */ enum txtime_flags { SOF_TXTIME_DEADLINE_MODE = (1 << 
0), + SOF_TXTIME_REPORT_ERRORS = (1 << 1), - SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) + SOF_TXTIME_FLAGS_LAST = SOF_TXTIME_REPORT_ERRORS, + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_FLAGS_LAST - 1) | + SOF_TXTIME_FLAGS_LAST }; struct sock_txtime { diff --git a/net/core/sock.c b/net/core/sock.c index fe64b839f1b2..03fdea5b0f57 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1087,6 +1087,8 @@ set_rcvbuf: sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); } break; @@ -1429,6 +1431,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; break; default: diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 932a136db568..1538d6fa8165 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,32 @@ static void reset_watchdog(struct Qdisc *sch) qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + static int etf_enqueue_timesortedlist(struct sk_buff *nskb, 
struct Qdisc *sch, struct sk_buff **to_free) { @@ -130,8 +157,11 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct rb_node **p = &q->head.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; - if (!is_packet_valid(sch, nskb)) + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); + } while (*p) { struct sk_buff *skb; @@ -174,6 +204,8 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, if (drop) { struct sk_buff *to_free = NULL; + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + qdisc_drop(skb, sch, &to_free); kfree_skb_list(to_free); qdisc_qstats_overlimit(sch); @@ -199,7 +231,6 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) now = q->get_time(); /* Drop if packet has expired while in queue. */ - /* FIXME: Must return error on the socket's error queue */ if (ktime_before(skb->tstamp, now)) { timesortedlist_erase(sch, skb, true); skb = NULL; -- cgit v1.2.3 From a4ca8b7df73c6d78b8b5aa8246a7d794b25c25ce Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 4 Jul 2018 19:23:50 +0100 Subject: net: ipv4: fix drop handling in ip_list_rcv() and ip_list_rcv_finish() Since callees (ip_rcv_core() and ip_rcv_finish_core()) might free or steal the skb, we can't use the list_cut_before() method; we can't even do a list_del(&skb->list) in the drop case, because skb might have already been freed and reused. So instead, take each skb off the source list before processing, and add it to the sublist afterwards if it wasn't freed or stolen. Fixes: 5fa12739a53d net: ipv4: listify ip_rcv_finish Fixes: 17266ee93984 net: ipv4: listified version of ip_rcv Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 24b9b0210aeb..14ba628b2761 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -540,24 +540,27 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb, *next; struct list_head sublist; + INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; + list_del(&skb->list); if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { /* dispatch old sublist */ - list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ + INIT_LIST_HEAD(&sublist); curr_dst = dst; } + list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv_finish(head); + ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, @@ -577,24 +580,27 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct sk_buff *skb, *next; struct list_head sublist; + INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); + list_del(&skb->list); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ - list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) - ip_sublist_rcv(&sublist, dev, net); + ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ + INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } + list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - ip_sublist_rcv(head, curr_dev, curr_net); + ip_sublist_rcv(&sublist, curr_dev, curr_net); } -- cgit v1.2.3 From 6312fe77751f57d4fa2b28abeef84c6a95c28136 Mon Sep 17 00:00:00 2001 From: Li 
RongQing Date: Thu, 5 Jul 2018 14:34:32 +0800 Subject: net: limit each hash list length to MAX_GRO_SKBS After commit 07d78363dcff ("net: Convert NAPI gro list into a small hash table."), there are 8 hash buckets, which allows more flows to be held for merging. But MAX_GRO_SKBS, the total number of skbs held for merging, is still 8, which limits the hash table's performance. Keep MAX_GRO_SKBS at 8, but use it to limit the length of each hash list rather than the total number of held skbs. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++- net/core/dev.c | 56 +++++++++++++++++++---------------------------- 2 files changed, 29 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f06ee8f91e74..b683971e500d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -302,6 +302,11 @@ struct netdev_boot_setup { int __init netdev_boot_setup(char *str); +struct gro_list { + struct list_head list; + int count; +}; + /* * Structure for NAPI scheduling similar to tasklet but with weighting */ @@ -323,7 +328,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct list_head gro_hash[GRO_HASH_BUCKETS]; + struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; diff --git a/net/core/dev.c b/net/core/dev.c index 7e6a2f66db5c..89825c1eccdc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -149,7 +149,6 @@ #include "net-sysfs.h" -/* Instead of increasing this, you should create a hash table. */ #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added.
*/ @@ -5151,9 +5150,10 @@ out: return netif_receive_skb_internal(skb); } -static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head, +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, bool flush_old) { + struct list_head *head = &napi->gro_hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { @@ -5162,22 +5162,20 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *h list_del_init(&skb->list); napi_gro_complete(skb); napi->gro_count--; + napi->gro_hash[index].count--; } } -/* napi->gro_hash contains packets ordered by age. +/* napi->gro_hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct list_head *head = &napi->gro_hash[i]; + u32 i; - __napi_gro_flush_chain(napi, head, flush_old); - } + for (i = 0; i < GRO_HASH_BUCKETS; i++) + __napi_gro_flush_chain(napi, i, flush_old); } EXPORT_SYMBOL(napi_gro_flush); @@ -5189,7 +5187,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)]; + head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5257,27 +5255,13 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } -static void gro_flush_oldest(struct napi_struct *napi) +static void gro_flush_oldest(struct list_head *head) { - struct sk_buff *oldest = NULL; - unsigned long age = jiffies; - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct list_head *head = &napi->gro_hash[i]; - struct sk_buff *skb; - - if (list_empty(head)) - continue; + struct sk_buff *oldest; - skb = list_last_entry(head, struct sk_buff, list); - if (!oldest || 
time_before(NAPI_GRO_CB(skb)->age, age)) { - oldest = skb; - age = NAPI_GRO_CB(skb)->age; - } - } + oldest = list_last_entry(head, struct sk_buff, list); - /* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is + /* We are called with head length >= MAX_GRO_SKBS, so this is * impossible. */ if (WARN_ON_ONCE(!oldest)) @@ -5292,6 +5276,7 @@ static void gro_flush_oldest(struct napi_struct *napi) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5358,6 +5343,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff list_del_init(&pp->list); napi_gro_complete(pp); napi->gro_count--; + napi->gro_hash[hash].count--; } if (same_flow) @@ -5366,10 +5352,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - gro_flush_oldest(napi); + if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { + gro_flush_oldest(gro_head); } else { napi->gro_count++; + napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; @@ -6006,8 +5993,10 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - for (i = 0; i < GRO_HASH_BUCKETS; i++) - INIT_LIST_HEAD(&napi->gro_hash[i]); + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&napi->gro_hash[i].list); + napi->gro_hash[i].count = 0; + } napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -6047,8 +6036,9 @@ static void flush_gro_hash(struct napi_struct *napi) for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; - list_for_each_entry_safe(skb, n, &napi->gro_hash[i], 
list) + list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); + napi->gro_hash[i].count = 0; } } -- cgit v1.2.3 From eabaef1896bc06319461a644e3aa139885454def Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:28 +0300 Subject: devlink: Add devlink_param register and unregister Define configuration parameters data structure. Add functions to register and unregister the driver supported configuration parameters table. For each parameter registered, the driver should fill all the parameter's fields. In case the only supported configuration mode is "driverinit" the parameter's get()/set() functions are not required and should be set to NULL, for any other configuration mode, these functions are required and should be set by the driver. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 85 +++++++++++++++++++++++++ include/uapi/linux/devlink.h | 10 +++ net/core/devlink.c | 148 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index e336ea9c73df..4a0687a1fb99 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -27,6 +27,7 @@ struct devlink { struct list_head sb_list; struct list_head dpipe_table_list; struct list_head resource_list; + struct list_head param_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -295,6 +296,68 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 +#define DEVLINK_PARAM_MAX_STRING_VALUE 32 +enum devlink_param_type { + DEVLINK_PARAM_TYPE_U8, + DEVLINK_PARAM_TYPE_U16, + DEVLINK_PARAM_TYPE_U32, + DEVLINK_PARAM_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL, +}; + +union devlink_param_value { + u8 vu8; + u16 vu16; + u32 vu32; + const char *vstr; + bool vbool; +}; + +struct devlink_param_gset_ctx { + union devlink_param_value val; + enum devlink_param_cmode 
cmode; +}; + +/** + * struct devlink_param - devlink configuration parameter data + * @name: name of the parameter + * @generic: indicates if the parameter is generic or driver specific + * @type: parameter type + * @supported_cmodes: bitmap of supported configuration modes + * @get: get parameter value, used for runtime and permanent + * configuration modes + * @set: set parameter value, used for runtime and permanent + * configuration modes + * + * This struct should be used by the driver to fill the data for + * a parameter it registers. + */ +struct devlink_param { + u32 id; + const char *name; + bool generic; + enum devlink_param_type type; + unsigned long supported_cmodes; + int (*get)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + int (*set)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); +}; + +struct devlink_param_item { + struct list_head list; + const struct devlink_param *param; + union devlink_param_value driverinit_value; + bool driverinit_value_valid; +}; + +enum devlink_param_generic_id { + + /* add new param generic ids above here*/ + __DEVLINK_PARAM_GENERIC_ID_MAX, + DEVLINK_PARAM_GENERIC_ID_MAX = __DEVLINK_PARAM_GENERIC_ID_MAX - 1, +}; + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -430,6 +493,12 @@ void devlink_resource_occ_get_register(struct devlink *devlink, void *occ_get_priv); void devlink_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id); +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count); +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count); #else @@ -622,6 +691,22 @@ devlink_resource_occ_get_unregister(struct devlink *devlink, { } +static inline int +devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, 
+ size_t params_count) +{ + return 0; +} + +static inline void +devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 75cb5450c851..d814fa67c7b9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -142,6 +142,16 @@ enum devlink_port_flavour { */ }; +enum devlink_param_cmode { + DEVLINK_PARAM_CMODE_RUNTIME, + DEVLINK_PARAM_CMODE_DRIVERINIT, + DEVLINK_PARAM_CMODE_PERMANENT, + + /* Add new configuration modes above */ + __DEVLINK_PARAM_CMODE_MAX, + DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1 +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, diff --git a/net/core/devlink.c b/net/core/devlink.c index 22099705cc41..41b1a5d1c992 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2604,6 +2604,82 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static const struct devlink_param devlink_param_generic[] = {}; + +static int devlink_param_generic_verify(const struct devlink_param *param) +{ + /* verify it match generic parameter by id and name */ + if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + if (strcmp(param->name, devlink_param_generic[param->id].name)) + return -ENOENT; + + WARN_ON(param->type != devlink_param_generic[param->id].type); + + return 0; +} + +static int devlink_param_driver_verify(const struct devlink_param *param) +{ + int i; + + if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + /* verify no such name in generic params */ + for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) + if (!strcmp(param->name, devlink_param_generic[i].name)) + return -EEXIST; + + return 0; +} + +static struct devlink_param_item * +devlink_param_find_by_name(struct 
list_head *param_list, + const char *param_name) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (!strcmp(param_item->param->name, param_name)) + return param_item; + return NULL; +} + +static int devlink_param_register_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + if (devlink_param_find_by_name(&devlink->param_list, + param->name)) + return -EEXIST; + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + param_item->param = param; + + list_add_tail(&param_item->list, &devlink->param_list); + return 0; +} + +static void devlink_param_unregister_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_name(&devlink->param_list, + param->name); + WARN_ON(!param_item); + list_del(&param_item->list); + kfree(param_item); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2845,6 +2921,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->param_list); mutex_init(&devlink->lock); return devlink; } @@ -3434,6 +3511,77 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); +/** + * devlink_params_register - register configuration parameters + * + * @devlink: devlink + * @params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the driver. 
+ */ +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + int err; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) { + if (!param || !param->name || !param->supported_cmodes) { + err = -EINVAL; + goto rollback; + } + if (param->generic) { + err = devlink_param_generic_verify(param); + if (err) + goto rollback; + } else { + err = devlink_param_driver_verify(param); + if (err) + goto rollback; + } + err = devlink_param_register_one(devlink, param); + if (err) + goto rollback; + } + + mutex_unlock(&devlink->lock); + return 0; + +rollback: + if (!i) + goto unlock; + for (param--; i > 0; i--, param--) + devlink_param_unregister_one(devlink, param); +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_params_register); + +/** + * devlink_params_unregister - unregister configuration parameters + * @devlink: devlink + * @params: configuration parameters to unregister + * @params_count: number of parameters provided + */ +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister_one(devlink, param); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_params_unregister); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From 45f05def5c44c806f094709f1c9b03dcecdd54f0 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:29 +0300 Subject: devlink: Add param get command Add param get command which gets data per parameter. Option to dump the parameters data per device. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/devlink.h | 11 ++ net/core/devlink.c | 250 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index d814fa67c7b9..2ccfe84176bf 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -78,6 +78,8 @@ enum devlink_command { */ DEVLINK_CMD_RELOAD, + DEVLINK_CMD_PARAM_GET, /* can dump */ + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -248,6 +250,15 @@ enum devlink_attr { DEVLINK_ATTR_PORT_NUMBER, /* u32 */ DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, /* u32 */ + DEVLINK_ATTR_PARAM, /* nested */ + DEVLINK_ATTR_PARAM_NAME, /* string */ + DEVLINK_ATTR_PARAM_GENERIC, /* flag */ + DEVLINK_ATTR_PARAM_TYPE, /* u8 */ + DEVLINK_ATTR_PARAM_VALUES_LIST, /* nested */ + DEVLINK_ATTR_PARAM_VALUE, /* nested */ + DEVLINK_ATTR_PARAM_VALUE_DATA, /* dynamic */ + DEVLINK_ATTR_PARAM_VALUE_CMODE, /* u8 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 41b1a5d1c992..b22d41275f0b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2645,6 +2645,248 @@ devlink_param_find_by_name(struct list_head *param_list, return NULL; } +static bool +devlink_param_cmode_is_supported(const struct devlink_param *param, + enum devlink_param_cmode cmode) +{ + return test_bit(cmode, &param->supported_cmodes); +} + +static int devlink_param_get(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->get) + return -EOPNOTSUPP; + return param->get(devlink, param->id, ctx); +} + +static int +devlink_param_type_to_nla_type(enum devlink_param_type param_type) +{ + switch (param_type) { + case DEVLINK_PARAM_TYPE_U8: + return NLA_U8; + case DEVLINK_PARAM_TYPE_U16: + return NLA_U16; + case DEVLINK_PARAM_TYPE_U32: + return NLA_U32; + case 
DEVLINK_PARAM_TYPE_STRING: + return NLA_STRING; + case DEVLINK_PARAM_TYPE_BOOL: + return NLA_FLAG; + default: + return -EINVAL; + } +} + +static int +devlink_nl_param_value_fill_one(struct sk_buff *msg, + enum devlink_param_type type, + enum devlink_param_cmode cmode, + union devlink_param_value val) +{ + struct nlattr *param_value_attr; + + param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE); + if (!param_value_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) + goto value_nest_cancel; + + switch (type) { + case DEVLINK_PARAM_TYPE_U8: + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U16: + if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U32: + if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, + val.vstr)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_BOOL: + if (val.vbool && + nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) + goto value_nest_cancel; + break; + } + + nla_nest_end(msg, param_value_attr); + return 0; + +value_nest_cancel: + nla_nest_cancel(msg, param_value_attr); +nla_put_failure: + return -EMSGSIZE; +} + +static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + const struct devlink_param *param = param_item->param; + struct devlink_param_gset_ctx ctx; + struct nlattr *param_values_list; + struct nlattr *param_attr; + int nla_type; + void *hdr; + int err; + int i; + + /* Get value from driver part to driverinit configuration mode */ + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if 
(!devlink_param_cmode_is_supported(param, i)) + continue; + if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { + if (!param_item->driverinit_value_valid) + return -EOPNOTSUPP; + param_value[i] = param_item->driverinit_value; + } else { + ctx.cmode = i; + err = devlink_param_get(devlink, param, &ctx); + if (err) + return err; + param_value[i] = ctx.val; + } + } + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM); + if (!param_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) + goto param_nest_cancel; + if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) + goto param_nest_cancel; + + nla_type = devlink_param_type_to_nla_type(param->type); + if (nla_type < 0) + goto param_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + goto param_nest_cancel; + + param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); + if (!param_values_list) + goto param_nest_cancel; + + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if (!devlink_param_cmode_is_supported(param, i)) + continue; + err = devlink_nl_param_value_fill_one(msg, param->type, + i, param_value[i]); + if (err) + goto values_list_nest_cancel; + } + + nla_nest_end(msg, param_values_list); + nla_nest_end(msg, param_attr); + genlmsg_end(msg, hdr); + return 0; + +values_list_nest_cancel: + nla_nest_end(msg, param_values_list); +param_nest_cancel: + nla_nest_cancel(msg, param_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_param_item *param_item; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if 
(!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(param_item, &devlink->param_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_param_fill(msg, devlink, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static struct devlink_param_item * +devlink_param_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *param_name; + + if (!info->attrs[DEVLINK_ATTR_PARAM_NAME]) + return NULL; + + param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); + return devlink_param_find_by_name(&devlink->param_list, param_name); +} + +static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_param_item *param_item; + struct sk_buff *msg; + int err; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_param_fill(msg, devlink, param_item, + DEVLINK_CMD_PARAM_GET, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + static int devlink_param_register_one(struct devlink *devlink, const struct devlink_param *param) { @@ -2883,6 +3125,14 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .doit = devlink_nl_cmd_param_get_doit, + .dumpit = devlink_nl_cmd_param_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + 
}, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From e3b7ca18ad7b2f47ebd3b6e6ce58a42c6ec24746 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:30 +0300 Subject: devlink: Add param set command Add param set command to set value for a parameter. Value can be set to any of the supported configuration modes. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 134 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4a0687a1fb99..88062752dcd7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -328,6 +328,7 @@ struct devlink_param_gset_ctx { * configuration modes * @set: set parameter value, used for runtime and permanent * configuration modes + * @validate: validate input value is applicable (within value range, etc.) * * This struct should be used by the driver to fill the data for * a parameter it registers. 
@@ -342,6 +343,9 @@ struct devlink_param { struct devlink_param_gset_ctx *ctx); int (*set)(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx); + int (*validate)(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); }; struct devlink_param_item { diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 2ccfe84176bf..ea0623e568f0 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -79,6 +79,7 @@ enum devlink_command { DEVLINK_CMD_RELOAD, DEVLINK_CMD_PARAM_GET, /* can dump */ + DEVLINK_CMD_PARAM_SET, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index b22d41275f0b..0cd7a42dcec2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2661,6 +2661,15 @@ static int devlink_param_get(struct devlink *devlink, return param->get(devlink, param->id, ctx); } +static int devlink_param_set(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->set) + return -EOPNOTSUPP; + return param->set(devlink, param->id, ctx); +} + static int devlink_param_type_to_nla_type(enum devlink_param_type param_type) { @@ -2847,6 +2856,69 @@ out: return msg->len; } +static int +devlink_param_type_get_from_info(struct genl_info *info, + enum devlink_param_type *param_type) +{ + if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE]) + return -EINVAL; + + switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { + case NLA_U8: + *param_type = DEVLINK_PARAM_TYPE_U8; + break; + case NLA_U16: + *param_type = DEVLINK_PARAM_TYPE_U16; + break; + case NLA_U32: + *param_type = DEVLINK_PARAM_TYPE_U32; + break; + case NLA_STRING: + *param_type = DEVLINK_PARAM_TYPE_STRING; + break; + case NLA_FLAG: + *param_type = DEVLINK_PARAM_TYPE_BOOL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +devlink_param_value_get_from_info(const struct devlink_param *param, + 
struct genl_info *info, + union devlink_param_value *value) +{ + if (param->type != DEVLINK_PARAM_TYPE_BOOL && + !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + return -EINVAL; + + switch (param->type) { + case DEVLINK_PARAM_TYPE_U8: + value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U16: + value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U32: + value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) > + DEVLINK_PARAM_MAX_STRING_VALUE) + return -EINVAL; + value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_BOOL: + value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? + true : false; + break; + } + return 0; +} + static struct devlink_param_item * devlink_param_get_from_info(struct devlink *devlink, struct genl_info *info) @@ -2887,6 +2959,58 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum devlink_param_type param_type; + struct devlink_param_gset_ctx ctx; + enum devlink_param_cmode cmode; + struct devlink_param_item *param_item; + const struct devlink_param *param; + union devlink_param_value value; + int err = 0; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + param = param_item->param; + err = devlink_param_type_get_from_info(info, &param_type); + if (err) + return err; + if (param_type != param->type) + return -EINVAL; + err = devlink_param_value_get_from_info(param, info, &value); + if (err) + return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, info->extack); + if (err) + return err; + } + + if 
(!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]) + return -EINVAL; + cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); + if (!devlink_param_cmode_is_supported(param, cmode)) + return -EOPNOTSUPP; + + if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + param_item->driverinit_value = value; + param_item->driverinit_value_valid = true; + } else { + if (!param->set) + return -EOPNOTSUPP; + ctx.val = value; + ctx.cmode = cmode; + err = devlink_param_set(devlink, param, &ctx); + if (err) + return err; + } + + return 0; +} + static int devlink_param_register_one(struct devlink *devlink, const struct devlink_param *param) { @@ -2942,6 +3066,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -3133,6 +3260,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_PARAM_SET, + .doit = devlink_nl_cmd_param_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From ec01aeb1803eaaf0d006e7b07b5ddb5e429c38a4 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:31 +0300 Subject: devlink: Add support for get/set driverinit value "driverinit" configuration mode value is held by devlink to enable the driver query the value after reload. 
Two additional functions added to help the driver get/set the value from/to devlink: devlink_param_driverinit_value_set() and devlink_param_driverinit_value_get(). Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 18 ++++++++++++ net/core/devlink.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 88062752dcd7..3302e43b09a4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -503,6 +503,10 @@ int devlink_params_register(struct devlink *devlink, void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count); +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val); +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val); #else @@ -711,6 +715,20 @@ devlink_params_unregister(struct devlink *devlink, } +static inline int +devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + return -EOPNOTSUPP; +} + +static inline int +devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 0cd7a42dcec2..3af08f4562b5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2645,6 +2645,17 @@ devlink_param_find_by_name(struct list_head *param_list, return NULL; } +static struct devlink_param_item * +devlink_param_find_by_id(struct list_head *param_list, u32 param_id) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (param_item->param->id == param_id) + return param_item; + return NULL; +} + static bool 
devlink_param_cmode_is_supported(const struct devlink_param *param, enum devlink_param_cmode cmode) @@ -3966,6 +3977,72 @@ void devlink_params_unregister(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devlink_params_unregister); +/** + * devlink_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. + */ +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + struct devlink_param_item *param_item; + + if (!devlink->ops || !devlink->ops->reload) + return -EOPNOTSUPP; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!param_item->driverinit_value_valid || + !devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + *init_val = param_item->driverinit_value; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); + +/** + * devlink_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. 
+ */ +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From ea601e17098856ee059f35c2a75659e57df81f25 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:32 +0300 Subject: devlink: Add devlink notifications support for params Add devlink_param_notify() function to support devlink param notifications. Add notification call to devlink param set, register and unregister functions. Add devlink_param_value_changed() function to enable the driver notify devlink on value change. Driver should use this function after value was changed on any configuration mode part to driverinit. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 7 +++++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3302e43b09a4..792edaa996ba 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -507,6 +507,7 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *init_val); int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val); +void devlink_param_value_changed(struct devlink *devlink, u32 param_id); #else @@ -729,6 +730,12 @@ devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, return -EOPNOTSUPP; } +static inline void +devlink_param_value_changed(struct devlink *devlink, u32 param_id) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ea0623e568f0..68641fb56654 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -80,6 +80,8 @@ enum devlink_command { DEVLINK_CMD_PARAM_GET, /* can dump */ DEVLINK_CMD_PARAM_SET, + DEVLINK_CMD_PARAM_NEW, + DEVLINK_CMD_PARAM_DEL, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 3af08f4562b5..89d948fd4727 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2828,6 +2828,28 @@ genlmsg_cancel: return -EMSGSIZE; } +static void devlink_param_notify(struct devlink *devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + 
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { @@ -3019,6 +3041,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, return err; } + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } @@ -3042,6 +3065,7 @@ static int devlink_param_register_one(struct devlink *devlink, param_item->param = param; list_add_tail(&param_item->list, &devlink->param_list); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } @@ -3053,6 +3077,7 @@ static void devlink_param_unregister_one(struct devlink *devlink, param_item = devlink_param_find_by_name(&devlink->param_list, param->name); WARN_ON(!param_item); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); list_del(&param_item->list); kfree(param_item); } @@ -4039,10 +4064,35 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); +/** + * devlink_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink: devlink + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. + * For driverinit configuration mode driver should use the function + * devlink_param_driverinit_value_set() instead. 
+ */ +void devlink_param_value_changed(struct devlink *devlink, u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_param_value_changed); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From 036467c3990c75ec8ce97e517a864b52e184a1aa Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:33 +0300 Subject: devlink: Add generic parameters internal_err_reset and max_macs Add 2 first generic parameters to devlink configuration parameters set: internal_err_reset - When set enables reset device on internal errors. max_macs - max number of MACs per ETH port. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 31 +++++++++++++++++++++++++++++++ net/core/devlink.c | 14 +++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 792edaa996ba..a1c230d18911 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -356,12 +356,43 @@ struct devlink_param_item { }; enum devlink_param_generic_id { + DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + DEVLINK_PARAM_GENERIC_ID_MAX_MACS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, DEVLINK_PARAM_GENERIC_ID_MAX = __DEVLINK_PARAM_GENERIC_ID_MAX - 1, }; +#define DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME "internal_error_reset" +#define DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_MAX_MACS_NAME "max_macs" +#define DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ +{ \ + .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ + .name = DEVLINK_PARAM_GENERIC_##_id##_NAME, 
\ + .type = DEVLINK_PARAM_GENERIC_##_id##_TYPE, \ + .generic = true, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ +} + +#define DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, _get, _set, _validate) \ +{ \ + .id = _id, \ + .name = _name, \ + .type = _type, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ +} + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index 89d948fd4727..5bbd0aa7571a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2604,7 +2604,19 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } -static const struct devlink_param devlink_param_generic[] = {}; +static const struct devlink_param devlink_param_generic[] = { + { + .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, + }, + +}; static int devlink_param_generic_verify(const struct devlink_param *param) { -- cgit v1.2.3 From f567bcdae2b052bab94be7903863cb9ab47c907c Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Wed, 4 Jul 2018 14:30:36 +0300 Subject: devlink: Add enable_sriov boolean generic parameter enable_sriov - Enables Single-Root Input/Output Virtualization(SR-IOV) characteristic of the device. Reviewed-by: Michael Chan Signed-off-by: Vasundhara Volam Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index a1c230d18911..8ed571385626 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -358,6 +358,7 @@ struct devlink_param_item { enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -370,6 +371,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_MAX_MACS_NAME "max_macs" #define DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME "enable_sriov" +#define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 5bbd0aa7571a..470f3dbfecfe 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2615,7 +2615,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, }, - + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From c53e0c787e672b4edbf719b7c1ec5833db3af2da Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Jul 2018 16:13:59 -0500 Subject: tipc: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Warning level 2 was used: -Wimplicit-fallthrough=2 Signed-off-by: Gustavo A. R. Silva Acked-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 1 + net/tipc/link.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2dfb492a7c94..fd6d8f18955c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -610,6 +610,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; + /* else: fall through */ case NETDEV_UP: test_and_set_bit_lock(0, &b->up); break; diff --git a/net/tipc/link.c b/net/tipc/link.c index 695acb783969..63860329dbaa 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1063,6 +1063,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, skb_queue_tail(mc_inputq, skb); return true; } + /* else: fall through */ case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; -- cgit v1.2.3 From 3cc87d03992fc2a2d57b8a76110463fb18a95d72 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Jul 2018 17:05:00 -0500 Subject: net: decnet: dn_nsp_in: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/decnet/dn_nsp_in.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 1b2120645730..34aba55ed573 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -491,6 +491,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) break; case DN_RUN: sk->sk_shutdown |= SHUTDOWN_MASK; + /* fall through */ case DN_CC: scp->state = DN_CN; } -- cgit v1.2.3 From 2cc0608e42aceb38abc9e57a017449b9efc2e4a9 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Jul 2018 17:34:37 -0500 Subject: net: core: filter: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. 
Warning level 2 was used: -Wimplicit-fallthrough=2 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 547fd34589be..b9ec916f4e3a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4660,6 +4660,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* else: fall through */ default: return NULL; } -- cgit v1.2.3 From be01dc33b7b36fd974275ed91b85fcf2c6ae62aa Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 5 Jul 2018 14:42:49 +0200 Subject: batman-adv: fix checkpatch warning about misspelled "cache" commit a2d4df9b673c ("spelling.txt: add more spellings to spelling.txt") introduced the spellcheck of "cache" for checkpatch.pl. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 4229b01ac7b5..95a94160baf8 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -117,7 +117,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode, #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache + * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache * @inode: inode which was opened * @file: file handle to be initialized * -- cgit v1.2.3 From efe6aaca67a0229a195f493d142102c864b41dde Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 5 Jul 2018 15:47:39 +0100 Subject: net: ipv4: fix list processing on L3 slave devices If we have an L3 master device, l3mdev_ip_rcv() will steal the skb, but we were returning NET_RX_SUCCESS from ip_rcv_finish_core() which meant that ip_list_rcv_finish() would keep it on the list. 
Instead let's move the l3mdev_ip_rcv() call into the caller, so that our response to a steal can be different in the single packet path (return NET_RX_SUCCESS) and the list path (forget this packet and continue). Fixes: 5fa12739a53d ("net: ipv4: listify ip_rcv_finish") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 14ba628b2761..1a3b6f32b1c9 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -316,13 +316,6 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct rtable *rt; int err; - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -408,8 +401,16 @@ drop_error: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret = ip_rcv_finish_core(net, sk, skb); + int ret; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + ret = ip_rcv_finish_core(net, sk, skb); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -545,6 +546,12 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct dst_entry *dst; list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + continue; if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) continue; -- cgit v1.2.3 From d8269e2cbf908f9d26aa5d3217236227dffd1d89 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 5 Jul 2018 15:49:42 +0100 Subject: net: ipv6: listify ipv6_rcv() and ip6_rcv_finish() Essentially the same as the ipv4 
equivalents. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 + net/ipv6/af_inet6.c | 1 + net/ipv6/ip6_input.c | 131 ++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 118 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 16475c269749..b7843e0b16ee 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -922,6 +922,8 @@ static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9ed0eae91758..c9535354149f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, + .list_func = ipv6_list_rcv, }; static int __init ipv6_packet_init(void) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index f08d34491ece..6242682be876 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,17 +47,11 @@ #include #include -int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static void ip6_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { void (*edemux)(struct sk_buff *skb); - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip6_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) } if (!skb_valid_dst(skb)) 
ip6_route_input(skb); +} + +int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + ip6_rcv_finish_core(net, sk, skb); return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static void ip6_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + dst_input(skb); +} + +static void ip6_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + continue; + ip6_rcv_finish_core(net, sk, skb); + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv_finish(&sublist); +} + +static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, + struct net *net) { const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; - struct net *net = dev_net(skb->dev); if (skb->pkt_type == PACKET_OTHERHOST) { kfree_skb(skb); - return NET_RX_DROP; + return NULL; } rcu_read_lock(); @@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ipv6_parse_hopopts(skb) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 
rcu_read_unlock(); - return NET_RX_DROP; + return NULL; } } @@ -205,15 +252,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip6_rcv_finish); + return skb; err: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); - return NET_RX_DROP; + return NULL; +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(skb->dev); + + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip6_rcv_finish); +} + +static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip6_rcv_finish); + ip6_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IPv6 packets */ +void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } /* -- cgit v1.2.3 From 
03bc05e1a4972f73b4eb8907aa373369e825c252 Mon Sep 17 00:00:00 2001 From: Michael Scott Date: Tue, 19 Jun 2018 16:44:06 -0700 Subject: 6lowpan: iphc: reset mac_header after decompress to fix panic After decompression of 6lowpan socket data, an IPv6 header is inserted before the existing socket payload. After this, we reset the network_header value of the skb to account for the difference in payload size from prior to decompression + the addition of the IPv6 header. However, we fail to reset the mac_header value. Leaving the mac_header value untouched here, can cause a calculation error in net/packet/af_packet.c packet_rcv() function when an AF_PACKET socket is opened in SOCK_RAW mode for use on a 6lowpan interface. On line 2088, the data pointer is moved backward by the value returned from skb_mac_header(). If skb->data is adjusted so that it is before the skb->head pointer (which can happen when an old value of mac_header is left in place) the kernel generates a panic in net/core/skbuff.c line 1717. This panic can be generated by BLE 6lowpan interfaces (such as bt0) and 802.15.4 interfaces (such as lowpan0) as they both use the same 6lowpan sources for compression and decompression. 
Signed-off-by: Michael Scott Acked-by: Alexander Aring Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/6lowpan/iphc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 6b1042e21656..52fad5dad9f7 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -770,6 +770,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, hdr.hop_limit, &hdr.daddr); skb_push(skb, sizeof(hdr)); + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, &hdr, sizeof(hdr)); -- cgit v1.2.3 From cfdb0c2d095ac5d7f09cac1317b7d0a9e8178134 Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 29 Jun 2018 12:12:50 +0530 Subject: Bluetooth: Store Resolv list size When the controller supports the Read LE Resolv List size feature, the maximum list size are read and now stored. Before patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 17.979791 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 17.980629 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 17.980786 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 17.981627 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #59 [hci0] 17.981786 > HCI Event: Command Complete (0x0e) plen 12 #60 [hci0] 17.982636 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 After patch: < HCI Command: LE Read White List... 
(0x08|0x000f) plen 0 #55 [hci0] 13.338168 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 13.338842 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 13.339029 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 13.339939 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 13.340152 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 13.340952 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #61 [hci0] 13.341180 > HCI Event: Command Complete (0x0e) plen 12 #62 [hci0] 13.341898 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 ++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 8 ++++++++ net/bluetooth/hci_debugfs.c | 19 +++++++++++++++++++ net/bluetooth/hci_event.c | 18 ++++++++++++++++++ 5 files changed, 53 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 1668211297a9..484f24c7a415 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1490,6 +1490,12 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a +struct hci_rp_le_read_resolv_list_size { + __u8 status; + __u8 size; +} __packed; + #define HCI_OP_LE_READ_MAX_DATA_LEN 0x202f struct hci_rp_le_read_max_data_len { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 893bbbb5d2fa..409f49bd8338 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -221,6 +221,7 @@ struct hci_dev { __u8 
features[HCI_MAX_PAGES][8]; __u8 le_features[8]; __u8 le_white_list_size; + __u8 le_resolv_list_size; __u8 le_states[8]; __u8 commands[64]; __u8 hci_ver; @@ -367,6 +368,7 @@ struct hci_dev { struct list_head identity_resolving_keys; struct list_head remote_oob_data; struct list_head le_white_list; + struct list_head le_resolv_list; struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ee8ef1228263..036e14267d0a 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -714,6 +714,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); } + if (hdev->commands[34] & 0x40) { + /* Read LE Resolving List Size */ + hci_req_add(req, HCI_OP_LE_READ_RESOLV_LIST_SIZE, + 0, NULL); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); @@ -3017,6 +3023,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_white_list); + INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); @@ -3218,6 +3225,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 0d8ab5b3c177..51f5b1efc3a5 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -694,6 +694,21 @@ static int white_list_show(struct seq_file *f, void *ptr) DEFINE_SHOW_ATTRIBUTE(white_list); +static int 
resolv_list_show(struct seq_file *f, void *ptr) +{ + struct hci_dev *hdev = f->private; + struct bdaddr_list *b; + + hci_dev_lock(hdev); + list_for_each_entry(b, &hdev->le_resolv_list, list) + seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(resolv_list); + static int identity_resolving_keys_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -955,6 +970,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &hdev->le_white_list_size); debugfs_create_file("white_list", 0444, hdev->debugfs, hdev, &white_list_fops); + debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs, + &hdev->le_resolv_list_size); + debugfs_create_file("resolv_list", 0444, hdev->debugfs, hdev, + &resolv_list_fops); debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs, hdev, &identity_resolving_keys_fops); debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 235b5aaab23d..6ee69a79258f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -221,6 +221,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) hdev->ssp_debug_mode = 0; hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_resolv_list); } static void hci_cc_read_stored_link_key(struct hci_dev *hdev, @@ -1306,6 +1307,19 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev, hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); } +static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_le_read_resolv_list_size *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size); + + if (rp->status) + return; + + hdev->le_resolv_list_size = rp->size; +} + static void hci_cc_le_read_max_data_len(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3015,6 +3029,10 @@ static void 
hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_write_def_data_len(hdev, skb); break; + case HCI_OP_LE_READ_RESOLV_LIST_SIZE: + hci_cc_le_read_resolv_list_size(hdev, skb); + break; + case HCI_OP_LE_READ_MAX_DATA_LEN: hci_cc_le_read_max_data_len(hdev, skb); break; -- cgit v1.2.3 From 545f2596b907f0747170c7cb71edc74cecf68c5c Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 29 Jun 2018 12:13:20 +0530 Subject: Bluetooth: Add HCI command for clear Resolv list Check for Resolv list supported by controller. So check the supported command first before issuing this command i.e., HCI_OP_LE_CLEAR_RESOLV_LIST Before patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 13.338168 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 13.338842 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 13.339029 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 13.339939 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 13.340152 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 13.340952 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #61 [hci0] 13.341180 > HCI Event: Command Complete (0x0e) plen 12 #62 [hci0] 13.341898 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 After patch: < HCI Command: LE Read White List...
(0x08|0x000f) plen 0 #55 [hci0] 28.919131 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 28.920016 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 28.920164 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 28.920873 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 28.921109 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 28.922016 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear Resolving... (0x08|0x0029) plen 0 #61 [hci0] 28.922166 > HCI Event: Command Complete (0x0e) plen 4 #62 [hci0] 28.922872 LE Clear Resolving List (0x08|0x0029) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #63 [hci0] 28.923117 > HCI Event: Command Complete (0x0e) plen 12 #64 [hci0] 28.924030 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 ++ net/bluetooth/hci_core.c | 5 +++++ net/bluetooth/hci_event.c | 17 +++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 484f24c7a415..4af1a3a4d9b1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1490,6 +1490,8 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_CLEAR_RESOLV_LIST 0x2029 + #define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a struct hci_rp_le_read_resolv_list_size { __u8 status; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 036e14267d0a..ce2447d89ce1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -720,6 +720,11 @@ static int hci_init3_req(struct hci_request 
*req, unsigned long opt) 0, NULL); } + if (hdev->commands[34] & 0x20) { + /* Clear LE Resolving List */ + hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6ee69a79258f..562e7a854ed6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1307,6 +1307,19 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev, hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); } +static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + hci_bdaddr_list_clear(&hdev->le_resolv_list); +} + static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3029,6 +3042,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_write_def_data_len(hdev, skb); break; + case HCI_OP_LE_CLEAR_RESOLV_LIST: + hci_cc_le_clear_resolv_list(hdev, skb); + break; + case HCI_OP_LE_READ_RESOLV_LIST_SIZE: hci_cc_le_read_resolv_list_size(hdev, skb); break; -- cgit v1.2.3 From 3baef810462746cd5a085c1e1416829d2af2622d Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:27 +0530 Subject: Bluetooth: Introduce helpers for LE set scan start and complete Introduce a helper hci_req_start_scan() which starts an LE scan and call it from passive_Scan() and active_scan(). There is no functionality change in this patch.
This is basically done to enable extended scanning if the controller supports which will be done in the subsequent patch Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 37 ++++++++++++++++------------- net/bluetooth/hci_request.c | 58 +++++++++++++++++++-------------------------- 2 files changed, 46 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 562e7a854ed6..9ec07cd4ab13 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1127,24 +1127,11 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr, d->last_adv_data_len = len; } -static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, - struct sk_buff *skb) +static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) { - struct hci_cp_le_set_scan_enable *cp; - __u8 status = *((__u8 *) skb->data); - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - if (status) - return; - - cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE); - if (!cp) - return; - hci_dev_lock(hdev); - switch (cp->enable) { + switch (enable) { case LE_SCAN_ENABLE: hci_dev_set_flag(hdev, HCI_LE_SCAN); if (hdev->le_scan_type == LE_SCAN_ACTIVE) @@ -1190,13 +1177,31 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, default: bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d", - cp->enable); + enable); break; } hci_dev_unlock(hdev); } +static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_scan_enable *cp; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE); + if (!cp) + return; + + le_set_scan_enable_complete(hdev, cp->enable); +} + static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, struct sk_buff *skb) { diff --git a/net/bluetooth/hci_request.c 
b/net/bluetooth/hci_request.c index e44d34734834..76dcc3f14cea 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -767,10 +767,30 @@ static bool scan_use_rpa(struct hci_dev *hdev) return hci_dev_test_flag(hdev, HCI_PRIVACY); } -void hci_req_add_le_passive_scan(struct hci_request *req) +static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, + u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_scan_param param_cp; struct hci_cp_le_set_scan_enable enable_cp; + + memset(¶m_cp, 0, sizeof(param_cp)); + param_cp.type = type; + param_cp.interval = cpu_to_le16(interval); + param_cp.window = cpu_to_le16(window); + param_cp.own_address_type = own_addr_type; + param_cp.filter_policy = filter_policy; + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), + ¶m_cp); + + memset(&enable_cp, 0, sizeof(enable_cp)); + enable_cp.enable = LE_SCAN_ENABLE; + enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), + &enable_cp); +} + +void hci_req_add_le_passive_scan(struct hci_request *req) +{ struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; @@ -804,20 +824,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) filter_policy |= 0x02; - memset(¶m_cp, 0, sizeof(param_cp)); - param_cp.type = LE_SCAN_PASSIVE; - param_cp.interval = cpu_to_le16(hdev->le_scan_interval); - param_cp.window = cpu_to_le16(hdev->le_scan_window); - param_cp.own_address_type = own_addr_type; - param_cp.filter_policy = filter_policy; - hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), - ¶m_cp); - - memset(&enable_cp, 0, sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), - &enable_cp); + hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval, + hdev->le_scan_window, 
own_addr_type, filter_policy); } static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) @@ -2010,8 +2018,6 @@ static int active_scan(struct hci_request *req, unsigned long opt) { uint16_t interval = opt; struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_param param_cp; - struct hci_cp_le_set_scan_enable enable_cp; u8 own_addr_type; int err; @@ -2050,22 +2056,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; - memset(¶m_cp, 0, sizeof(param_cp)); - param_cp.type = LE_SCAN_ACTIVE; - param_cp.interval = cpu_to_le16(interval); - param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN); - param_cp.own_address_type = own_addr_type; - - hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), - ¶m_cp); - - memset(&enable_cp, 0, sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), - &enable_cp); - + hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, + own_addr_type, 0); return 0; } -- cgit v1.2.3 From a2344b9e3a8c5c2064306b0d99b0e9a6c4813c08 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:28 +0530 Subject: Bluetooth: Use extended scanning if controller supports This implements Set extended scan param and set extended scan enable commands and use it for start LE scan based on controller support. The new features added in these commands are setting of new PHY for scanning and setting of scan duration. Both features are disabled for now, meaning only 1M PHY is set and scan duration is set to 0 which means that scanning will be done until scan disable is called.
< HCI Command: LE Set Extended Scan Parameters (0x08|0x0041) plen 8 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x01 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 24 +++++++++ include/net/bluetooth/hci_core.h | 4 ++ net/bluetooth/hci_event.c | 51 ++++++++++++++++++ net/bluetooth/hci_request.c | 110 ++++++++++++++++++++++++++++++--------- 4 files changed, 164 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4af1a3a4d9b1..8c2868f439e7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1514,6 +1514,30 @@ struct hci_cp_le_set_default_phy { __u8 rx_phys; } __packed; +#define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 +struct hci_cp_le_set_ext_scan_params { + __u8 own_addr_type; + __u8 filter_policy; + __u8 scanning_phys; + __u8 data[0]; +} __packed; + +#define LE_SCAN_PHY_1M 0x01 + +struct hci_cp_le_scan_phy_params { + __u8 type; + __le16 interval; + __le16 window; +} __packed; + +#define HCI_OP_LE_SET_EXT_SCAN_ENABLE 0x2042 +struct hci_cp_le_set_ext_scan_enable { + __u8 enable; + __u8 filter_dup; + __le16 duration; + __le16 period; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 409f49bd8338..cc0bde74dd45 100644 --- a/include/net/bluetooth/hci_core.h +++ 
b/include/net/bluetooth/hci_core.h @@ -1158,6 +1158,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +/* Use ext scanning if set ext scan param and ext scan enable is supported */ +#define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ + ((dev)->commands[37] & 0x40)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 9ec07cd4ab13..15afad005d72 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1098,6 +1098,31 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_ext_scan_params *cp; + __u8 status = *((__u8 *) skb->data); + struct hci_cp_le_scan_phy_params *phy_param; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS); + if (!cp) + return; + + phy_param = (void *)cp->data; + + hci_dev_lock(hdev); + + hdev->le_scan_type = phy_param->type; + + hci_dev_unlock(hdev); +} + static bool has_pending_adv_report(struct hci_dev *hdev) { struct discovery_state *d = &hdev->discovery; @@ -1202,6 +1227,24 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, le_set_scan_enable_complete(hdev, cp->enable); } +static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_ext_scan_enable *cp; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE); + if (!cp) + return; + + le_set_scan_enable_complete(hdev, cp->enable); +} + static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3079,6 +3122,14 @@ 
static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_write_ssp_debug_mode(hdev, skb); break; + case HCI_OP_LE_SET_EXT_SCAN_PARAMS: + hci_cc_le_set_ext_scan_param(hdev, skb); + break; + + case HCI_OP_LE_SET_EXT_SCAN_ENABLE: + hci_cc_le_set_ext_scan_enable(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 76dcc3f14cea..faf7c711234c 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -647,11 +647,22 @@ void __hci_req_update_eir(struct hci_request *req) void hci_req_add_le_scan_disable(struct hci_request *req) { - struct hci_cp_le_set_scan_enable cp; + struct hci_dev *hdev = req->hdev; - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_DISABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + if (use_ext_scan(hdev)) { + struct hci_cp_le_set_ext_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_DISABLE; + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp), + &cp); + } else { + struct hci_cp_le_set_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_DISABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + } } static void add_to_white_list(struct hci_request *req, @@ -770,23 +781,60 @@ static bool scan_use_rpa(struct hci_dev *hdev) static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { - struct hci_cp_le_set_scan_param param_cp; - struct hci_cp_le_set_scan_enable enable_cp; - - memset(¶m_cp, 0, sizeof(param_cp)); - param_cp.type = type; - param_cp.interval = cpu_to_le16(interval); - param_cp.window = cpu_to_le16(window); - param_cp.own_address_type = own_addr_type; - param_cp.filter_policy = filter_policy; - hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), - ¶m_cp); + struct hci_dev *hdev = req->hdev; - memset(&enable_cp, 0, 
sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), - &enable_cp); + /* Use ext scanning if set ext scan param and ext scan enable is + * supported + */ + if (use_ext_scan(hdev)) { + struct hci_cp_le_set_ext_scan_params *ext_param_cp; + struct hci_cp_le_set_ext_scan_enable ext_enable_cp; + struct hci_cp_le_scan_phy_params *phy_params; + /* Ony single PHY (1M) is supported as of now */ + u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 1]; + + ext_param_cp = (void *)data; + phy_params = (void *)ext_param_cp->data; + + memset(ext_param_cp, 0, sizeof(*ext_param_cp)); + ext_param_cp->own_addr_type = own_addr_type; + ext_param_cp->filter_policy = filter_policy; + ext_param_cp->scanning_phys = LE_SCAN_PHY_1M; + + memset(phy_params, 0, sizeof(*phy_params)); + phy_params->type = type; + phy_params->interval = cpu_to_le16(interval); + phy_params->window = cpu_to_le16(window); + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS, + sizeof(*ext_param_cp) + sizeof(*phy_params), + ext_param_cp); + + memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); + ext_enable_cp.enable = LE_SCAN_ENABLE; + ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, + sizeof(ext_enable_cp), &ext_enable_cp); + } else { + struct hci_cp_le_set_scan_param param_cp; + struct hci_cp_le_set_scan_enable enable_cp; + + memset(¶m_cp, 0, sizeof(param_cp)); + param_cp.type = type; + param_cp.interval = cpu_to_le16(interval); + param_cp.window = cpu_to_le16(window); + param_cp.own_address_type = own_addr_type; + param_cp.filter_policy = filter_policy; + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), + ¶m_cp); + + memset(&enable_cp, 0, sizeof(enable_cp)); + enable_cp.enable = LE_SCAN_ENABLE; + enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), + &enable_cp); + } } 
void hci_req_add_le_passive_scan(struct hci_request *req) @@ -1948,7 +1996,6 @@ discov_stopped: static int le_scan_restart(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_enable cp; /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) @@ -1956,10 +2003,23 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt) hci_req_add_le_scan_disable(req); - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_ENABLE; - cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + if (use_ext_scan(hdev)) { + struct hci_cp_le_set_ext_scan_enable ext_enable_cp; + + memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); + ext_enable_cp.enable = LE_SCAN_ENABLE; + ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, + sizeof(ext_enable_cp), &ext_enable_cp); + } else { + struct hci_cp_le_set_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_ENABLE; + cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + } return 0; } -- cgit v1.2.3 From c215e9397b00b3045a668120ed7dbd89f2866e74 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:29 +0530 Subject: Bluetooth: Process extended ADV report event This patch enables Extended ADV report event if extended scanning is supported in the controller and process the same. The new features are not handled and for now its as good as legacy ADV report. 
> HCI Event: LE Meta Event (0x3e) plen 53 LE Extended Advertising Report (0x0d) Num reports: 1 Entry 0 Event type: 0x0013 Props: 0x0013 Connectable Scannable Use legacy advertising PDUs Data status: Complete Legacy PDU Type: ADV_IND (0x0013) Address type: Random (0x01) Address: DB:7E:2E:1A:85:E8 (Static) Primary PHY: LE 1M Secondary PHY: LE 1M SID: 0x00 TX power: 0 dBm RSSI: -90 dBm (0xa6) Periodic advertising invteral: 0.00 msec (0x0000) Direct address type: Public (0x00) Direct address: 00:00:00:00:00:00 (OUI 00-00-00) Data length: 0x1b 0f 09 44 65 73 69 67 6e 65 72 20 4d 6f 75 73 65 ..Designer Mouse 03 19 c2 03 02 01 05 03 03 12 18 ........... Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 26 +++++++++++++++++++++++ net/bluetooth/hci_core.c | 9 ++++++++ net/bluetooth/hci_event.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8c2868f439e7..0ec51eb14810 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1925,6 +1925,15 @@ struct hci_ev_le_conn_complete { #define LE_ADV_SCAN_IND 0x02 #define LE_ADV_NONCONN_IND 0x03 #define LE_ADV_SCAN_RSP 0x04 +#define LE_ADV_INVALID 0x05 + +/* Legacy event types in extended adv report */ +#define LE_LEGACY_ADV_IND 0x0013 +#define LE_LEGACY_ADV_DIRECT_IND 0x0015 +#define LE_LEGACY_ADV_SCAN_IND 0x0012 +#define LE_LEGACY_NONCONN_IND 0x0010 +#define LE_LEGACY_SCAN_RSP_ADV 0x001b +#define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 @@ -1989,6 +1998,23 @@ struct hci_ev_le_direct_adv_info { __s8 rssi; } __packed; +#define HCI_EV_LE_EXT_ADV_REPORT 0x0d +struct hci_ev_le_ext_adv_report { + __le16 evt_type; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 primary_phy; + __u8 secondary_phy; + __u8 sid; + __u8 tx_power; + __s8 rssi; + __le16 interval; + __u8 direct_addr_type; + 
bdaddr_t direct_addr; + __u8 length; + __u8 data[0]; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ce2447d89ce1..e3ec2d782762 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -695,6 +695,15 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[35] & (0x20 | 0x40)) events[1] |= 0x08; /* LE PHY Update Complete */ + /* If the controller supports LE Set Extended Scan Parameters + * and LE Set Extended Scan Enable commands, enable the + * corresponding event. + */ + if (use_ext_scan(hdev)) + events[1] |= 0x10; /* LE Extended Advertising + * Report + */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 15afad005d72..6c6fd4f55f23 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5048,6 +5048,54 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static u8 convert_legacy_evt_type(u16 evt_type) +{ + switch (evt_type) { + case LE_LEGACY_ADV_IND: + return LE_ADV_IND; + case LE_LEGACY_ADV_DIRECT_IND: + return LE_ADV_DIRECT_IND; + case LE_LEGACY_ADV_SCAN_IND: + return LE_ADV_SCAN_IND; + case LE_LEGACY_NONCONN_IND: + return LE_ADV_NONCONN_IND; + case LE_LEGACY_SCAN_RSP_ADV: + case LE_LEGACY_SCAN_RSP_ADV_SCAN: + return LE_ADV_SCAN_RSP; + } + + BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", + evt_type); + + return LE_ADV_INVALID; +} + +static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + u8 num_reports = skb->data[0]; + void *ptr = &skb->data[1]; + + hci_dev_lock(hdev); + + while (num_reports--) { + struct hci_ev_le_ext_adv_report *ev = ptr; + u8 legacy_evt_type; + u16 evt_type; + + evt_type = __le16_to_cpu(ev->evt_type); + legacy_evt_type = 
convert_legacy_evt_type(evt_type); + if (legacy_evt_type != LE_ADV_INVALID) { + process_adv_report(hdev, legacy_evt_type, &ev->bdaddr, + ev->bdaddr_type, NULL, 0, ev->rssi, + ev->data, ev->length); + } + + ptr += sizeof(*ev) + ev->length + 1; + } + + hci_dev_unlock(hdev); +} + static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5280,6 +5328,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_direct_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_EXT_ADV_REPORT: + hci_le_ext_adv_report_evt(hdev, skb); + break; + default: break; } -- cgit v1.2.3 From d12fb05643f9b48134c7650f5a03f9729aacfde4 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:30 +0530 Subject: Bluetooth: Introduce helpers for le conn status and complete This is done so that the helpers can be used for extended conn implementation which will be done in subsequent patch. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 110 +++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6c6fd4f55f23..14e42e157de9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1971,55 +1971,63 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } -static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) +static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, + u8 peer_addr_type, u8 own_address_type, + u8 filter_policy) { - struct hci_cp_le_create_conn *cp; struct hci_conn *conn; - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI - * request completion callbacks used for connecting. 
- */ - if (status) - return; - - cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN); - if (!cp) - return; - - hci_dev_lock(hdev); - - conn = hci_conn_hash_lookup_le(hdev, &cp->peer_addr, - cp->peer_addr_type); + conn = hci_conn_hash_lookup_le(hdev, peer_addr, + peer_addr_type); if (!conn) - goto unlock; + return; /* Store the initiator and responder address information which * is needed for SMP. These values will not change during the * lifetime of the connection. */ - conn->init_addr_type = cp->own_address_type; - if (cp->own_address_type == ADDR_LE_DEV_RANDOM) + conn->init_addr_type = own_address_type; + if (own_address_type == ADDR_LE_DEV_RANDOM) bacpy(&conn->init_addr, &hdev->random_addr); else bacpy(&conn->init_addr, &hdev->bdaddr); - conn->resp_addr_type = cp->peer_addr_type; - bacpy(&conn->resp_addr, &cp->peer_addr); + conn->resp_addr_type = peer_addr_type; + bacpy(&conn->resp_addr, peer_addr); /* We don't want the connection attempt to stick around * indefinitely since LE doesn't have a page timeout concept * like BR/EDR. Set a timer for any connection that doesn't use * the white list for connecting. */ - if (cp->filter_policy == HCI_LE_USE_PEER_ADDR) + if (filter_policy == HCI_LE_USE_PEER_ADDR) queue_delayed_work(conn->hdev->workqueue, &conn->le_conn_timeout, conn->conn_timeout); +} + +static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_create_conn *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + /* All connection failure handling is taken care of by the + * hci_le_conn_failed function which is triggered by the HCI + * request completion callbacks used for connecting. 
+ */ + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN); + if (!cp) + return; + + hci_dev_lock(hdev); + + cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type, + cp->own_address_type, cp->filter_policy); -unlock: hci_dev_unlock(hdev); } @@ -4551,16 +4559,15 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, } #endif -static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, + bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle, + u16 interval, u16 latency, u16 supervision_timeout) { - struct hci_ev_le_conn_complete *ev = (void *) skb->data; struct hci_conn_params *params; struct hci_conn *conn; struct smp_irk *irk; u8 addr_type; - BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); - hci_dev_lock(hdev); /* All controllers implicitly stop advertising in the event of a @@ -4570,13 +4577,13 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_lookup_le_connect(hdev); if (!conn) { - conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); + conn = hci_conn_add(hdev, LE_LINK, bdaddr, role); if (!conn) { bt_dev_err(hdev, "no memory for new connection"); goto unlock; } - conn->dst_type = ev->bdaddr_type; + conn->dst_type = bdaddr_type; /* If we didn't have a hci_conn object previously * but we're in master role this must be something @@ -4587,8 +4594,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * initiator address based on the HCI_PRIVACY flag. 
*/ if (conn->out) { - conn->resp_addr_type = ev->bdaddr_type; - bacpy(&conn->resp_addr, &ev->bdaddr); + conn->resp_addr_type = bdaddr_type; + bacpy(&conn->resp_addr, bdaddr); if (hci_dev_test_flag(hdev, HCI_PRIVACY)) { conn->init_addr_type = ADDR_LE_DEV_RANDOM; bacpy(&conn->init_addr, &hdev->rpa); @@ -4612,8 +4619,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) else bacpy(&conn->resp_addr, &hdev->bdaddr); - conn->init_addr_type = ev->bdaddr_type; - bacpy(&conn->init_addr, &ev->bdaddr); + conn->init_addr_type = bdaddr_type; + bacpy(&conn->init_addr, bdaddr); /* For incoming connections, set the default minimum * and maximum connection interval. They will be used @@ -4639,8 +4646,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->dst_type = irk->addr_type; } - if (ev->status) { - hci_le_conn_failed(conn, ev->status); + if (status) { + hci_le_conn_failed(conn, status); goto unlock; } @@ -4659,17 +4666,17 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) mgmt_device_connected(hdev, conn, 0, NULL, 0); conn->sec_level = BT_SECURITY_LOW; - conn->handle = __le16_to_cpu(ev->handle); + conn->handle = handle; conn->state = BT_CONFIG; - conn->le_conn_interval = le16_to_cpu(ev->interval); - conn->le_conn_latency = le16_to_cpu(ev->latency); - conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout); + conn->le_conn_interval = interval; + conn->le_conn_latency = latency; + conn->le_supv_timeout = supervision_timeout; hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - if (!ev->status) { + if (!status) { /* The remote features procedure is defined for master * role only. So only in case of an initiated connection * request the remote features. 
@@ -4691,10 +4698,10 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_conn_hold(conn); } else { conn->state = BT_CONNECTED; - hci_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, status); } } else { - hci_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, status); } params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, @@ -4713,6 +4720,19 @@ unlock: hci_dev_unlock(hdev); } +static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_le_conn_complete *ev = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, + ev->role, le16_to_cpu(ev->handle), + le16_to_cpu(ev->interval), + le16_to_cpu(ev->latency), + le16_to_cpu(ev->supervision_timeout)); +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { -- cgit v1.2.3 From 4d94f95d30c8fbfe86068e9abed110974d697cf5 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 22:50:32 +0200 Subject: Bluetooth: Use extended LE Connection if supported This implements extended LE create connection and enhanced LE conn complete event if the controller supports them. For now it is as good as legacy LE connection and event as no new features in the extended connection are handled.
< HCI Command: LE Extended Create Connection (0x08|0x0043) plen 26 Filter policy: White list is not used (0x00) Own address type: Public (0x00) Peer address type: Random (0x01) Peer address: DB:7E:2E:1D:85:E8 (Static) Initiating PHYs: 0x01 Entry 0: LE 1M Scan interval: 60.000 msec (0x0060) Scan window: 60.000 msec (0x0060) Min connection interval: 50.00 msec (0x0028) Max connection interval: 70.00 msec (0x0038) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Min connection length: 0.000 msec (0x0000) Max connection length: 0.000 msec (0x0000) > HCI Event: Command Status (0x0f) plen 4 LE Extended Create Connection (0x08|0x0043) ncmd 2 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 31 LE Enhanced Connection Complete (0x0a) Status: Success (0x00) Handle: 3585 Role: Master (0x00) Peer address type: Random (0x01) Peer address: DB:7E:2E:1D:85:E8 (Static) Local resolvable private address: 00:00:00:00:00:00 (Non-Resolvable) Peer resolvable private address: 00:00:00:00:00:00 (Non-Resolvable) Connection interval: 67.50 msec (0x0036) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Master clock accuracy: 0x00 @ MGMT Event: Device Connected (0x000b) plen 40 LE Address: DB:7E:2E:1D:85:E8 (Static) Flags: 0x00000000 Data length: 27 Name (complete): Designer Mouse Appearance: Mouse (0x03c2) Flags: 0x05 LE Limited Discoverable Mode BR/EDR Not Supported 16-bit Service UUIDs (complete): 1 entry Human Interface Device (0x1812) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 36 ++++++++++++++++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_conn.c | 72 ++++++++++++++++++++++++++++++---------- net/bluetooth/hci_core.c | 8 +++++ net/bluetooth/hci_event.c | 47 ++++++++++++++++++++++++++ 5 files changed, 147 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0ec51eb14810..73e48be5bbb3 
100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1538,6 +1538,27 @@ struct hci_cp_le_set_ext_scan_enable { __le16 period; } __packed; +#define HCI_OP_LE_EXT_CREATE_CONN 0x2043 +struct hci_cp_le_ext_create_conn { + __u8 filter_policy; + __u8 own_addr_type; + __u8 peer_addr_type; + bdaddr_t peer_addr; + __u8 phys; + __u8 data[0]; +} __packed; + +struct hci_cp_le_ext_conn_param { + __le16 scan_interval; + __le16 scan_window; + __le16 conn_interval_min; + __le16 conn_interval_max; + __le16 conn_latency; + __le16 supervision_timeout; + __le16 min_ce_len; + __le16 max_ce_len; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 @@ -2015,6 +2036,21 @@ struct hci_ev_le_ext_adv_report { __u8 data[0]; } __packed; +#define HCI_EV_LE_ENHANCED_CONN_COMPLETE 0x0a +struct hci_ev_le_enh_conn_complete { + __u8 status; + __le16 handle; + __u8 role; + __u8 bdaddr_type; + bdaddr_t bdaddr; + bdaddr_t local_rpa; + bdaddr_t peer_rpa; + __le16 interval; + __le16 latency; + __le16 supervision_timeout; + __u8 clk_accurancy; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc0bde74dd45..a74453571264 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1161,6 +1161,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40)) +/* Use ext create connection if command is supported */ +#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 45ff5dc124cc..cc967ca67962 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -752,7 +752,6 
@@ static void hci_req_add_le_create_conn(struct hci_request *req, struct hci_conn *conn, bdaddr_t *direct_rpa) { - struct hci_cp_le_create_conn cp; struct hci_dev *hdev = conn->hdev; u8 own_addr_type; @@ -775,25 +774,62 @@ static void hci_req_add_le_create_conn(struct hci_request *req, return; } - memset(&cp, 0, sizeof(cp)); + if (use_ext_conn(hdev)) { + struct hci_cp_le_ext_create_conn *cp; + struct hci_cp_le_ext_conn_param *p; + /* As of now only LE 1M is supported */ + u8 data[sizeof(*cp) + sizeof(*p) * 1]; - /* Set window to be the same value as the interval to enable - * continuous scanning. - */ - cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); - cp.scan_window = cp.scan_interval; + cp = (void *) data; + p = (void *) cp->data; - bacpy(&cp.peer_addr, &conn->dst); - cp.peer_addr_type = conn->dst_type; - cp.own_address_type = own_addr_type; - cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); - cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); - cp.conn_latency = cpu_to_le16(conn->le_conn_latency); - cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); - cp.min_ce_len = cpu_to_le16(0x0000); - cp.max_ce_len = cpu_to_le16(0x0000); - - hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); + memset(cp, 0, sizeof(*cp)); + + bacpy(&cp->peer_addr, &conn->dst); + cp->peer_addr_type = conn->dst_type; + cp->own_addr_type = own_addr_type; + cp->phys = LE_SCAN_PHY_1M; + + memset(p, 0, sizeof(*p)); + + /* Set window to be the same value as the interval to enable + * continuous scanning. 
+ */ + + p->scan_interval = cpu_to_le16(hdev->le_scan_interval); + p->scan_window = p->scan_interval; + p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); + p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); + p->conn_latency = cpu_to_le16(conn->le_conn_latency); + p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); + p->min_ce_len = cpu_to_le16(0x0000); + p->max_ce_len = cpu_to_le16(0x0000); + + hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, sizeof(data), data); + + } else { + struct hci_cp_le_create_conn cp; + + memset(&cp, 0, sizeof(cp)); + + /* Set window to be the same value as the interval to enable + * continuous scanning. + */ + cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); + cp.scan_window = cp.scan_interval; + + bacpy(&cp.peer_addr, &conn->dst); + cp.peer_addr_type = conn->dst_type; + cp.own_address_type = own_addr_type; + cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); + cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); + cp.conn_latency = cpu_to_le16(conn->le_conn_latency); + cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); + cp.min_ce_len = cpu_to_le16(0x0000); + cp.max_ce_len = cpu_to_le16(0x0000); + + hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); + } conn->state = BT_CONNECT; clear_bit(HCI_CONN_SCANNING, &conn->flags); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e3ec2d782762..f5c21004186c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -704,6 +704,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ + /* If the controller supports the LE Extended Create Connection + * command, enable the corresponding event. 
+ */ + if (use_ext_conn(hdev)) + events[1] |= 0x02; /* LE Enhanced Connection + * Complete + */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 14e42e157de9..68192152c23b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2031,6 +2031,31 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } +static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_ext_create_conn *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + /* All connection failure handling is taken care of by the + * hci_le_conn_failed function which is triggered by the HCI + * request completion callbacks used for connecting. + */ + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_EXT_CREATE_CONN); + if (!cp) + return; + + hci_dev_lock(hdev); + + cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type, + cp->own_addr_type, cp->filter_policy); + + hci_dev_unlock(hdev); +} + static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status) { struct hci_cp_le_read_remote_features *cp; @@ -3233,6 +3258,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cs_le_start_enc(hdev, ev->status); break; + case HCI_OP_LE_EXT_CREATE_CONN: + hci_cs_le_ext_create_conn(hdev, ev->status); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; @@ -4733,6 +4762,20 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_enh_conn_complete *ev = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, + ev->role, le16_to_cpu(ev->handle), + le16_to_cpu(ev->interval), + 
le16_to_cpu(ev->latency), + le16_to_cpu(ev->supervision_timeout)); +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5352,6 +5395,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_ext_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_ENHANCED_CONN_COMPLETE: + hci_le_enh_conn_complete_evt(hdev, skb); + break; + default: break; } -- cgit v1.2.3 From 351782067b6be81879b0af0daf7bd3acbb32d986 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:54 -0400 Subject: ipv4: ipcm_cookie initializers Initialize the cookie in one location to reduce code duplication and avoid bugs from inconsistent initialization, such as that fixed in commit 9887cba19978 ("ip: limit use of gso_size to udp"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 15 +++++++++++++++ net/ipv4/icmp.c | 11 ++--------- net/ipv4/ip_output.c | 6 +----- net/ipv4/ping.c | 9 +-------- net/ipv4/raw.c | 9 +-------- net/ipv4/udp.c | 10 +--------- 6 files changed, 21 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 99d1b835d2aa..6db23bf1e5eb 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -79,6 +79,21 @@ struct ipcm_cookie { __u16 gso_size; }; +static inline void ipcm_init(struct ipcm_cookie *ipcm) +{ + *ipcm = (struct ipcm_cookie) { .tos = -1 }; +} + +static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, + const struct inet_sock *inet) +{ + ipcm_init(ipcm); + + ipcm->sockc.tsflags = inet->sk.sk_tsflags; + ipcm->oif = inet->sk.sk_bound_dev_if; + ipcm->addr = inet->inet_saddr; +} + #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 937239afd68d..695979b7ef6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -429,15 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct 
sk_buff *skb) icmp_param->data.icmph.checksum = 0; + ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -711,12 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; + ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 570e3ebc3974..81d0e4a77ec5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1548,12 +1548,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; + ipcm_init(&ipc); ipc.addr = daddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b47492205507..6f17fc8ebbdb 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -739,14 +739,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.oif = sk->sk_bound_dev_if; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 446af7be2b55..cf142909389c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ 
-562,14 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.sockc.transmit_time = 0; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5c76ba0666ec..87f3a0b77864 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -926,12 +926,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; @@ -978,9 +972,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 1; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); ipc.gso_size = up->gso_size; if (msg->msg_controllen) { -- cgit v1.2.3 From b515430ac9c25d5192cf498af3c6be6c4f51caad Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:55 -0400 Subject: ipv6: ipcm6_cookie initializer Initialize the cookie in one location to reduce code duplication and avoid bugs from inconsistent initialization, such as that fixed in commit 9887cba19978 ("ip: limit use of gso_size to udp"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 19 +++++++++++++++++++ net/ipv6/icmp.c | 7 ++----- net/ipv6/ping.c | 4 +--- net/ipv6/raw.c | 5 +---- net/ipv6/udp.c | 4 +--- net/l2tp/l2tp_ip6.c | 4 +--- 6 files changed, 25 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b7843e0b16ee..6cb247f54d4c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -301,6 +301,25 @@ struct ipcm6_cookie { __u16 gso_size; }; +static inline void ipcm6_init(struct ipcm6_cookie *ipc6) +{ + *ipc6 = (struct ipcm6_cookie) { + .hlimit = -1, + .tclass = -1, + .dontfrag = -1, + }; +} + +static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, + const struct ipv6_pinfo *np) +{ + *ipc6 = (struct ipcm6_cookie) { + .hlimit = -1, + .tclass = np->tclass, + .dontfrag = np->dontfrag, + }; +} + static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index be491bf6ab6e..d99fed67cd10 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -545,7 +545,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); @@ -553,8 +553,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -726,10 +724,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + 
sizeof(struct icmp6hdr), diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 96f56bf49a30..717e7c1fba29 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); @@ -142,8 +142,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.family = AF_INET6; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5737c50f16eb..5f40670271ee 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -791,10 +791,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; - ipc6.opt = NULL; + ipcm6_init(&ipc6); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ac6fc6728903..940115da9843 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1143,9 +1143,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sockcm_cookie sockc; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; sockc.tsflags = sk->sk_tsflags; sockc.transmit_time = 0; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 957369192ca1..38f80691f4ab 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -525,9 +525,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; 
fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) -- cgit v1.2.3 From 657a0667025e77cc17f8a38b93e60a2bc24d830c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:56 -0400 Subject: sock: sockc cookie initializer Initialize the cookie in one location to reduce code duplication and avoid bugs from inconsistent initialization, such as that fixed in commit 9887cba19978 ("ip: limit use of gso_size to udp"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 6 ++++++ net/ipv4/tcp.c | 2 +- net/packet/af_packet.c | 9 +++------ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index e0eac9ef44b5..83b747538bd0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1600,6 +1600,12 @@ struct sockcm_cookie { u16 tsflags; }; +static inline void sockcm_init(struct sockcm_cookie *sockc, + const struct sock *sk) +{ + *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; +} + int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf461fa77ed6..850dc8f15afc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1241,7 +1241,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3428f7739ae9..47931ebfaef3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1951,8 +1951,7 @@ retry: goto out_unlock; } - sockc.transmit_time = 0; - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if 
(msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -2636,8 +2635,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - sockc.transmit_time = 0; - sockc.tsflags = po->sk.sk_tsflags; + sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) @@ -2833,8 +2831,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; - sockc.transmit_time = 0; - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); -- cgit v1.2.3 From 5fdaa88dfefa87ee1ea92750e99950dca182ea41 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:57 -0400 Subject: ipv6: fold sockcm_cookie into ipcm6_cookie ipcm_cookie includes sockcm_cookie. Do the same for ipcm6_cookie. This reduces the number of arguments that need to be passed around, applies ipcm6_init to all cookie fields at once and reduces code differentiation between ipv4 and ipv6. Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 7 +++---- include/net/transp_v6.h | 3 +-- net/ipv6/datagram.c | 4 ++-- net/ipv6/icmp.c | 7 ++----- net/ipv6/ip6_flowlabel.c | 3 +-- net/ipv6/ip6_output.c | 24 ++++++++++-------------- net/ipv6/ipv6_sockglue.c | 3 +-- net/ipv6/ping.c | 3 +-- net/ipv6/raw.c | 10 ++++------ net/ipv6/udp.c | 10 ++++------ net/l2tp/l2tp_ip6.c | 6 ++---- 11 files changed, 31 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6cb247f54d4c..aa6fd11a887c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -294,6 +294,7 @@ struct ipv6_fl_socklist { }; struct ipcm6_cookie { + struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __s8 dontfrag; @@ -959,8 +960,7 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, - const struct sockcm_cookie *sockc); + struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); @@ -977,8 +977,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - struct inet_cork_full *cork, - const struct sockcm_cookie *sockc); + struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index f6a3543e5247..a8f6020f1196 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -42,8 +42,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, - struct flowi6 *fl6, struct ipcm6_cookie *ipc6, - struct sockcm_cookie *sockc); + struct flowi6 *fl6, struct ipcm6_cookie *ipc6); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, 
__u16 destp, int rqueue, int bucket); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2ee08b6a86a4..201306b9b5ea 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -736,7 +736,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, - struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) + struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -755,7 +755,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, sockc); + err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); if (err) return err; continue; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index d99fed67cd10..24611c8b0562 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -430,7 +430,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; @@ -573,7 +572,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused)) { + MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -677,7 +676,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); - struct sockcm_cookie sockc_unused = {0}; saddr = &ipv6_hdr(skb)->daddr; @@ -731,8 +729,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused)) { + (struct rt6_info *)dst, MSG_DONTWAIT)) { 
__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3eee7637bdfe..cb54a8a3c273 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; err = -ENOMEM; @@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; - err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f48af7e62f12..1a3bf6437cb9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1158,8 +1158,7 @@ static void ip6_append_data_mtu(unsigned int *mtu, static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, - struct rt6_info *rt, struct flowi6 *fl6, - const struct sockcm_cookie *sockc) + struct rt6_info *rt, struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; @@ -1227,7 +1226,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; - cork->base.transmit_time = sockc->transmit_time; + cork->base.transmit_time = ipc6->sockc.transmit_time; return 0; } @@ -1241,8 +1240,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6, - const struct sockcm_cookie *sockc) + unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; unsigned 
int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; @@ -1321,7 +1319,7 @@ emsgsize: csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, sockc->tsflags, &tx_flags); + sock_tx_timestamp(sk, ipc6->sockc.tsflags, &tx_flags); if (tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -1563,8 +1561,7 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, - const struct sockcm_cookie *sockc) + struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -1578,7 +1575,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ err = ip6_setup_cork(sk, &inet->cork, &np->cork, - ipc6, rt, fl6, sockc); + ipc6, rt, fl6); if (err) return err; @@ -1592,7 +1589,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6, sockc); + from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1752,8 +1749,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - struct inet_cork_full *cork, - const struct sockcm_cookie *sockc) + struct inet_cork_full *cork) { struct inet6_cork v6_cork; struct sk_buff_head queue; @@ -1770,7 +1766,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork->base.opt = NULL; cork->base.dst = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6, sockc); + err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); @@ -1781,7 +1777,7 @@ struct sk_buff *ip6_make_skb(struct 
sock *sk, err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, &current->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6, sockc); + flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4d780c7f0130..fabe3ba1bddc 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -489,7 +489,6 @@ sticky_done: struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); @@ -522,7 +521,7 @@ sticky_done: msg.msg_control = (void *)(opt+1); ipc6.opt = opt; - retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 717e7c1fba29..4c04bccc7417 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; - struct sockcm_cookie junk = {0}; struct ipcm6_cookie ipc6; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -146,7 +145,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, 0, &ipc6, &fl6, rt, - MSG_DONTWAIT, &junk); + MSG_DONTWAIT); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5f40670271ee..413d98bf24f4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -767,7 +767,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; - struct sockcm_cookie sockc; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; u16 proto; @@ -792,6
+791,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); + ipc6.sockc.tsflags = sk->sk_tsflags; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -845,15 +845,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; - sockc.tsflags = sk->sk_tsflags; - sockc.transmit_time = 0; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -921,13 +919,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: if (inet->hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, - msg->msg_flags, &sockc); + msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 940115da9843..f6b96956a8ed 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1141,12 +1141,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - struct sockcm_cookie sockc; ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; - sockc.tsflags = sk->sk_tsflags; - sockc.transmit_time = 0; + ipc6.sockc.tsflags = sk->sk_tsflags; /* destination address check */ if (sin6) { @@ -1281,7 +1279,7 @@ do_udp_sendmsg: err = udp_cmsg_send(sk, msg, &ipc6.gso_size); if (err > 0) err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, - &ipc6, &sockc); + &ipc6); if (err < 0) { 
fl6_sock_release(flowlabel); return err; @@ -1375,7 +1373,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &cork, &sockc); + msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6, &cork.base); @@ -1401,7 +1399,7 @@ do_append_data: up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 38f80691f4ab..672e5b753738 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -500,7 +500,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ @@ -573,8 +572,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, - &sockc_unused); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -639,7 +637,7 @@ back_from_confirm: err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc_unused); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) -- cgit v1.2.3 From 678ca42d688534adfc780b150abefaaac7c86687 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:58 -0400 Subject: ip: remove tx_flags from ipcm_cookie and use 
same logic for v4 and v6 skb_shinfo(skb)->tx_flags is derived from sk->sk_tsflags, possibly after modification by __sock_cmsg_send, by calling sock_tx_timestamp. The IPv4 and IPv6 paths do this conversion differently. In IPv4, the individual protocols that support tx timestamps call this function and store the result in ipc.tx_flags. In IPv6, sock_tx_timestamp is called in __ip6_append_data. There is no need to store both tx_flags and ts_flags in the cookie as one is derived from the other. Convert when setting up the cork and remove the redundant field. This is similar to IPv6, only have the conversion happen only once per datagram, in ip(6)_setup_cork. Also change __ip6_append_data to match __ip_append_data. Only update tskey if timestamping is enabled with OPT_ID. The SOCK_.. test is redundant: only valid protocols can have non-zero cork->tx_flags. After this change the IPv4 and IPv6 logic is the same. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 1 - net/ipv4/ip_output.c | 3 ++- net/ipv4/ping.c | 2 -- net/ipv4/raw.c | 2 -- net/ipv4/udp.c | 2 -- net/ipv6/ip6_output.c | 18 ++++++++---------- 6 files changed, 10 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 6db23bf1e5eb..e44b1a44f67a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -72,7 +72,6 @@ struct ipcm_cookie { __be32 addr; int oif; struct ip_options_rcu *opt; - __u8 tx_flags; __u8 ttl; __s16 tos; char priority; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 81d0e4a77ec5..e14c774cc092 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1153,8 +1153,9 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->priority = ipc->priority; - cork->tx_flags = ipc->tx_flags; cork->transmit_time = ipc->sockc.transmit_time; + cork->tx_flags = 0; + sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); return 0; } diff 
--git a/net/ipv4/ping.c b/net/ipv4/ping.c index 6f17fc8ebbdb..b54c964ad925 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -763,8 +763,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index cf142909389c..33df4d76db2d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -665,8 +665,6 @@ back_from_confirm: &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 87f3a0b77864..060e841dde40 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1020,8 +1020,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1a3bf6437cb9..ff4b28a600ab 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1221,6 +1221,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.fragsize = mtu; cork->base.gso_size = sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; + cork->base.tx_flags = 0; + sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; @@ -1250,7 +1252,6 @@ static int __ip6_append_data(struct sock *sk, int copy; int err; int offset = 0; - __u8 tx_flags = 0; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; @@ -1269,6 +1270,10 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? 
IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1318,13 +1323,6 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, ipc6->sockc.tsflags, &tx_flags); - if (tx_flags & SKBTX_ANY_SW_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; - } - /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1443,8 +1441,8 @@ alloc_new_skb: dst_exthdrlen); /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = tx_flags; - tx_flags = 0; + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; -- cgit v1.2.3 From fbf47813607ba8c4e5c5b81da3c47fc66ac314b1 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:59 -0400 Subject: ip: unconditionally set cork gso_size Now that ipc(6)->gso_size is correctly initialized in all callers of ip(6)_setup_cork, it is safe to unconditionally pass it to the cork. Link: http://lkml.kernel.org/r/20180619164752.143249-1-willemdebruijn.kernel@gmail.com Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 +-- net/ipv6/ip6_output.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e14c774cc092..e2b6bd478afb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1146,8 +1146,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->fragsize = ip_sk_use_pmtu(sk) ? 
dst_mtu(&rt->dst) : rt->dst.dev->mtu; - cork->gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0; + cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ff4b28a600ab..8047fd41ba88 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1219,8 +1219,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (mtu < IPV6_MIN_MTU) return -EINVAL; cork->base.fragsize = mtu; - cork->base.gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; + cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); -- cgit v1.2.3 From 8d356b89f36d234a56434a110ae779e8ac389ca2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 4 Jul 2018 16:46:29 -0700 Subject: rtnetlink: add rtnl_link_state check in rtnl_configure_link rtnl_configure_link sets dev->rtnl_link_state to RTNL_LINK_INITIALIZED and unconditionally calls __dev_notify_flags to notify user-space of dev flags. current call sequence for rtnl_configure_link rtnetlink_newlink rtnl_link_ops->newlink rtnl_configure_link (unconditionally notifies userspace of default and new dev flags) If a newlink handler wants to call rtnl_configure_link early, we will end up with duplicate notifications to user-space. This patch fixes rtnl_configure_link to check rtnl_link_state and call __dev_notify_flags with gchanges = 0 if already RTNL_LINK_INITIALIZED. Later in the series, this patch will help the following sequence where a driver implementing newlink can call rtnl_configure_link to initialize the link early. 
makes the following call sequence work: rtnetlink_newlink rtnl_link_ops->newlink (vxlan) -> rtnl_configure_link (initializes link and notifies user-space of default dev flags) rtnl_configure_link (updates dev flags if requested by user ifm and notifies user-space of new dev flags) Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ef61222fdef..e3f743c141b3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2759,9 +2759,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) return err; } - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - - __dev_notify_flags(dev, old_flags, ~0U); + if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { + __dev_notify_flags(dev, old_flags, 0U); + } else { + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + __dev_notify_flags(dev, old_flags, ~0U); + } return 0; } EXPORT_SYMBOL(rtnl_configure_link); -- cgit v1.2.3 From 2064c3d4c02026572d4975177f28a58052f0a8b7 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:12 +0000 Subject: net/flow_dissector: Save vlan ethertype from headers Change vlan dissector key to save vlan tpid to support both 802.1Q and 802.1AD ethertype. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 2 +- net/core/flow_dissector.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index adc24df56b90..8f899688a965 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -47,7 +47,7 @@ struct flow_dissector_key_tags { struct flow_dissector_key_vlan { u16 vlan_id:12, vlan_priority:3; - u16 padding; + __be16 vlan_tpid; }; struct flow_dissector_key_mpls { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 53f96e4f7bf5..18cb99b50cba 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -751,6 +751,7 @@ proto_again: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; bool vlan_tag_present = skb && skb_vlan_tag_present(skb); + __be16 saved_vlan_tpid = proto; if (vlan_tag_present) proto = skb->protocol; @@ -789,6 +790,7 @@ proto_again: (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } + key_vlan->vlan_tpid = saved_vlan_tpid; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; -- cgit v1.2.3 From aaab08344d2670e5c119b7b497d5063d7ddb8364 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:13 +0000 Subject: net/sched: flower: Add support for matching on vlan ethertype As flow dissector stores vlan ethertype, tc flower now can match on that. It is to make preparation for supporting QinQ. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 352876bb901b..da9ec30763fe 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -500,6 +500,7 @@ static int fl_set_key_mpls(struct nlattr **tb, } static void fl_set_key_vlan(struct nlattr **tb, + __be16 ethertype, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { @@ -516,6 +517,8 @@ static void fl_set_key_vlan(struct nlattr **tb, VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } + key_val->vlan_tpid = ethertype; + key_mask->vlan_tpid = cpu_to_be16(~0); } static void fl_set_key_flag(u32 flower_key, u32 flower_mask, @@ -592,8 +595,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); - if (ethertype == htons(ETH_P_8021Q)) { - fl_set_key_vlan(tb, &key->vlan, &mask->vlan); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, &key->vlan, &mask->vlan); fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_VLAN_ETH_TYPE, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, -- cgit v1.2.3 From 24c590e3b0f9eebe603ebe3d516990306d385f46 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:14 +0000 Subject: net/flow_dissector: Add support for QinQ dissection Dissect the QinQ packets to get both outer and inner vlan information, then store to the extended flow keys. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 2 ++ net/core/flow_dissector.c | 32 +++++++++++++++++--------------- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8f899688a965..c64406717eee 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -206,6 +206,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ + FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_MAX, }; @@ -237,6 +238,7 @@ struct flow_keys { struct flow_dissector_key_basic basic; struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_addrs addrs; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 18cb99b50cba..b555fc229e96 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -589,7 +589,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; - bool skip_vlan = false; + enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -748,15 +748,14 @@ proto_again: } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { - const struct vlan_hdr *vlan; + const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; - bool vlan_tag_present = skb && skb_vlan_tag_present(skb); __be16 saved_vlan_tpid = proto; - if (vlan_tag_present) + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && + skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; - - if (!vlan_tag_present || eth_type_vlan(skb->protocol)) { + } else { vlan = 
__skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { @@ -766,20 +765,23 @@ proto_again: proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); - if (skip_vlan) { - fdret = FLOW_DISSECT_RET_PROTO_AGAIN; - break; - } } - skip_vlan = true; - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN)) { + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { + dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; + } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { + dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; + } else { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN, + dissector_vlan, target_container); - if (vlan_tag_present) { + if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); -- cgit v1.2.3 From d30695126f0ac5bca85d09c7946ad9a1deab5d25 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:15 +0000 Subject: net/sched: flower: Dump the ethertype encapsulated in vlan Currently the encapsulated ethertype is not dumped as it's the same as TCA_FLOWER_KEY_ETH_TYPE keyvalue. But the dumping result is inconsistent with input, we add dumping it with TCA_FLOWER_KEY_VLAN_ETH_TYPE. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index da9ec30763fe..e93b13d2cb81 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1313,6 +1313,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) goto nla_put_failure; + if (mask->vlan.vlan_tpid && + nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, key->basic.n_proto)) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, -- cgit v1.2.3 From d64efd0926ba4f32e657e615a4f4a6170d5cc0fa Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:16 +0000 Subject: net/sched: flower: Add support for matching on QinQ vlan headers Now that dissection of QinQ inner and outer vlan headers is supported, users can add rules to match on QinQ vlan headers. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 4 +++ net/sched/cls_flower.c | 65 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 55 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 84e4c1d0f874..c4262d911596 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -469,6 +469,10 @@ enum { TCA_FLOWER_KEY_IP_TTL, /* u8 */ TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e93b13d2cb81..487a152a852c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -35,6 +35,7 @@ struct fl_flow_key { struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -449,6 +450,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -501,19 +505,20 @@ static int fl_set_key_mpls(struct nlattr **tb, static void fl_set_key_vlan(struct nlattr **tb, __be16 ethertype, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { #define VLAN_PRIORITY_MASK 0x7 - if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + if (tb[vlan_id_key]) { key_val->vlan_id = - nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK; key_mask->vlan_id 
= VLAN_VID_MASK; } - if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + if (tb[vlan_prio_key]) { key_val->vlan_priority = - nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + nla_get_u8(tb[vlan_prio_key]) & VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } @@ -596,11 +601,25 @@ static int fl_set_key(struct net *net, struct nlattr **tb, ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); if (eth_type_vlan(ethertype)) { - fl_set_key_vlan(tb, ethertype, &key->vlan, &mask->vlan); - fl_set_key_val(tb, &key->basic.n_proto, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, + &mask->vlan); + + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, + TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + &mask->basic.n_proto, + TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } } else { key->basic.n_proto = ethertype; mask->basic.n_proto = cpu_to_be16(~0); @@ -825,6 +844,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask) FLOW_DISSECTOR_KEY_MPLS, mpls); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_CVLAN, cvlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1201,6 +1222,7 @@ static int fl_dump_key_ip(struct sk_buff *skb, } static int fl_dump_key_vlan(struct sk_buff *skb, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) { @@ -1209,13 +1231,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb, 
if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) return 0; if (vlan_mask->vlan_id) { - err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + err = nla_put_u16(skb, vlan_id_key, vlan_key->vlan_id); if (err) return err; } if (vlan_mask->vlan_priority) { - err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + err = nla_put_u8(skb, vlan_prio_key, vlan_key->vlan_priority); if (err) return err; @@ -1310,13 +1332,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) goto nla_put_failure; - if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan)) goto nla_put_failure; - if (mask->vlan.vlan_tpid && - nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, key->basic.n_proto)) + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan) || + (mask->cvlan.vlan_tpid && + nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->cvlan.vlan_tpid))) goto nla_put_failure; + if (mask->cvlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } else if (mask->vlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, -- cgit v1.2.3 From a4dc70d46cf1a05b244a06d3d8c0c09908ea13b9 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 6 Jul 2018 15:22:36 +0200 Subject: tipc: extend link reset criteria for stale packet retransmission Currently a link is declared stale and reset if there has been 100 repeated attempts to retransmit the same packet. 
However, in certain infrastructures we see that packet (NACK) duplicates and delays may cause such retransmit attempts to occur at a high rate, so that the peer doesn't have a reasonable chance to acknowledge the reception before the 100-limit is hit. This may take much less than the stipulated link tolerance time, and despite that probe/probe replies otherwise go through as normal. We now extend the criteria for link reset to also being time based. I.e., we don't reset the link until the link tolerance time is passed AND we have made 100 retransmissions attempts. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 63860329dbaa..ec4d28328652 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -106,7 +106,8 @@ struct tipc_stats { * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message - * @stale_count: # of identical retransmit requests made by peer + * @stale_cnt: counter for number of identical retransmit attempts + * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. 
* @rcv_nxt: next sequence number to expect for inbound messages @@ -161,7 +162,8 @@ struct tipc_link { u16 snd_nxt; u16 last_retransm; u16 window; - u32 stale_count; + u16 stale_cnt; + unsigned long stale_limit; /* Reception */ u16 rcv_nxt; @@ -860,7 +862,7 @@ void tipc_link_reset(struct tipc_link *l) l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stale_count = 0; + l->stale_cnt = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); @@ -997,39 +999,41 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); } -int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, +/* tipc_link_retrans() - retransmit one or more packets + * @l: the link to transmit on + * @r: the receiving link ordering the retransmit. Same as l if unicast + * @from: retransmit from (inclusive) this sequence number + * @to: retransmit to (inclusive) this sequence number + * xmitq: queue for accumulating the retransmitted packets + */ +int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); - struct tipc_msg *hdr; - u16 ack = l->rcv_nxt - 1; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 ack = l->rcv_nxt - 1; + struct tipc_msg *hdr; if (!skb) return 0; /* Detect repeated retransmit failures on same packet */ - if (nacker->last_retransm != buf_seqno(skb)) { - nacker->last_retransm = buf_seqno(skb); - nacker->stale_count = 1; - } else if (++nacker->stale_count > 100) { + if (r->last_retransm != buf_seqno(skb)) { + r->last_retransm = buf_seqno(skb); + r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance); + } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); - nacker->stale_count = 0; if (link_is_bc_sndlink(l)) return TIPC_LINK_DOWN_EVT; return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } - 
/* Move forward to where retransmission should start */ skb_queue_walk(&l->transmq, skb) { - if (!less(buf_seqno(skb), from)) - break; - } - - skb_queue_walk_from(&l->transmq, skb) { - if (more(buf_seqno(skb), to)) - break; hdr = buf_msg(skb); + if (less(msg_seqno(hdr), from)) + continue; + if (more(msg_seqno(hdr), to)) + break; _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1272,6 +1276,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { + l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); -- cgit v1.2.3 From 94f01eed49b569a14f02fcfecfd6401a95008049 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 7 Jul 2018 21:36:24 +0200 Subject: batman-adv: Unify include guards style All other include guards in batman-adv use the style: * _NET_BATMAN_ADV_$(FILENAME)_ * uppercase only * "." & "-" replaced with "_" Use this also in the B.A.T.M.A.N. IV/V OGM implementation headers. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.h | 6 +++--- net/batman-adv/bat_v_ogm.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 317cafd302cf..3dc6a7a43eb7 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -16,11 +16,11 @@ * along with this program; if not, see . 
*/ -#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_ -#define _BATMAN_ADV_BATADV_IV_OGM_H_ +#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_ +#define _NET_BATMAN_ADV_BAT_IV_OGM_H_ #include "main.h" int batadv_iv_init(void); -#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */ +#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */ diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index ed36c5e79fde..e5be14c908c6 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -16,8 +16,8 @@ * along with this program; if not, see . */ -#ifndef _BATMAN_ADV_BATADV_V_OGM_H_ -#define _BATMAN_ADV_BATADV_V_OGM_H_ +#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_ +#define _NET_BATMAN_ADV_BAT_V_OGM_H_ #include "main.h" @@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */ +#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */ -- cgit v1.2.3 From 3b1709de64bcc54284812fa43808da07089008ca Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 7 Jul 2018 21:46:11 +0200 Subject: batman-adv: Join batadv_purge_orig_ref and _batadv_purge_orig The single line function batadv_purge_orig_ref has no function beside providing the name used by other source files. This can also be done simpler by just renaming _batadv_purge_orig to batadv_purge_orig_ref. 
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 716e5b43acfa..1d295da3e342 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1339,7 +1339,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, return false; } -static void _batadv_purge_orig(struct batadv_priv *bat_priv) +/** + * batadv_purge_orig_ref() - Purge all outdated originators + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; @@ -1385,21 +1389,12 @@ static void batadv_purge_orig(struct work_struct *work) delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); - _batadv_purge_orig(bat_priv); + batadv_purge_orig_ref(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } -/** - * batadv_purge_orig_ref() - Purge all outdated originators - * @bat_priv: the bat priv with all the soft interface information - */ -void batadv_purge_orig_ref(struct batadv_priv *bat_priv) -{ - _batadv_purge_orig(bat_priv); -} - #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** -- cgit v1.2.3 From b233504033dbd65740e59681820ccfd0a2a8ec53 Mon Sep 17 00:00:00 2001 From: Yifeng Sun Date: Mon, 2 Jul 2018 08:18:03 -0700 Subject: openvswitch: kernel datapath clone action Add 'clone' action to kernel datapath by using existing functions. When actions within clone don't modify the current flow, the flow key is not cloned before executing clone actions. This is a follow up patch for this incomplete work: https://patchwork.ozlabs.org/patch/722096/ v1 -> v2: Refactor as advised by reviewer. 
Signed-off-by: Yifeng Sun Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/openvswitch.h | 5 +++ include/uapi/linux/openvswitch.h | 3 ++ net/openvswitch/actions.c | 33 ++++++++++++++++++ net/openvswitch/flow_netlink.c | 73 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+) (limited to 'net') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index e6b240b6196c..379affc63e24 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -21,4 +21,9 @@ #include +#define OVS_CLONE_ATTR_EXEC 0 /* Specify an u32 value. When nonzero, + * actions in clone will not change flow + * keys. False otherwise. + */ + #endif /* _LINUX_OPENVSWITCH_H */ diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 863aabaa5cc9..dbe0cbe4f1b7 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -840,6 +840,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the * packet, or modify the packet (e.g., change the DSCP field). + * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of + * actions without affecting the original packet and key. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -873,6 +875,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ OVS_ACTION_ATTR_POP_NSH, /* No argument. */ OVS_ACTION_ATTR_METER, /* u32 meter ID. */ + OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. 
*/ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 30a5df27116e..85ae53d8fd09 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb, clone_flow_key); } +/* When 'last' is true, clone() should always consume the 'skb'. + * Otherwise, clone() should keep 'skb' intact regardless what + * actions are executed within clone(). + */ +static int clone(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + bool last) +{ + struct nlattr *actions; + struct nlattr *clone_arg; + int rem = nla_len(attr); + bool dont_clone_flow_key; + + /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + clone_arg = nla_data(attr); + dont_clone_flow_key = nla_get_u32(clone_arg); + actions = nla_next(clone_arg, &rem); + + return clone_execute(dp, skb, key, 0, actions, rem, last, + !dont_clone_flow_key); +} + static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { @@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, consume_skb(skb); return 0; } + break; + + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = clone(dp, skb, key, a, last); + if (last) + return err; + + break; + } } if (unlikely(err)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 391c4073a6dc..a70097ecf33c 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return 0; } +static int validate_and_copy_clone(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) +{ + int start, err; + u32 exec; + + if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) + return -EINVAL; + + start = 
add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log); + if (start < 0) + return start; + + exec = last || !actions_may_change_flow(attr); + + err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec, + sizeof(exec), log); + if (err) + return err; + + err = __ovs_nla_copy_actions(net, attr, key, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + + add_nested_action_end(*sfa, start); + + return 0; +} + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, bool reset_key, @@ -2849,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, [OVS_ACTION_ATTR_POP_NSH] = 0, [OVS_ACTION_ATTR_METER] = sizeof(u32), + [OVS_ACTION_ATTR_CLONE] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3038,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Non-existent meters are simply ignored. */ break; + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_clone(net, a, key, sfa, + eth_type, vlan_tci, + log, last); + if (err) + return err; + skip_copy = true; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3116,6 +3163,26 @@ out: return err; } +static int clone_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE); + if (!start) + return -EMSGSIZE; + + err = ovs_nla_put_actions(nla_data(attr), rem, skb); + + if (err) + nla_nest_cancel(skb, start); + else + nla_nest_end(skb, start); + + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3204,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_CLONE: + err = 
clone_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; -- cgit v1.2.3 From eec94fdb04806790c7b7e6ea347820064cc6d467 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:23 +0300 Subject: net: sched: use rcu for action cookie update Implement functions to atomically update and free action cookie using rcu mechanism. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/pkt_cls.h | 1 + net/sched/act_api.c | 44 ++++++++++++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 5ff11adbe2a6..ffc3ef321776 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -37,7 +37,7 @@ struct tc_action { spinlock_t tcfa_lock; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct tc_cookie *act_cookie; + struct tc_cookie __rcu *act_cookie; struct tcf_chain *goto_chain; }; #define tcf_index common.tcfa_index diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6641584b27f1..2081e4219f81 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -781,6 +781,7 @@ struct tc_mqprio_qopt_offload { struct tc_cookie { u8 *data; u32 len; + struct rcu_head rcu; }; struct tc_qopt_offload_stats { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3f4cf930f809..02670c7489e3 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } +static void tcf_free_cookie_rcu(struct rcu_head *p) +{ + struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); + + kfree(cookie->data); + kfree(cookie); +} + +static void tcf_set_action_cookie(struct 
tc_cookie __rcu **old_cookie, + struct tc_cookie *new_cookie) +{ + struct tc_cookie *old; + + old = xchg(old_cookie, new_cookie); + if (old) + call_rcu(&old->rcu, tcf_free_cookie_rcu); +} + /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected @@ -65,10 +83,7 @@ static void free_tcf(struct tc_action *p) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); - if (p->act_cookie) { - kfree(p->act_cookie->data); - kfree(p->act_cookie); - } + tcf_set_action_cookie(&p->act_cookie, NULL); if (p->goto_chain) tcf_action_goto_chain_fini(p); @@ -567,16 +582,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; - if (a->act_cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, - a->act_cookie->data)) + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); goto nla_put_failure; + } } + rcu_read_unlock(); nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -719,13 +740,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err < 0) goto err_mod; - if (name == NULL && tb[TCA_ACT_COOKIE]) { - if (a->act_cookie) { - kfree(a->act_cookie->data); - kfree(a->act_cookie); - } - a->act_cookie = cookie; - } + if (!name && tb[TCA_ACT_COOKIE]) + tcf_set_action_cookie(&a->act_cookie, cookie); /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then -- cgit v1.2.3 From 036bb44327f50273e85ee4a2c9b56eebce1c0838 Mon Sep 17 
00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:24 +0300 Subject: net: sched: change type of reference and bind counters Change type of action reference counter to refcount_t. Change type of action bind counter to atomic_t. This type is used to allow decrementing bind counter without testing for 0 result. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 32 ++++++++++++++++++++++---------- net/sched/act_bpf.c | 4 ++-- net/sched/act_connmark.c | 4 ++-- net/sched/act_csum.c | 4 ++-- net/sched/act_gact.c | 4 ++-- net/sched/act_ife.c | 4 ++-- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 4 ++-- net/sched/act_nat.c | 4 ++-- net/sched/act_pedit.c | 4 ++-- net/sched/act_police.c | 4 ++-- net/sched/act_sample.c | 4 ++-- net/sched/act_simple.c | 4 ++-- net/sched/act_skbedit.c | 4 ++-- net/sched/act_skbmod.c | 4 ++-- net/sched/act_tunnel_key.c | 4 ++-- net/sched/act_vlan.c | 4 ++-- 18 files changed, 57 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index ffc3ef321776..2759226527a2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -6,6 +6,7 @@ * Public action API for classifiers/qdiscs */ +#include #include #include #include @@ -26,8 +27,8 @@ struct tc_action { struct tcf_idrinfo *idrinfo; u32 tcfa_index; - int tcfa_refcnt; - int tcfa_bindcnt; + refcount_t tcfa_refcnt; + atomic_t tcfa_bindcnt; u32 tcfa_capab; int tcfa_action; struct tcf_t tcfa_tm; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 02670c7489e3..4f064ecab882 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -105,14 +105,26 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) ASSERT_RTNL(); + /* Release with strict==1 and bind==0 is only called through act API + * interface (classifiers always bind). 
Only case when action with + * positive reference count and zero bind count can exist is when it was + * also created with act API (unbinding last classifier will destroy the + * action if it was created by classifier). So only case when bind count + * can be changed after initial check is when unbound action is + * destroyed by act API while classifier binds to action with same id + * concurrently. This results in either creation of a new action (same behavior + * as before), or reusing existing action if concurrent process + * increments reference count before action is deleted. Both scenarios + * are acceptable. + */ if (p) { if (bind) - p->tcfa_bindcnt--; - else if (strict && p->tcfa_bindcnt > 0) + atomic_dec(&p->tcfa_bindcnt); + else if (strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - p->tcfa_refcnt--; - if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { + if (atomic_read(&p->tcfa_bindcnt) <= 0 && + refcount_dec_and_test(&p->tcfa_refcnt)) { if (p->ops->cleanup) p->ops->cleanup(p); tcf_idr_remove(p->idrinfo, p); @@ -304,8 +316,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, if (index && p) { if (bind) - p->tcfa_bindcnt++; - p->tcfa_refcnt++; + atomic_inc(&p->tcfa_bindcnt); + refcount_inc(&p->tcfa_refcnt); *a = p; return true; } @@ -324,9 +336,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, if (unlikely(!p)) return -ENOMEM; - p->tcfa_refcnt = 1; + refcount_set(&p->tcfa_refcnt, 1); if (bind) - p->tcfa_bindcnt = 1; + atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -782,7 +794,7 @@ static void cleanup_a(struct list_head *actions, int ovr) return; list_for_each_entry(a, actions, list) - a->tcfa_refcnt--; + refcount_dec(&a->tcfa_refcnt); } int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, @@ -810,7 +822,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, 
act->order = i; sz += tcf_action_fill_size(act); if (ovr) - act->tcfa_refcnt++; + refcount_inc(&act->tcfa_refcnt); list_add_tail(&act->list, actions); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 18089c02e557..15a2a53cbde1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, - .refcnt = prog->tcf_refcnt - ref, - .bindcnt = prog->tcf_bindcnt - bind, + .refcnt = refcount_read(&prog->tcf_refcnt) - ref, + .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, .action = prog->tcf_action, }; struct tcf_t tm; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e4b880fa51fe..188865034f9a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -154,8 +154,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, struct tc_connmark opt = { .index = ci->tcf_index, - .refcnt = ci->tcf_refcnt - ref, - .bindcnt = ci->tcf_bindcnt - bind, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, .action = ci->tcf_action, .zone = ci->zone, }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 526a8e491626..da865f7b390a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -597,8 +597,8 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4dc4f153cad8..ca83debd5a70 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -169,8 +169,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, struct 
tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, - .refcnt = gact->tcf_refcnt - ref, - .bindcnt = gact->tcf_bindcnt - bind, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, .action = gact->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 20d7d36b2fc9..3536a23f46b5 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -596,8 +596,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, - .refcnt = ife->tcf_refcnt - ref, - .bindcnt = ife->tcf_bindcnt - bind, + .refcnt = refcount_read(&ife->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, .action = ife->tcf_action, .flags = p->flags, }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 14c312d7908f..7bce88dc11c9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -280,8 +280,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (unlikely(!t)) goto nla_put_failure; - c.bindcnt = ipt->tcf_bindcnt - bind; - c.refcnt = ipt->tcf_refcnt - ref; + c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; + c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fd34015331ab..82a8bdd67c47 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -250,8 +250,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, - .refcnt = m->tcf_refcnt - ref, - .bindcnt = m->tcf_bindcnt - bind, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, .eaction = m->tcfm_eaction, 
.ifindex = dev ? dev->ifindex : 0, }; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4b5848b6c252..457c2ae3de46 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -257,8 +257,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .index = p->tcf_index, .action = p->tcf_action, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e43aef28fdac..889690e0ec39 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,8 +409,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; opt->action = p->tcf_action; - opt->refcnt = p->tcf_refcnt - ref; - opt->bindcnt = p->tcf_bindcnt - bind; + opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; + opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (p->tcfp_keys_ex) { tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 4e72bc2a0dfb..a789b8060968 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -274,8 +274,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, .action = police->tcf_action, .mtu = police->tcfp_mtu, .burst = PSCHED_NS2TICKS(police->tcfp_burst), - .refcnt = police->tcf_refcnt - ref, - .bindcnt = police->tcf_bindcnt - bind, + .refcnt = refcount_read(&police->tcf_refcnt) - ref, + .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 5db358497c9e..4a46978db092 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -173,8 +173,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tc_sample opt = { .index = s->tcf_index, .action = s->tcf_action, - .refcnt = 
s->tcf_refcnt - ref, - .bindcnt = s->tcf_bindcnt - bind, + .refcnt = refcount_read(&s->tcf_refcnt) - ref, + .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 98c4afe7c15b..c3a761097b01 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -145,8 +145,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index dfaf5d8028dd..cfd20d3d2ca9 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -208,8 +208,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbedit *d = to_skbedit(a); struct tc_skbedit opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ad050d7d4b46..ff90d720eda3 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -205,8 +205,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); struct tc_skbmod opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ea203e386a92..2354f07eba15 100644 --- a/net/sched/act_tunnel_key.c +++ 
b/net/sched/act_tunnel_key.c @@ -474,8 +474,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, - .refcnt = t->tcf_refcnt - ref, - .bindcnt = t->tcf_bindcnt - bind, + .refcnt = refcount_read(&t->tcf_refcnt) - ref, + .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, }; struct tcf_t tm; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1fb39e1f9d07..799e3deb44ac 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -239,8 +239,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, - .refcnt = v->tcf_refcnt - ref, - .bindcnt = v->tcf_bindcnt - bind, + .refcnt = refcount_read(&v->tcf_refcnt) - ref, + .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, .action = v->tcf_action, .v_action = p->tcfv_action, }; -- cgit v1.2.3 From 789871bb2a0381425b106d2a995bde1460d35a34 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:25 +0300 Subject: net: sched: implement unlocked action init API Add additional 'rtnl_held' argument to act API init functions. It is required to implement actions that need to release rtnl lock before loading kernel module and reacquire it afterwards. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 6 ++++-- net/sched/act_api.c | 18 +++++++++++------- net/sched/act_bpf.c | 3 ++- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_mirred.c | 5 +++-- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 3 ++- net/sched/act_police.c | 2 +- net/sched/act_sample.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_skbmod.c | 3 ++- net/sched/act_tunnel_key.c | 3 ++- net/sched/act_vlan.c | 3 ++- net/sched/cls_api.c | 5 +++-- 19 files changed, 50 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 2759226527a2..27823f4e24c4 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -92,7 +92,8 @@ struct tc_action_ops { struct netlink_ext_ack *extack); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, - int bind, struct netlink_ext_ack *extack); + int bind, bool rtnl_held, + struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, @@ -168,10 +169,11 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack); + bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 
4f064ecab882..256b0c93916c 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -671,6 +671,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action *a; @@ -721,9 +722,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("act_%s", act_name); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); a_o = tc_lookup_action_n(act_name); @@ -746,9 +749,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - extack); + rtnl_held, extack); else - err = a_o->init(net, nla, est, &a, ovr, bind, extack); + err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, + extack); if (err < 0) goto err_mod; @@ -800,7 +804,7 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -814,7 +818,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - extack); + rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1173,7 +1177,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, LIST_HEAD(actions); ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 
0, &actions, - &attr_size, extack); + &attr_size, true, extack); if (ret) return ret; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 15a2a53cbde1..8ebf40a3506c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, struct netlink_ext_ack *extack) + int replace, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 188865034f9a..e3787aa0025a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index da865f7b390a..334261943f9f 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ca83debd5a70..b4dfb2b4addc 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr 
*nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3536a23f46b5..576ffbba61c3 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -448,7 +448,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 7bce88dc11c9..9c21663a86a6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -196,7 +196,8 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -204,7 +205,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool unlocked, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 82a8bdd67c47..5434f08f2eb7 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -68,8 +68,9 @@ static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, 
- struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 457c2ae3de46..e6487ad1e4a8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 889690e0ec39..f7965f35585b 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,7 +132,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a789b8060968..0e1c2fb0ebea 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { int ret = 0, err; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 4a46978db092..316fc645595d 100644 --- a/net/sched/act_sample.c +++ 
b/net/sched/act_sample.c @@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index c3a761097b01..dc591cc87f4a 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cfd20d3d2ca9..c4ae4bd830aa 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -94,7 +94,8 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ff90d720eda3..026d6f58eda1 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int 
ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2354f07eba15..15ea5ce0f9ed 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,7 +201,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 799e3deb44ac..c61775250722 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index bbf8dda96b0e..ebc2b9dd783f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1632,7 +1632,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, extack); + TCA_ACT_BIND, true, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1645,7 +1645,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, 
&attr_size, extack); + &actions, &attr_size, true, + extack); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3 From 3f7c72bc4227b169ba2c924a7987324e24bbc4b2 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:26 +0300 Subject: net: sched: always take reference to action Without rtnl lock protection it is no longer safe to use pointer to tc action without holding reference to it. (it can be destroyed concurrently) Remove unsafe action idr lookup function. Instead of it, implement safe tcf idr check function that atomically looks up action in idr and increments its reference and bind counters. Implement both action search and check using new safe function. Reference taken by idr check is temporary and should not be accounted by userspace clients (both logically and to preserve current API behavior). Subtract temporary reference when dumping action to userspace using existing tca_get_fill function arguments. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 256b0c93916c..aa304d36fee0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -284,44 +284,38 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo) +static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, + struct tc_action **a, int bind) { - struct tc_action *p = NULL; + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + } spin_unlock(&idrinfo->lock); - return p; + if (p) { + *a = p; + return true; + } + return false; } int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { - struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); - - if (p) { - *a = p; - return 1; - } - return 0; + return __tcf_idr_check(tn, index, a, 0); } EXPORT_SYMBOL(tcf_idr_search); bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { - struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); - - if (index && p) { - if (bind) - atomic_inc(&p->tcfa_bindcnt); - refcount_inc(&p->tcfa_refcnt); - *a = p; - return true; - } - return false; + return __tcf_idr_check(tn, index, a, bind); } EXPORT_SYMBOL(tcf_idr_check); @@ -932,7 +926,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 0) <= 0) { + 0, 1) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); 
return -EINVAL; @@ -1072,7 +1066,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 1) <= 0) { + 0, 2) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; @@ -1131,14 +1125,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ + cleanup_a(&actions, 1); /* lookup took reference */ ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); if (ret) goto err; return ret; } err: - if (event != RTM_GETACTION) - tcf_action_destroy(&actions, 0); + tcf_action_destroy(&actions, 0); return ret; } -- cgit v1.2.3 From 2a2ea349704fffade9526d5122299edbbfd122ca Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:27 +0300 Subject: net: sched: implement action API that deletes action by index Implement new action API function that atomically finds and deletes action from idr by index. Intended to be used by lockless actions that do not rely on rtnl lock. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 1 + net/sched/act_api.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 27823f4e24c4..a8eaae67c264 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -153,6 +153,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, int bind, bool cpustats); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); static inline int tcf_idr_release(struct tc_action *a, bool bind) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aa304d36fee0..0f31f09946ab 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -319,6 +319,45 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, } EXPORT_SYMBOL(tcf_idr_check); +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret = 0; + + spin_lock(&idrinfo->lock); + p = idr_find(&idrinfo->action_idr, index); + if (!p) { + spin_unlock(&idrinfo->lock); + return -ENOENT; + } + + if (!atomic_read(&p->tcfa_bindcnt)) { + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + struct module *owner = p->ops->owner; + + WARN_ON(p != idr_remove(&idrinfo->action_idr, + p->tcfa_index)); + spin_unlock(&idrinfo->lock); + + if (p->ops->cleanup) + p->ops->cleanup(p); + + gen_kill_estimator(&p->tcfa_rate_est); + free_tcf(p); + module_put(owner); + return 0; + } + ret = 0; + } else { + ret = -EPERM; + } + + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_delete_index); + int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) -- cgit v1.2.3 From b409074e6693bcdaa7abbee2a035f22a9eabda53 Mon Sep 17 
00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:28 +0300 Subject: net: sched: add 'delete' function to action ops Extend action ops with 'delete' function. Each action type implements its own delete function that doesn't depend on rtnl lock. Implement delete function that is required to delete actions without holding rtnl lock. Use action API function that atomically deletes action only if it is still in action idr. This implementation prevents concurrent threads from deleting the same action twice. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_bpf.c | 8 ++++++++ net/sched/act_connmark.c | 8 ++++++++ net/sched/act_csum.c | 8 ++++++++ net/sched/act_gact.c | 8 ++++++++ net/sched/act_ife.c | 8 ++++++++ net/sched/act_ipt.c | 16 ++++++++++++++++ net/sched/act_mirred.c | 8 ++++++++ net/sched/act_nat.c | 8 ++++++++ net/sched/act_pedit.c | 8 ++++++++ net/sched/act_police.c | 8 ++++++++ net/sched/act_sample.c | 8 ++++++++ net/sched/act_simple.c | 8 ++++++++ net/sched/act_skbedit.c | 8 ++++++++ net/sched/act_skbmod.c | 8 ++++++++ net/sched/act_tunnel_key.c | 8 ++++++++ net/sched/act_vlan.c | 8 ++++++++ 17 files changed, 137 insertions(+) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index a8eaae67c264..b9ed2b8256a5 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void (*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + int (*delete)(struct net *net, u32 index); }; struct tc_action_net { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 8ebf40a3506c..7941dd66ff83 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -388,6 +388,13 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, return 
tcf_idr_search(tn, a, index); } +static int tcf_bpf_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -398,6 +405,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .delete = tcf_bpf_delete, .size = sizeof(struct tcf_bpf), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e3787aa0025a..143c2d3de723 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -193,6 +193,13 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_connmark_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -202,6 +209,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .delete = tcf_connmark_delete, .size = sizeof(struct tcf_connmark_info), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 334261943f9f..3768539340e0 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -654,6 +654,13 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) return nla_total_size(sizeof(struct tc_csum)); } +static int tcf_csum_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -665,6 +672,7 @@ static struct tc_action_ops act_csum_ops = { .walk = tcf_csum_walker, .lookup = tcf_csum_search, .get_fill_size = 
tcf_csum_get_fill_size, + .delete = tcf_csum_delete, .size = sizeof(struct tcf_csum), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b4dfb2b4addc..a431a711f0dd 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -231,6 +231,13 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) return sz; } +static int tcf_gact_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -242,6 +249,7 @@ static struct tc_action_ops act_gact_ops = { .walk = tcf_gact_walker, .lookup = tcf_gact_search, .get_fill_size = tcf_gact_get_fill_size, + .delete = tcf_gact_delete, .size = sizeof(struct tcf_gact), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 576ffbba61c3..89a761395c94 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -844,6 +844,13 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ife_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ife_ops = { .kind = "ife", .type = TCA_ACT_IFE, @@ -854,6 +861,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .delete = tcf_ife_delete, .size = sizeof(struct tcf_ife_info), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 9c21663a86a6..6c234411c771 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -324,6 +324,13 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ipt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_idr_delete_index(tn, index); +} + 
static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -334,6 +341,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .delete = tcf_ipt_delete, .size = sizeof(struct tcf_ipt), }; @@ -374,6 +382,13 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_xt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -384,6 +399,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .delete = tcf_xt_delete, .size = sizeof(struct tcf_ipt), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5434f08f2eb7..3d8300bce7e4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -322,6 +322,13 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) return rtnl_dereference(m->tcfm_dev); } +static int tcf_mirred_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, @@ -335,6 +342,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .delete = tcf_mirred_delete, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index e6487ad1e4a8..9eb27c89dc46 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -294,6 +294,13 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_nat_delete(struct net *net, u32 index) +{ + 
struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -303,6 +310,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .delete = tcf_nat_delete, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index f7965f35585b..45871052840f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -454,6 +454,13 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_pedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -464,6 +471,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .delete = tcf_pedit_delete, .size = sizeof(struct tcf_pedit), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0e1c2fb0ebea..c955fb0d4f3f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -314,6 +314,13 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_police_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_idr_delete_index(tn, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -327,6 +334,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .delete = tcf_police_delete, .size = sizeof(struct tcf_police), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c 
index 316fc645595d..6f79d2afcba2 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -220,6 +220,13 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_sample_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_sample_ops = { .kind = "sample", .type = TCA_ACT_SAMPLE, @@ -230,6 +237,7 @@ static struct tc_action_ops act_sample_ops = { .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, + .delete = tcf_sample_delete, .size = sizeof(struct tcf_sample), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index dc591cc87f4a..446c750f3d3c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -184,6 +184,13 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_simp_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -194,6 +201,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .delete = tcf_simp_delete, .size = sizeof(struct tcf_defact), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index c4ae4bd830aa..b3eaa120c7f4 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -267,6 +267,13 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbedit_ops = 
{ .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -276,6 +283,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .delete = tcf_skbedit_delete, .size = sizeof(struct tcf_skbedit), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 026d6f58eda1..30be3f767495 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -253,6 +253,13 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbmod_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, @@ -263,6 +270,7 @@ static struct tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .walk = tcf_skbmod_walker, .lookup = tcf_skbmod_search, + .delete = tcf_skbmod_delete, .size = sizeof(struct tcf_skbmod), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 15ea5ce0f9ed..655ed0b3fc67 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -534,6 +534,13 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tunnel_key_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .type = TCA_ACT_TUNNEL_KEY, @@ -544,6 +551,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .cleanup = tunnel_key_release, .walk = tunnel_key_walker, .lookup = tunnel_key_search, + .delete = tunnel_key_delete, .size = sizeof(struct tcf_tunnel_key), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index c61775250722..e334d2751784 100644 --- a/net/sched/act_vlan.c 
+++ b/net/sched/act_vlan.c @@ -287,6 +287,13 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_vlan_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -297,6 +304,7 @@ static struct tc_action_ops act_vlan_ops = { .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .delete = tcf_vlan_delete, .size = sizeof(struct tcf_vlan), }; -- cgit v1.2.3 From 16af6067392c40e454e49eec834843ab03643d96 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:29 +0300 Subject: net: sched: implement reference counted action release Implement helper delete function that uses new action ops 'delete', instead of destroying action directly. This is required so act API could delete actions by index, without holding any references to action that is being deleted. Implement function __tcf_action_put() that releases reference to action and frees it, if necessary. Refactor action deletion code to use new put function and not to rely on rtnl lock. Remove rtnl lock assertions that are no longer needed. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 84 +++++++++++++++++++++++++++++++++++++++-------------- net/sched/cls_api.c | 1 - 2 files changed, 62 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 0f31f09946ab..a023873db713 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -90,21 +90,39 @@ static void free_tcf(struct tc_action *p) kfree(p); } -static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) +static void tcf_action_cleanup(struct tc_action *p) { - spin_lock(&idrinfo->lock); - idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock(&idrinfo->lock); + if (p->ops->cleanup) + p->ops->cleanup(p); + gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } +static int __tcf_action_put(struct tc_action *p, bool bind) +{ + struct tcf_idrinfo *idrinfo = p->idrinfo; + + if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { + if (bind) + atomic_dec(&p->tcfa_bindcnt); + idr_remove(&idrinfo->action_idr, p->tcfa_index); + spin_unlock(&idrinfo->lock); + + tcf_action_cleanup(p); + return 1; + } + + if (bind) + atomic_dec(&p->tcfa_bindcnt); + + return 0; +} + int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; - ASSERT_RTNL(); - /* Release with strict==1 and bind==0 is only called through act API * interface (classifiers always bind). Only case when action with * positive reference count and zero bind count can exist is when it was @@ -118,18 +136,11 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) * are acceptable. 
*/ if (p) { - if (bind) - atomic_dec(&p->tcfa_bindcnt); - else if (strict && atomic_read(&p->tcfa_bindcnt) > 0) + if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - if (atomic_read(&p->tcfa_bindcnt) <= 0 && - refcount_dec_and_test(&p->tcfa_refcnt)) { - if (p->ops->cleanup) - p->ops->cleanup(p); - tcf_idr_remove(p->idrinfo, p); + if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; - } } return ret; @@ -340,11 +351,7 @@ int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) p->tcfa_index)); spin_unlock(&idrinfo->lock); - if (p->ops->cleanup) - p->ops->cleanup(p); - - gen_kill_estimator(&p->tcfa_rate_est); - free_tcf(p); + tcf_action_cleanup(p); module_put(owner); return 0; } @@ -615,6 +622,11 @@ int tcf_action_destroy(struct list_head *actions, int bind) return ret; } +static int tcf_action_put(struct tc_action *p) +{ + return __tcf_action_put(p, false); +} + int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -1092,6 +1104,35 @@ err_out: return err; } +static int tcf_action_delete(struct net *net, struct list_head *actions, + struct netlink_ext_ack *extack) +{ + struct tc_action *a, *tmp; + u32 act_index; + int ret; + + list_for_each_entry_safe(a, tmp, actions, list) { + const struct tc_action_ops *ops = a->ops; + + /* Actions can be deleted concurrently so we must save their + * type and id to search again after reference is released. 
+ */ + act_index = a->tcfa_index; + + list_del(&a->list); + if (tcf_action_put(a)) { + /* last reference, action was deleted concurrently */ + module_put(ops->owner); + } else { + /* now do the delete */ + ret = ops->delete(net, act_index); + if (ret < 0) + return ret; + } + } + return 0; +} + static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, u32 portid, size_t attr_size, struct netlink_ext_ack *extack) @@ -1112,7 +1153,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } /* now do the delete */ - ret = tcf_action_destroy(actions, 0); + ret = tcf_action_delete(net, actions, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1164,7 +1205,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ - cleanup_a(&actions, 1); /* lookup took reference */ ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); if (ret) goto err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ebc2b9dd783f..9041f0e43e9a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1611,7 +1611,6 @@ void tcf_exts_destroy(struct tcf_exts *exts) #ifdef CONFIG_NET_CLS_ACT LIST_HEAD(actions); - ASSERT_RTNL(); tcf_exts_to_list(exts, &actions); tcf_action_destroy(&actions, TCA_ACT_UNBIND); kfree(exts->actions); -- cgit v1.2.3 From 4e8ddd7f1758ca4ddd0c1f7cf3e66fce736241d2 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:30 +0300 Subject: net: sched: don't release reference on action overwrite Return from action init function with reference to action taken, even when overwriting existing action. Action init API initializes its fourth argument (pointer to pointer to tc action) to either existing action with same index or newly created action. 
In case of existing index(and bind argument is zero), init function returns without incrementing action reference counter. Caller of action init then proceeds working with action, without actually holding reference to it. This means that action could be deleted concurrently. Change action init behavior to always take reference to action before returning successfully, in order to protect from concurrent deletion. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 -- net/sched/act_bpf.c | 8 ++++---- net/sched/act_connmark.c | 5 +++-- net/sched/act_csum.c | 8 ++++---- net/sched/act_gact.c | 5 +++-- net/sched/act_ife.c | 10 +++++----- net/sched/act_ipt.c | 5 +++-- net/sched/act_mirred.c | 5 ++--- net/sched/act_nat.c | 5 +++-- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 8 +++----- net/sched/act_sample.c | 8 +++----- net/sched/act_simple.c | 5 +++-- net/sched/act_skbedit.c | 5 +++-- net/sched/act_skbmod.c | 8 +++----- net/sched/act_tunnel_key.c | 11 ++++------- net/sched/act_vlan.c | 8 +++----- 17 files changed, 50 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a023873db713..f019f0464cec 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -870,8 +870,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } act->order = i; sz += tcf_action_fill_size(act); - if (ovr) - refcount_inc(&act->tcfa_refcnt); list_add_tail(&act->list, actions); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 7941dd66ff83..d3f4ac6f2c4b 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - tcf_idr_release(*act, bind); - if (!replace) + if (!replace) { + tcf_idr_release(*act, bind); return -EEXIST; + } } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; @@ -356,8 
+357,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: - if (res == ACT_P_CREATED) - tcf_idr_release(*act, bind); + tcf_idr_release(*act, bind); return ret; } diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 143c2d3de723..701e90244eff 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -135,9 +135,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci = to_connmark(*a); if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3768539340e0..5dbee136b0a1 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -76,9 +76,10 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } p = to_tcf_csum(*a); @@ -86,8 +87,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } params_old = rtnl_dereference(p->params); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index a431a711f0dd..11c4de3f344e 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -100,9 +100,10 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } gact = to_gact(*a); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 89a761395c94..acea3feae762 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -498,12 +498,10 @@ static 
int tcf_ife_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) { - kfree(p); - return -EEXIST; - } + kfree(p); + return -EEXIST; } ife = to_ife(*a); @@ -548,6 +546,8 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + tcf_idr_release(*a, bind); + kfree(p); return err; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 6c234411c771..85e85dfba401 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -145,10 +145,11 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } hook = nla_get_u32(tb[TCA_IPT_HOOK]); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 3d8300bce7e4..e08aed06d7f8 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -132,10 +132,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (ret) return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } m = to_mirred(*a); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9eb27c89dc46..1f91e8e66c0f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -66,9 +66,10 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, } else { if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } p = to_tcf_nat(*a); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 45871052840f..3a0e2f762f4e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -194,8 +194,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } else { if (bind) goto out_free; - tcf_idr_release(*a, bind); if (!ovr) { + tcf_idr_release(*a, bind); ret = 
-EEXIST; goto out_free; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index c955fb0d4f3f..99335cca739e 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -111,10 +111,9 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, if (ret) return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } police = to_police(*a); @@ -195,8 +194,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return err; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 6f79d2afcba2..a8582e1347db 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -69,10 +69,9 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (ret) return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } s = to_sample(*a); @@ -81,8 +80,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } RCU_INIT_POINTER(s->psample_group, psample_group); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 446c750f3d3c..2da47c682a30 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -127,9 +127,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } reset_policy(d, tb[TCA_DEF_DATA], parm); } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 
b3eaa120c7f4..4616a2c1821f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -172,9 +172,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } spin_lock_bh(&d->tcf_lock); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 30be3f767495..e844381af066 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -145,10 +145,9 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } d = to_skbmod(*a); @@ -156,8 +155,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 655ed0b3fc67..ab5bf5c13f87 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -329,12 +329,10 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) { - NL_SET_ERR_MSG(extack, "TC IDR already exists"); - return -EEXIST; - } + NL_SET_ERR_MSG(extack, "TC IDR already exists"); + return -EEXIST; } t = to_tunnel_key(*a); @@ -342,8 +340,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 
e334d2751784..9b600faaccbb 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -187,10 +187,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } v = to_vlan(*a); @@ -198,8 +197,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } -- cgit v1.2.3 From cae422f379f37fe9105d2a113259788f989e7df5 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:31 +0300 Subject: net: sched: use reference counting action init Change action API to assume that action init function always takes reference to action, even when overwriting existing action. This is necessary because action API continues to use action pointer after init function is done. At this point action becomes accessible for concurrent modifications, so user must always hold reference to it. Implement helper put list function to atomically release list of actions after action API init code is done using them. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f019f0464cec..eefe8c2fe667 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -627,6 +627,18 @@ static int tcf_action_put(struct tc_action *p) return __tcf_action_put(p, false); } +static void tcf_action_put_lst(struct list_head *actions) +{ + struct tc_action *a, *tmp; + + list_for_each_entry_safe(a, tmp, actions, list) { + const struct tc_action_ops *ops = a->ops; + + if (tcf_action_put(a)) + module_put(ops->owner); + } +} + int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -835,17 +847,6 @@ err_out: return ERR_PTR(err); } -static void cleanup_a(struct list_head *actions, int ovr) -{ - struct tc_action *a; - - if (!ovr) - return; - - list_for_each_entry(a, actions, list) - refcount_dec(&a->tcfa_refcnt); -} - int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, @@ -874,11 +875,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } *attr_size = tcf_action_full_attrs_size(sz); - - /* Remove the temp refcnt which was necessary to protect against - * destroying an existing action which was being replaced - */ - cleanup_a(actions, ovr); return 0; err: @@ -1209,7 +1205,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; } err: - tcf_action_destroy(&actions, 0); + tcf_action_put_lst(&actions); return ret; } @@ -1251,8 +1247,11 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, &attr_size, true, extack); if (ret) return ret; + ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack); + if (ovr) + tcf_action_put_lst(&actions); - return tcf_add_notify(net, n, &actions, portid, attr_size, extack); + return ret; } 
static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; -- cgit v1.2.3 From 0190c1d452a91c38a3462abdd81752be1b9006a8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:32 +0300 Subject: net: sched: atomically check-allocate action Implement a function that atomically checks if an action exists and either takes a reference to it, or allocates an idr slot for the action index to prevent concurrent allocations of actions with the same index. Use an EBUSY error pointer to indicate that the idr slot is reserved. Implement a cleanup helper function that removes the temporary error pointer from the idr (in case of an error between idr allocation and insertion of the newly created action at the specified index). Refactor all action init functions to insert new actions into the idr using this API. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++ net/sched/act_api.c | 92 ++++++++++++++++++++++++++++++++++++---------- net/sched/act_bpf.c | 11 ++++-- net/sched/act_connmark.c | 10 +++-- net/sched/act_csum.c | 11 ++++-- net/sched/act_gact.c | 11 ++++-- net/sched/act_ife.c | 6 ++- net/sched/act_ipt.c | 13 ++++++- net/sched/act_mirred.c | 16 ++++++-- net/sched/act_nat.c | 11 ++++-- net/sched/act_pedit.c | 12 ++++-- net/sched/act_police.c | 9 ++++- net/sched/act_sample.c | 11 ++++-- net/sched/act_simple.c | 11 +++++- net/sched/act_skbedit.c | 11 +++++- net/sched/act_skbmod.c | 11 +++++- net/sched/act_tunnel_key.c | 9 ++++- net/sched/act_vlan.c | 17 ++++++++- 18 files changed, 216 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index b9ed2b8256a5..8090de2edab7 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -154,6 +154,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, int bind, bool cpustats); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); +void tcf_idr_cleanup(struct
tc_action_net *tn, u32 index); +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind); int tcf_idr_delete_index(struct tc_action_net *tn, u32 index); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eefe8c2fe667..9511502e1cbb 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -303,7 +303,9 @@ static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); - if (p) { + if (IS_ERR(p)) { + p = NULL; + } else if (p) { refcount_inc(&p->tcfa_refcnt); if (bind) atomic_inc(&p->tcfa_bindcnt); @@ -371,7 +373,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct idr *idr = &idrinfo->action_idr; int err = -ENOMEM; if (unlikely(!p)) @@ -389,20 +390,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, goto err2; } spin_lock_init(&p->tcfa_lock); - idr_preload(GFP_KERNEL); - spin_lock(&idrinfo->lock); - /* user doesn't specify an index */ - if (!index) { - index = 1; - err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC); - } else { - err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); - } - spin_unlock(&idrinfo->lock); - idr_preload_end(); - if (err) - goto err3; - p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; @@ -412,7 +399,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err4; + goto err3; } p->idrinfo = idrinfo; @@ -420,8 +407,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, INIT_LIST_HEAD(&p->list); *a = p; return 0; -err4: - idr_remove(idr, index); err3: free_percpu(p->cpu_qstats); err2: @@ -437,11 +422,78 @@ void tcf_idr_insert(struct 
tc_action_net *tn, struct tc_action *a) struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); - idr_replace(&idrinfo->action_idr, a, a->tcfa_index); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); +/* Cleanup idr index that was allocated but not initialized. */ + +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + + spin_lock(&idrinfo->lock); + /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); + spin_unlock(&idrinfo->lock); +} +EXPORT_SYMBOL(tcf_idr_cleanup); + +/* Check if action with specified index exists. If actions is found, increments + * its reference and bind counters, and return 1. Otherwise insert temporary + * error pointer (to prevent concurrent users from inserting actions with same + * index) and return 0. + */ + +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret; + +again: + spin_lock(&idrinfo->lock); + if (*index) { + p = idr_find(&idrinfo->action_idr, *index); + if (IS_ERR(p)) { + /* This means that another process allocated + * index but did not assign the pointer yet. 
+ */ + spin_unlock(&idrinfo->lock); + goto again; + } + + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + *a = p; + ret = 1; + } else { + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + *index, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, + ERR_PTR(-EBUSY), *index); + } + } else { + *index = 1; + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + UINT_MAX, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), + *index); + } + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_check_alloc); + void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index d3f4ac6f2c4b..06f743d8ed41 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -299,14 +299,17 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_idr_check(tn, parm->index, act, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, act, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, act, &act_bpf_ops, bind, true); - if (ret < 0) + if (ret < 0) { + tcf_idr_cleanup(tn, parm->index); return ret; + } res = ACT_P_CREATED; - } else { + } else if (ret > 0) { /* Don't override defaults. 
*/ if (bind) return 0; @@ -315,6 +318,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, tcf_idr_release(*act, bind); return -EEXIST; } + } else { + return ret; } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 701e90244eff..1e31f0e448e2 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, a, &act_connmark_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ci = to_connmark(*a); ci->tcf_action = parm->action; @@ -131,7 +134,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; - } else { + } else if (ret > 0) { ci = to_connmark(*a); if (bind) return 0; @@ -142,6 +145,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; + ret = 0; } return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5dbee136b0a1..bd232d3bd022 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -67,19 +67,24 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_csum_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + 
return err; } p = to_tcf_csum(*a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 11c4de3f344e..661b72b9147d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -91,19 +91,24 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_gact_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } gact = to_gact(*a); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index acea3feae762..a3eef00cd711 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -484,7 +484,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!p) return -ENOMEM; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) { kfree(p); return 0; @@ -494,6 +497,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); if (ret) { + tcf_idr_cleanup(tn, parm->index); kfree(p); return ret; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 85e85dfba401..0dc787a57798 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - exists = tcf_idr_check(tn, index, a, bind); + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) 
tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } @@ -133,14 +138,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, index); return ret; + } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e08aed06d7f8..6afd89a36c69 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -79,7 +79,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tcf_mirred *m; struct net_device *dev; bool exists = false; - int ret; + int ret, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); @@ -94,7 +94,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_MIRRED_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -107,6 +110,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } @@ -115,6 +120,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); @@ -124,13 +131,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!exists) { if (!dev) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } ret = 
tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 1f91e8e66c0f..4dd9188a72fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -57,19 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_nat_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } p = to_tcf_nat(*a); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3a0e2f762f4e..cc8ffcd1ddb5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -173,16 +173,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { if (!parm->nkeys) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_free; } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); goto out_free; + } p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { @@ -191,7 +195,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) goto out_free; if (!ovr) { @@ -207,6 +211,8 @@ static int tcf_pedit_init(struct net *net, 
struct nlattr *nla, goto out_free; } } + } else { + return err; } spin_lock_bh(&p->tcf_lock); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 99335cca739e..1f3192ea8df7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -101,15 +101,20 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, &act_police_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index a8582e1347db..3079e7be5bde 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -46,7 +46,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tc_sample *parm; struct tcf_sample *s; bool exists = false; - int ret; + int ret, err; if (!nla) return -EINVAL; @@ -59,15 +59,20 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SAMPLE_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_sample_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 2da47c682a30..aa51152e0066 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -100,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = 
nla_data(tb[TCA_DEF_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_defact(*a); ret = alloc_defdata(d, tb[TCA_DEF_DATA]); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 4616a2c1821f..86521a74ecdd 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -152,21 +152,28 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!flags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbedit_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_skbedit(*a); ret = ACT_P_CREATED; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e844381af066..cdc6bacfb190 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -128,21 +128,28 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!lflags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = 
tcf_idr_create(tn, parm->index, est, a, &act_skbmod_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ab5bf5c13f87..3ec585d58762 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -237,7 +237,10 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -325,7 +328,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, &act_tunnel_key_ops, bind, true); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); - return ret; + goto err_out; } ret = ACT_P_CREATED; @@ -364,6 +367,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, err_out: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return ret; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 9b600faaccbb..ad37f308175a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -134,7 +134,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -146,12 +149,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ERANGE; } @@ -164,6 
+171,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EPROTONOSUPPORT; } } else { @@ -176,6 +185,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } action = parm->v_action; @@ -183,8 +194,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_vlan_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { -- cgit v1.2.3 From 90b73b77d08ec395311411b545c756ca710aae59 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:33 +0300 Subject: net: sched: change action API to use array of pointers to actions Act API used linked list to pass set of actions to functions. It is intrusive data structure that stores list nodes inside action structure itself, which means it is not safe to modify such list concurrently. However, action API doesn't use any linked list specific operations on this set of actions, so it can be safely refactored into plain pointer array. Refactor action API to use array of pointers to tc_actions instead of linked list. Change argument 'actions' type of exported action init, destroy and dump functions. Acked-by: Jiri Pirko Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- include/net/act_api.h | 7 ++-- net/sched/act_api.c | 89 +++++++++++++++++++++++++++++---------------------- net/sched/cls_api.c | 21 ++++-------- 3 files changed, 60 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8090de2edab7..683ce41053d9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -168,19 +168,20 @@ static inline int tcf_idr_release(struct tc_action *a, bool bind) int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); -int tcf_action_destroy(struct list_head *actions, int bind); +int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, + struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack); -int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, + int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9511502e1cbb..bf1c35f3deb6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -657,13 +657,15 @@ repeat: } EXPORT_SYMBOL(tcf_action_exec); -int tcf_action_destroy(struct list_head *actions, int 
bind) +int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; - struct tc_action *a, *tmp; - int ret = 0; + struct tc_action *a; + int ret = 0, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; + actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) @@ -679,11 +681,12 @@ static int tcf_action_put(struct tc_action *p) return __tcf_action_put(p, false); } -static void tcf_action_put_lst(struct list_head *actions) +static void tcf_action_put_many(struct tc_action *actions[]) { - struct tc_action *a, *tmp; + int i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; if (tcf_action_put(a)) @@ -735,14 +738,15 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref) { struct tc_action *a; - int err = -EINVAL; + int err = -EINVAL, i; struct nlattr *nest; - list_for_each_entry(a, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -878,10 +882,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { - LIST_HEAD(actions); + struct tc_action *actions[] = { a, NULL }; - list_add_tail(&a->list, &actions); - tcf_action_destroy(&actions, bind); + tcf_action_destroy(actions, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } @@ -899,9 +902,11 @@ err_out: return ERR_PTR(err); } +/* Returns numbers of initialized actions or negative error. 
*/ + int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, + struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -923,11 +928,12 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } act->order = i; sz += tcf_action_fill_size(act); - list_add_tail(&act->list, actions); + /* Start from index 0 */ + actions[i - 1] = act; } *attr_size = tcf_action_full_attrs_size(sz); - return 0; + return i - 1; err: tcf_action_destroy(actions, bind); @@ -978,7 +984,7 @@ errout: return -1; } -static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, +static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { @@ -1014,7 +1020,7 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event, + struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1150,14 +1156,14 @@ err_out: return err; } -static int tcf_action_delete(struct net *net, struct list_head *actions, - struct netlink_ext_ack *extack) +static int tcf_action_delete(struct net *net, struct tc_action *actions[], + int *acts_deleted, struct netlink_ext_ack *extack) { - struct tc_action *a, *tmp; u32 act_index; - int ret; + int ret, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their @@ -1165,23 +1171,26 @@ static int tcf_action_delete(struct net *net, struct list_head *actions, */ act_index = a->tcfa_index; - list_del(&a->list); if (tcf_action_put(a)) { /* last reference, action was deleted 
concurrently */ module_put(ops->owner); } else { /* now do the delete */ ret = ops->delete(net, act_index); - if (ret < 0) + if (ret < 0) { + *acts_deleted = i + 1; return ret; + } } } + *acts_deleted = i; return 0; } static int -tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], + int *acts_deleted, u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; @@ -1199,7 +1208,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } /* now do the delete */ - ret = tcf_action_delete(net, actions, extack); + ret = tcf_action_delete(net, actions, acts_deleted, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1221,7 +1230,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO + 1] = {}; + int acts_deleted = 0; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) @@ -1243,26 +1253,27 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } act->order = i; attr_size += tcf_action_fill_size(act); - list_add_tail(&act->list, &actions); + actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event, extack); + ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_del_notify(net, n, actions, &acts_deleted, portid, + attr_size, extack); if (ret) goto err; return ret; } err: - tcf_action_put_lst(&actions); + tcf_action_put_many(&actions[acts_deleted]); return ret; } static int 
-tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1293,15 +1304,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, { size_t attr_size = 0; int ret = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, &attr_size, true, extack); - if (ret) + if (ret < 0) return ret; - ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); if (ovr) - tcf_action_put_lst(&actions); + tcf_action_put_many(actions); return ret; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9041f0e43e9a..73d9967c3739 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1609,10 +1609,7 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(actions); - - tcf_exts_to_list(exts, &actions); - tcf_action_destroy(&actions, TCA_ACT_UNBIND); + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); exts->nr_actions = 0; #endif @@ -1639,18 +1636,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, exts->actions[0] = act; exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - LIST_HEAD(actions); - int err, i = 0; + int err; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, true, + exts->actions, &attr_size, true, extack); - if (err) + if (err < 0) return err; - list_for_each_entry(act, &actions, list) - exts->actions[i++] = act; - exts->nr_actions = i; + exts->nr_actions = err; } exts->net = net; } @@ -1699,14 +1693,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts 
*exts) * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { - LIST_HEAD(actions); - nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - tcf_exts_to_list(exts, &actions); - if (tcf_action_dump(skb, &actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { -- cgit v1.2.3 From 0dbc81eab4d13f6d295da69c00e6efee2427b55c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Jul 2018 17:02:59 +0900 Subject: net: sched: Fix warnings from xchg() on RCU'd cookie pointer. The kbuild test robot reports: >> net/sched/act_api.c:71:15: sparse: incorrect type in initializer (different address spaces) @@ expected struct tc_cookie [noderef] *__ret @@ got [noderef] *__ret @@ net/sched/act_api.c:71:15: expected struct tc_cookie [noderef] *__ret net/sched/act_api.c:71:15: got struct tc_cookie *new_cookie >> net/sched/act_api.c:71:13: sparse: incorrect type in assignment (different address spaces) @@ expected struct tc_cookie *old @@ got struct tc_cookie [noderef] *[assigned] __ret >> net/sched/act_api.c:132:48: sparse: dereference of noderef expression Handle this in the usual way by force casting away the __rcu annotation when we are using xchg() on it. Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update") Reported-by: kbuild test robot Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index bf1c35f3deb6..66dc19746c63 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -68,7 +68,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, { struct tc_cookie *old; - old = xchg(old_cookie, new_cookie); + old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } -- cgit v1.2.3 From c47078d6a33fd78d882200cdaacbcfcd63318234 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Jul 2018 23:15:56 -0700 Subject: tcp: remove redundant SOCK_DONE checks In both tcp_splice_read() and tcp_recvmsg(), we already test sock_flag(sk, SOCK_DONE) right before evaluating sk->sk_state, so "!sock_flag(sk, SOCK_DONE)" is always true. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 850dc8f15afc..c4082cd50257 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -817,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, * This occurs when user tries to read * from never connected socket. */ - if (!sock_flag(sk, SOCK_DONE)) - ret = -ENOTCONN; + ret = -ENOTCONN; break; } if (!timeo) { @@ -2042,13 +2041,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } + /* This occurs when user tries to read + * from never connected socket. 
+ */ + copied = -ENOTCONN; break; } -- cgit v1.2.3 From 993a4a5f7cd3aef53be3953d11f86b2d3630ebb8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 7 Jul 2018 21:50:18 +0200 Subject: batman-adv: Convert batadv_dat_addr_t to proper type The #define for batadv_dat_addr_t is doing nothing else than giving u16 a new typename. But C already has the special keyword "typedef" which is also better supported by kernel-doc. Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/types.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 360357f83f20..343d304851a5 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -43,12 +43,13 @@ struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is - * changed, BATADV_DAT_ADDR_MAX is changed as well. + * typedef batadv_dat_addr_t - type used for all DHT addresses + * + * If it is changed, BATADV_DAT_ADDR_MAX is changed as well. * * *Please be careful: batadv_dat_addr_t must be UNSIGNED* */ -#define batadv_dat_addr_t u16 +typedef u16 batadv_dat_addr_t; #endif /* CONFIG_BATMAN_ADV_DAT */ -- cgit v1.2.3 From 0832b603c7583e75f149ea984827b6d929f336b5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 3 Jul 2018 14:47:25 +0200 Subject: mac80211: don't put null-data frames on the normal TXQ Since (QoS) NDP frames shouldn't be put into aggregation nor are assigned real sequence numbers, etc. it's better to treat them as non-data packets and not put them on the normal TXQs, for example when building A-MPDUs they need to be treated specially, and they are more used for management (e.g. to see if the station is alive) anyway. 
Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6a79d564de35..cd332e3e1134 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1249,7 +1249,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) return NULL; - if (!ieee80211_is_data(hdr->frame_control)) + if (!ieee80211_is_data_present(hdr->frame_control)) return NULL; if (sta) { -- cgit v1.2.3 From d7be97756f8a4874ac17003de5843c742dd84153 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:32 -0400 Subject: net-sysfs: Drop support for XPS and traffic_class on single queue device This patch makes it so that we do not report the traffic class or allow XPS configuration on single queue devices. This is mostly to avoid unnecessary complexity with changes I have planned that will allow us to reuse the unused tc_to_txq and XPS configuration on a single queue device to allow it to make use of a subset of queues on an underlying device. 
Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- net/core/net-sysfs.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f25ac5ff48a6..dce3ae0fbca2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1047,9 +1047,14 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int index = get_netdev_queue_index(queue); - int tc = netdev_txq_to_tc(dev, index); + int index; + int tc; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -1214,6 +1219,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, cpumask_var_t mask; unsigned long index; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + index = get_netdev_queue_index(queue); if (dev->num_tc) { @@ -1260,6 +1268,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, cpumask_var_t mask; int err; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + if (!capable(CAP_NET_ADMIN)) return -EPERM; -- cgit v1.2.3 From ffcfe25bb50f27395e15fa999f1a7eb769f55360 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:38 -0400 Subject: net: Add support for subordinate device traffic classes This patch is meant to provide the basic tools needed to allow us to create subordinate device traffic classes. The general idea here is to allow subdividing the queues of a device into queue groups accessible through an upper device such as a macvlan. The idea here is to enforce the idea that an upper device has to be a single queue device, ideally with IFF_NO_QUEUE set. With that being the case we can pretty much guarantee that the tc_to_txq mappings and XPS maps for the upper device are unused. 
As such we could reuse those in order to support subdividing the lower device and distributing those queues between the subordinate devices. In order to distinguish between a regular set of traffic classes and if a device is carrying subordinate traffic classes I changed num_tc from a u8 to a s16 value and use the negative values to represent the subordinate pool values. So starting at -1 and running to -32768 we can encode those as pool values, and the existing values of 0 to 15 can be maintained. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 16 ++++++++- net/core/dev.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/net-sysfs.c | 21 ++++++++++- 3 files changed, 124 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..b1ff77276bc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -575,6 +575,9 @@ struct netdev_queue { * (/sys/class/net/DEV/Q/trans_timeout) */ unsigned long trans_timeout; + + /* Subordinate device that the queue has been assigned to */ + struct net_device *sb_dev; /* * write-mostly part */ @@ -1991,7 +1994,7 @@ struct net_device { #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - u8 num_tc; + s16 num_tc; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; @@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev); +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset); +int netdev_set_sb_channel(struct net_device *dev, u16 channel); +static inline int netdev_get_sb_channel(struct net_device *dev) +{ + return max_t(int, -dev->num_tc, 0); +} + static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int 
index) diff --git a/net/core/dev.c b/net/core/dev.c index 89825c1eccdc..cc1d6bba017a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; + /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } + /* didn't find it, just return -1 to indicate no match */ return -1; } @@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, unsigned int nr_ids; if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); @@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc) #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device 
*sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index dce3ae0fbca2..ffa1d18f2c2c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, return -ENOENT; index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; - return sprintf(buf, "%u\n", tc); + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : + sprintf(buf, "%u\n", tc); } #ifdef CONFIG_XPS @@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; -- cgit v1.2.3 From eadec877ce9ca46a94e9036b5a44e7941d4fc501 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:48 -0400 Subject: net: Add support for subordinate traffic classes to netdev_pick_tx This change makes it so that we can support the concept of subordinate device traffic classes to the core networking code. In doing this we can start pulling out the driver specific bits needed to support selecting a queue based on an upper device. The solution as it currently stands is only partially implemented. 
I have the start of some XPS bits in here, but I would still need to allow for configuration of the XPS maps on the queues reserved for the subordinate devices. For now I am using the reference to the sb_dev XPS map as just a way to skip the lookup of the lower device XPS map for now as that would result in the wrong queue being picked. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 19 +++------ drivers/net/macvlan.c | 10 +---- include/linux/netdevice.h | 4 +- net/core/dev.c | 58 ++++++++++++++++----------- 4 files changed, 45 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 80225af2acb1..abb176df2e7f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8208,20 +8208,17 @@ static void ixgbe_atr(struct ixgbe_ring *ring, input, common, ring->queue_index); } +#ifdef IXGBE_FCOE static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, void *accel_priv, select_queue_fallback_t fallback) { - struct ixgbe_fwd_adapter *fwd_adapter = accel_priv; -#ifdef IXGBE_FCOE struct ixgbe_adapter *adapter; struct ixgbe_ring_feature *f; -#endif int txq; - if (fwd_adapter) { - u8 tc = netdev_get_num_tc(dev) ? 
- netdev_get_prio_tc_map(dev, skb->priority) : 0; - struct net_device *vdev = fwd_adapter->netdev; + if (accel_priv) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + struct net_device *vdev = accel_priv; txq = vdev->tc_to_txq[tc].offset; txq += reciprocal_scale(skb_get_hash(skb), @@ -8230,8 +8227,6 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, return txq; } -#ifdef IXGBE_FCOE - /* * only execute the code below if protocol is FCoE * or FIP and we have FCoE enabled on the adapter @@ -8257,11 +8252,9 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, txq -= f->indices; return txq + f->offset; -#else - return fallback(dev, skb); -#endif } +#endif static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, struct xdp_frame *xdpf) { @@ -10058,7 +10051,6 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_open = ixgbe_open, .ndo_stop = ixgbe_close, .ndo_start_xmit = ixgbe_xmit_frame, - .ndo_select_queue = ixgbe_select_queue, .ndo_set_rx_mode = ixgbe_set_rx_mode, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ixgbe_set_mac, @@ -10081,6 +10073,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_poll_controller = ixgbe_netpoll, #endif #ifdef IXGBE_FCOE + .ndo_select_queue = ixgbe_select_queue, .ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get, .ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target, .ndo_fcoe_ddp_done = ixgbe_fcoe_ddp_put, diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index adde8fc45588..401e1d1ce1ec 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -514,7 +514,6 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; - void *accel_priv = NULL; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = (void *)skb->data; @@ -533,15 +532,10 @@ static int macvlan_queue_xmit(struct 
sk_buff *skb, struct net_device *dev) return NET_XMIT_SUCCESS; } } - - /* For packets that are non-multicast and not bridged we will pass - * the necessary information so that the lowerdev can distinguish - * the source of the packets via the accel_priv value. - */ - accel_priv = vlan->accel_priv; xmit_world: skb->dev = vlan->lowerdev; - return dev_queue_xmit_accel(skb, accel_priv); + return dev_queue_xmit_accel(skb, + netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1ff77276bc4..fda0bcda7a42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2103,7 +2103,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev @@ -2568,7 +2568,7 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); diff --git a/net/core/dev.c b/net/core/dev.c index cc1d6bba017a..09a7cc2f3c55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2786,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. 
*/ -static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + } + if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); while (unlikely(hash >= qcount)) hash -= qcount; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; + return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; @@ -3573,7 +3575,8 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, } #endif -static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; @@ -3587,7 +3590,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); if (dev_maps) { int tci = sk_rx_queue_get(sk); @@ -3598,7 +3601,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_cpus_map); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; @@ -3614,17 +3617,20 @@ get_cpus_map: #endif } -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = 
sk_tx_queue_get(sk); + sb_dev = sb_dev ? : dev; + if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); + int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) - new_index = skb_tx_hash(dev, skb); + new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && @@ -3637,9 +3643,15 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) return queue_index; } +static u16 __netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + return ___netdev_pick_tx(dev, skb, NULL); +} + struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + struct net_device *sb_dev) { int queue_index = 0; @@ -3654,10 +3666,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else - queue_index = __netdev_pick_tx(dev, skb); + queue_index = ___netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } @@ -3669,7 +3681,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit - * @accel_priv: private data used for L2 forwarding offload + * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -3692,7 +3704,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, * the BH enable code must have IRQs enabled so that it will not deadlock. 
* --BLG */ -static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) +static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3731,7 +3743,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); - txq = netdev_pick_tx(dev, skb, accel_priv); + txq = netdev_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -3805,9 +3817,9 @@ int dev_queue_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(dev_queue_xmit); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { - return __dev_queue_xmit(skb, accel_priv); + return __dev_queue_xmit(skb, sb_dev); } EXPORT_SYMBOL(dev_queue_xmit_accel); -- cgit v1.2.3 From a4ea8a3dacc312c3402c78f6e4843afdda9b43a0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:54 -0400 Subject: net: Add generic ndo_select_queue functions This patch adds a generic version of the ndo_select_queue functions for either returning 0 or selecting a queue based on the processor ID. This is generally meant to just reduce the number of functions we have to change in the future when we have to deal with ndo_select_queue changes. 
Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/lantiq_etop.c | 10 +--------- drivers/net/ethernet/ti/netcp_core.c | 9 +-------- drivers/staging/netlogic/xlr_net.c | 9 +-------- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 14 ++++++++++++++ net/packet/af_packet.c | 2 +- 6 files changed, 22 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index afc810069440..7a637b51c7d2 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -563,14 +563,6 @@ ltq_etop_set_multicast_list(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); } -static u16 -ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) -{ - /* we are currently only using the first queue */ - return 0; -} - static int ltq_etop_init(struct net_device *dev) { @@ -641,7 +633,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = { .ndo_set_mac_address = ltq_etop_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = ltq_etop_set_multicast_list, - .ndo_select_queue = ltq_etop_select_queue, + .ndo_select_queue = dev_pick_tx_zero, .ndo_init = ltq_etop_init, .ndo_tx_timeout = ltq_etop_tx_timeout, }; diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 6ebf110cd594..a1d335a3c5e4 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1889,13 +1889,6 @@ static int netcp_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) return err; } -static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, - select_queue_fallback_t fallback) -{ - return 0; -} - static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { @@ -1972,7 +1965,7 @@ static const struct net_device_ops 
netcp_netdev_ops = { .ndo_vlan_rx_add_vid = netcp_rx_add_vid, .ndo_vlan_rx_kill_vid = netcp_rx_kill_vid, .ndo_tx_timeout = netcp_ndo_tx_timeout, - .ndo_select_queue = netcp_select_queue, + .ndo_select_queue = dev_pick_tx_zero, .ndo_setup_tc = netcp_setup_tc, }; diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index e461168313bf..4e6611e4c59b 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -290,13 +290,6 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } -static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, - select_queue_fallback_t fallback) -{ - return (u16)smp_processor_id(); -} - static void xlr_hw_set_mac_addr(struct net_device *ndev) { struct xlr_net_priv *priv = netdev_priv(ndev); @@ -403,7 +396,7 @@ static const struct net_device_ops xlr_netdev_ops = { .ndo_open = xlr_net_open, .ndo_stop = xlr_net_stop, .ndo_start_xmit = xlr_net_start_xmit, - .ndo_select_queue = xlr_net_select_queue, + .ndo_select_queue = dev_pick_tx_cpu_id, .ndo_set_mac_address = xlr_net_set_mac_addr, .ndo_set_rx_mode = xlr_set_rx_mode, .ndo_get_stats64 = xlr_stats, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fda0bcda7a42..46f4c44ce3e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2567,6 +2567,10 @@ void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int 
dev_direct_xmit(struct sk_buff *skb, u16 queue_id); diff --git a/net/core/dev.c b/net/core/dev.c index 09a7cc2f3c55..b5e538032d5e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3617,6 +3617,20 @@ get_cpus_map: #endif } +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; +} +EXPORT_SYMBOL(dev_pick_tx_cpu_id); + static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 47931ebfaef3..f37d087ae652 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -277,7 +277,7 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) { - return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; + return dev_pick_tx_cpu_id(dev, skb, NULL, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) -- cgit v1.2.3 From 4f49dec9075aa0277b8c9c657ec31e6361f88724 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:59 -0400 Subject: net: allow ndo_select_queue to pass netdev This patch makes it so that instead of passing a void pointer as the accel_priv we instead pass a net_device pointer as sb_dev. Making this change allows us to pass the subordinate device through to the fallback function eventually so that we can keep the actual code in the ndo_select_queue call as focused as possible on the exception cases.
Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/infiniband/hw/hfi1/vnic_main.c | 2 +- drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c | 4 ++-- drivers/net/bonding/bond_main.c | 3 ++- drivers/net/ethernet/amazon/ena/ena_netdev.c | 3 ++- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 ++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 3 ++- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 3 ++- drivers/net/ethernet/renesas/ravb_main.c | 3 ++- drivers/net/ethernet/sun/ldmvsw.c | 3 ++- drivers/net/ethernet/sun/sunvnet.c | 3 ++- drivers/net/hyperv/netvsc_drv.c | 4 ++-- drivers/net/net_failover.c | 5 +++-- drivers/net/team/team.c | 3 ++- drivers/net/tun.c | 3 ++- drivers/net/wireless/marvell/mwifiex/main.c | 3 ++- drivers/net/xen-netback/interface.c | 2 +- drivers/net/xen-netfront.c | 3 ++- drivers/staging/rtl8188eu/os_dep/os_intfs.c | 3 ++- drivers/staging/rtl8723bs/os_dep/os_intfs.c | 7 +++---- include/linux/netdevice.h | 11 +++++++---- net/core/dev.c | 6 ++++-- net/mac80211/iface.c | 4 ++-- 29 files changed, 66 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 5d65582fe4d9..616fc9b6fad8 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -423,7 +423,7 @@ tx_finish: static u16 hfi1_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); diff 
--git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c index 0c8aec62a425..61558788b3fa 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c @@ -95,7 +95,7 @@ static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb, } static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); @@ -107,7 +107,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, mdata->entropy = opa_vnic_calc_entropy(skb); mdata->vl = opa_vnic_get_vl(adapter, skb); rc = adapter->rn_ops->ndo_select_queue(netdev, skb, - accel_priv, fallback); + sb_dev, fallback); skb_pull(skb, sizeof(*mdata)); return rc; } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 63e3844c5bec..9a2ea3c1f949 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4094,7 +4094,8 @@ static inline int bond_slave_override(struct bonding *bond, static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { /* This helper function exists to help dev_pick_tx get the correct * destination queue. 
Using a helper function skips a call to diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index f2af87d70594..e3befb1f9204 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -2213,7 +2213,8 @@ static void ena_netpoll(struct net_device *netdev) #endif /* CONFIG_NET_POLL_CONTROLLER */ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { u16 qid; /* we suspect that this is good for in--kernel network services that diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index d5fca2e5a9bc..32f548e6431d 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2107,7 +2107,7 @@ static const struct ethtool_ops bcm_sysport_ethtool_ops = { }; static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct bcm_sysport_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index af7b5a4d8ba0..e4e1cf907ac6 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1910,7 +1910,8 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw) } u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct bnx2x *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index a8ce5c55bbb0..0e508e5defce 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -497,7 +497,8 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); static inline void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp, diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 0d91716a2566..5dc5e5604f05 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -930,7 +930,8 @@ freeout: } static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { int txq; diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index ef9ef703d13a..ff7a74ec8f11 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -2022,7 +2022,8 @@ static void hns_nic_get_stats64(struct net_device *ndev, static u16 hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct ethhdr *eth_hdr = (struct ethhdr *)skb->data; struct hns_nic_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index abb176df2e7f..8c7a68c57afa 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8210,15 +8210,16 @@ static void ixgbe_atr(struct ixgbe_ring *ring, #ifdef IXGBE_FCOE static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, - void 
*accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct ixgbe_adapter *adapter; struct ixgbe_ring_feature *f; int txq; - if (accel_priv) { + if (sb_dev) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - struct net_device *vdev = accel_priv; + struct net_device *vdev = sb_dev; txq = vdev->tc_to_txq[tc].offset; txq += reciprocal_scale(skb_get_hash(skb), diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 0227786308af..df2996618cd1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -688,7 +688,8 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, } u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 rings_p_up = priv->num_tx_rings_p_up; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index ace6545f82e6..c3228b89df46 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -699,7 +699,8 @@ void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, struct mlx4_en_rx_alloc *frame, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e2b7586ed7a0..e1b237ccdf56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -865,7 +865,8 @@ struct mlx5e_profile { void mlx5e_build_ptys2ethtool_map(void); u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_tx_wqe *wqe, u16 pi); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index f0739dae7b56..dfcc3710b65f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -111,7 +111,8 @@ static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb #endif u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct mlx5e_priv *priv = netdev_priv(dev); int channel_ix = fallback(dev, skb); diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 68f122140966..4a7f54c8e7aa 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1656,7 +1656,8 @@ drop: } static u16 ravb_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { /* If skb needs TX timestamp, it is handled in network control queue */ return (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) ? 
RAVB_NC : diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index a5dd627fe2f9..d42f47f6c632 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -101,7 +101,8 @@ static struct vnet_port *vsw_tx_port_find(struct sk_buff *skb, } static u16 vsw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct vnet_port *port = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index a94f50442613..12539b357a78 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -234,7 +234,8 @@ static struct vnet_port *vnet_tx_port_find(struct sk_buff *skb, } static u16 vnet_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct vnet *vp = netdev_priv(dev); struct vnet_port *port = __tx_port_find(vp, skb); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index dd1d6e115145..98c0107d6ca1 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -329,7 +329,7 @@ static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb) } static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct net_device_context *ndc = netdev_priv(ndev); @@ -343,7 +343,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, if (vf_ops->ndo_select_queue) txq = vf_ops->ndo_select_queue(vf_netdev, skb, - accel_priv, fallback); + sb_dev, fallback); else txq = fallback(vf_netdev, skb); diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 4f390fa557e4..78b549698b7b 100644 --- 
a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -115,7 +115,8 @@ static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb, } static u16 net_failover_select_queue(struct net_device *dev, - struct sk_buff *skb, void *accel_priv, + struct sk_buff *skb, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct net_failover_info *nfo_info = netdev_priv(dev); @@ -128,7 +129,7 @@ static u16 net_failover_select_queue(struct net_device *dev, if (ops->ndo_select_queue) txq = ops->ndo_select_queue(primary_dev, skb, - accel_priv, fallback); + sb_dev, fallback); else txq = fallback(primary_dev, skb); diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index b070959737ff..3a95eaae0c98 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1707,7 +1707,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a192a017cc68..76f0f4131197 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -607,7 +607,8 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct tun_struct *tun = netdev_priv(dev); u16 ret; diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 510f6b8e717d..fa3e8ddfe9a9 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1279,7 +1279,8 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev) static u16 
mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { skb->priority = cfg80211_classify8021d(skb, NULL); return mwifiex_1d_to_wmm_queue[skb->priority]; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 78ebe494fef0..19c4c585f472 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -148,7 +148,7 @@ void xenvif_wake_queue(struct xenvif_queue *queue) } static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct xenvif *vif = netdev_priv(dev); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a57daecf1d57..d67cd379d156 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -545,7 +545,8 @@ static int xennet_count_skb_slots(struct sk_buff *skb) } static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { unsigned int num_queues = dev->real_num_tx_queues; u32 hash; diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c index add1ba00f3e9..38e85c8a85c8 100644 --- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c +++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c @@ -253,7 +253,8 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/drivers/staging/rtl8723bs/os_dep/os_intfs.c 
b/drivers/staging/rtl8723bs/os_dep/os_intfs.c index ace68f023b49..181642358e3f 100644 --- a/drivers/staging/rtl8723bs/os_dep/os_intfs.c +++ b/drivers/staging/rtl8723bs/os_dep/os_intfs.c @@ -403,10 +403,9 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } -static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb - , void *accel_priv - , select_queue_fallback_t fallback -) +static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 46f4c44ce3e4..bbf062c1ca8a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -957,7 +957,8 @@ struct dev_ifalias { * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * void *accel_priv, select_queue_fallback_t fallback); + * struct net_device *sb_dev, + * select_queue_fallback_t fallback); * Called to decide which queue to use when device supports multiple * transmit queues. 
* @@ -1229,7 +1230,7 @@ struct net_device_ops { netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); @@ -2568,9 +2569,11 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); diff --git a/net/core/dev.c b/net/core/dev.c index b5e538032d5e..a051ce27198b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3618,14 +3618,16 @@ get_cpus_map: } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 555e389b7dfa..5e6cf2cee965 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct 
sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); @@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); -- cgit v1.2.3 From 8ec56fc3c5ee6f9700adac190e9ce5b8859a58b6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:20:04 -0400 Subject: net: allow fallback function to pass netdev For most of these calls we can just pass NULL through to the fallback function as the sb_dev. The only cases where we cannot are the cases where we might be dealing with either an upper device or a driver that would have configured things to support an sb_dev itself. The only driver that has any significant change in this patch set should be ixgbe as we can drop the redundant functionality that existed in both the ndo_select_queue function and the fallback function that was passed through to us. 
Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/broadcom/bcmsysport.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 ++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/net_failover.c | 2 +- drivers/net/xen-netback/interface.c | 2 +- include/linux/netdevice.h | 3 ++- net/core/dev.c | 12 +++--------- net/packet/af_packet.c | 7 ++++--- 14 files changed, 24 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index e3befb1f9204..c673ac2df65b 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -2224,7 +2224,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, if (skb_rx_queue_recorded(skb)) qid = skb_get_rx_queue(skb); else - qid = fallback(dev, skb); + qid = fallback(dev, skb, NULL); return qid; } diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 32f548e6431d..eb890c4b3b2d 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2116,7 +2116,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, unsigned int q, port; if (!netdev_uses_dsa(dev)) - return fallback(dev, skb); + return fallback(dev, skb, NULL); /* DSA tagging layer will have configured the correct queue */ q = BRCM_TAG_GET_QUEUE(queue); @@ -2124,7 +2124,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, tx_ring = priv->ring_map[q + port * 
priv->per_port_num_tx_queues]; if (unlikely(!tx_ring)) - return fallback(dev, skb); + return fallback(dev, skb, NULL); return tx_ring->index; } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index e4e1cf907ac6..5a727d4729da 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1933,7 +1933,8 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, } /* select a non-FCoE queue */ - return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos); + return fallback(dev, skb, NULL) % + (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos); } void bnx2x_set_num_queues(struct bnx2x *bp) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 5dc5e5604f05..40cf8dc9f163 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -973,7 +973,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, return txq; } - return fallback(dev, skb) % dev->real_num_tx_queues; + return fallback(dev, skb, NULL) % dev->real_num_tx_queues; } static int closest_timer(const struct sge *s, int time) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index ff7a74ec8f11..948b3e0d18f4 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -2033,7 +2033,7 @@ hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb, is_multicast_ether_addr(eth_hdr->h_dest)) return 0; else - return fallback(ndev, skb); + return fallback(ndev, skb, NULL); } static const struct net_device_ops hns_nic_netdev_ops = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 8c7a68c57afa..bd6d9ea27b4b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8237,11 +8237,11 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, case htons(ETH_P_FIP): adapter = netdev_priv(dev); - if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) + if (!sb_dev && (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)) break; /* fall through */ default: - return fallback(dev, skb); + return fallback(dev, skb, sb_dev); } f = &adapter->ring_feature[RING_F_FCOE]; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index df2996618cd1..1857ee0f0871 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -695,9 +695,9 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, u16 rings_p_up = priv->num_tx_rings_p_up; if (netdev_get_num_tc(dev)) - return fallback(dev, skb); + return fallback(dev, skb, NULL); - return fallback(dev, skb) % rings_p_up; + return fallback(dev, skb, NULL) % rings_p_up; } static void mlx4_bf_copy(void __iomem *dst, const void *src, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index dfcc3710b65f..9106ea45e3cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -115,7 +115,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, select_queue_fallback_t fallback) { struct mlx5e_priv *priv = netdev_priv(dev); - int channel_ix = fallback(dev, skb); + int channel_ix = fallback(dev, skb, NULL); u16 num_channels; int up = 0; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 98c0107d6ca1..cf4f40a04194 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -345,7 +345,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, txq = vf_ops->ndo_select_queue(vf_netdev, skb, sb_dev, fallback); else - txq = fallback(vf_netdev, 
skb); + txq = fallback(vf_netdev, skb, NULL); /* Record the queue selected by VF so that it can be * used for common case where VF has more queues than diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 78b549698b7b..d00d42c845b7 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -131,7 +131,7 @@ static u16 net_failover_select_queue(struct net_device *dev, txq = ops->ndo_select_queue(primary_dev, skb, sb_dev, fallback); else - txq = fallback(primary_dev, skb); + txq = fallback(primary_dev, skb, NULL); qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 19c4c585f472..92274c237200 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -155,7 +155,7 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, unsigned int size = vif->hash.size; if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE) - return fallback(dev, skb) % dev->real_num_tx_queues; + return fallback(dev, skb, NULL) % dev->real_num_tx_queues; xenvif_set_skb_hash(vif, skb); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bbf062c1ca8a..2daf2fa6554f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -793,7 +793,8 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, - struct sk_buff *skb); + struct sk_buff *skb, + struct net_device *sb_dev); enum tc_setup_type { TC_SETUP_QDISC_MQPRIO, diff --git a/net/core/dev.c b/net/core/dev.c index a051ce27198b..e18d81837a6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3633,8 +3633,8 @@ u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_cpu_id); -static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) +static u16 
__netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -3659,12 +3659,6 @@ static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, return queue_index; } -static u16 __netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - return ___netdev_pick_tx(dev, skb, NULL); -} - struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) @@ -3685,7 +3679,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else - queue_index = ___netdev_pick_tx(dev, skb, sb_dev); + queue_index = __netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f37d087ae652..00189a3b07f2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { - return dev_pick_tx_cpu_id(dev, skb, NULL, NULL); + return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) @@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb) __packet_pick_tx_queue); queue_index = netdev_cap_txqueue(dev, queue_index); } else { - queue_index = __packet_pick_tx_queue(dev, skb); + queue_index = __packet_pick_tx_queue(dev, skb, NULL); } return queue_index; -- cgit v1.2.3 From 8c057efaebb557b60ba514b5e39e8000a1eab0f1 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 9 Jul 2018 18:09:54 +0100 Subject: net: core: fix uses-after-free in list processing In netif_receive_skb_list_internal(), all 
of skb_defer_rx_timestamp(), do_xdp_generic() and enqueue_to_backlog() can lead to kfree(skb). Thus, we cannot wait until after they return to remove the skb from the list; instead, we remove it first and, in the pass case, add it to a sublist afterwards. In the case of enqueue_to_backlog() we have already decided not to pass when we call the function, so we do not need a sublist. Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing") Reported-by: Dan Carpenter Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 89825c1eccdc..ce4583564e00 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4982,25 +4982,30 @@ static void netif_receive_skb_list_internal(struct list_head *head) { struct bpf_prog *xdp_prog = NULL; struct sk_buff *skb, *next; + struct list_head sublist; + INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(netdev_tstamp_prequeue, skb); - if (skb_defer_rx_timestamp(skb)) - /* Handled, remove from list */ - list_del(&skb->list); + list_del(&skb->list); + if (!skb_defer_rx_timestamp(skb)) + list_add_tail(&skb->list, &sublist); } + list_splice_init(&sublist, head); if (static_branch_unlikely(&generic_xdp_needed_key)) { preempt_disable(); rcu_read_lock(); list_for_each_entry_safe(skb, next, head, list) { xdp_prog = rcu_dereference(skb->dev->xdp_prog); - if (do_xdp_generic(xdp_prog, skb) != XDP_PASS) - /* Dropped, remove from list */ - list_del(&skb->list); + list_del(&skb->list); + if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) + list_add_tail(&skb->list, &sublist); } rcu_read_unlock(); preempt_enable(); + /* Put passed packets back on main list */ + list_splice_init(&sublist, head); } rcu_read_lock(); @@ -5011,9 +5016,9 @@ static void netif_receive_skb_list_internal(struct list_head *head) int cpu = get_rps_cpu(skb->dev, 
skb, &rflow); if (cpu >= 0) { - enqueue_to_backlog(skb, cpu, &rflow->last_qtail); - /* Handled, remove from list */ + /* Will be handled, remove from list */ list_del(&skb->list); + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } -- cgit v1.2.3 From 9af86f9338949a9369bda5e6fed69347d1813054 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 9 Jul 2018 18:10:19 +0100 Subject: net: core: fix use-after-free in __netif_receive_skb_list_core __netif_receive_skb_core can free the skb, so we have to use the dequeue- enqueue model when calling it from __netif_receive_skb_list_core. Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type lookup") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ce4583564e00..d13cddcac41f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4830,23 +4830,28 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct list_head sublist; struct sk_buff *skb, *next; + INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; + list_del(&skb->list); __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (!pt_prev) + continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ - list_cut_before(&sublist, head, &skb->list); __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ + INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } + list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ - __netif_receive_skb_list_ptype(head, pt_curr, od_curr); + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) -- cgit v1.2.3 From 95765a6ca1288121ddcbf07cd60ec65341829ddc Mon Sep 17 00:00:00 2001 From: Julian 
Wiedmann Date: Mon, 9 Jul 2018 09:45:14 +0200 Subject: tcp: remove SG-related comment in tcp_sendmsg() Since commit 74d4a8f8d378 ("tcp: remove sk_can_gso() use"), the code doesn't care whether the interface supports SG. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4082cd50257..e3704a49164b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1274,9 +1274,6 @@ restart: int linear; new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; -- cgit v1.2.3 From 046f6fd5daefac7f5abdafb436b30f63bc7c602b Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sched: Add Common Applications Kept Enhanced (cake) qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sch_cake targets the home router use case and is intended to squeeze the most bandwidth and latency out of even the slowest ISP links and routers, while presenting an API simple enough that even an ISP can configure it. Example of use on a cable ISP uplink: tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter To shape a cable download link (ifb and tc-mirred setup elided) tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash CAKE is filled with: * A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel derived Flow Queuing system, which autoconfigures based on the bandwidth. * A novel "triple-isolate" mode (the default) which balances per-host and per-flow FQ even through NAT. * A deficit-based shaper that can also be used in an unlimited mode. * 8-way set-associative hashing to reduce flow collisions to a minimum. * A reasonable interpretation of various diffserv latency/loss tradeoffs.
* Support for zeroing diffserv markings for entering and exiting traffic. * Support for interacting well with Docsis 3.0 shaper framing. * Extensive support for DSL framing types. * Support for ack filtering. * Extensive statistics for measuring loss, ECN markings, and latency variation. A paper describing the design of CAKE is available at https://arxiv.org/abs/1804.07617, and will be published at the 2018 IEEE International Symposium on Local and Metropolitan Area Networks (LANMAN). This patch adds the base shaper and packet scheduler, while subsequent commits add the optional (configurable) features. The full userspace API and most data structures are included in this commit, but options not understood in the base version will be ignored. Various versions have been baking as an out-of-tree build for kernel versions going back to 3.10, as the embedded router world has been running a few years behind mainline Linux. A stable version has been generally available on lede-17.01 and later. sch_cake replaces a combination of iptables, tc filter, htb and fq_codel in the sqm-scripts, with sane defaults and vastly simpler configuration. CAKE's principal author is Jonathan Morton, with contributions from Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller, Ryan Mounce, Tony Ambardar, Dean Scarff, Nils Andreas Svee, Dave Täht, and Loganaden Velvindron. Testing from Pete Heist, Georgios Amanakis, and the many other members of the cake@lists.bufferbloat.net mailing list.
tc -s qdisc show dev eth2 qdisc cake 8017: root refcnt 2 bandwidth 1Gbit diffserv3 triple-isolate split-gso rtt 100.0ms noatm overhead 38 mpu 84 Sent 51504294511 bytes 37724591 pkt (dropped 6, overlimits 64958695 requeues 12) backlog 0b 0p requeues 12 memory used: 1053008b of 15140Kb capacity estimate: 970Mbit min/max network layer size: 28 / 1500 min/max overhead-adjusted size: 84 / 1538 average network hdr offset: 14 Bulk Best Effort Voice thresh 62500Kbit 1Gbit 250Mbit target 5.0ms 5.0ms 5.0ms interval 100.0ms 100.0ms 100.0ms pk_delay 5us 5us 6us av_delay 3us 2us 2us sp_delay 2us 1us 1us backlog 0b 0b 0b pkts 3164050 25030267 9530280 bytes 3227519915 35396974782 12879808898 way_inds 0 8 0 way_miss 21 366 25 way_cols 0 0 0 drops 5 0 1 marks 0 0 0 ack_drop 0 0 0 sp_flows 1 3 0 bk_flows 0 1 1 un_flows 0 0 0 max_len 68130 68130 68130 Tested-by: Pete Heist Tested-by: Georgios Amanakis Signed-off-by: Dave Taht Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 114 +++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/sch_cake.c | 1867 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1993 insertions(+) create mode 100644 net/sched/sch_cake.c (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 949118461009..d9cc9dc4f547 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -955,4 +955,118 @@ enum { #define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + 
TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + __TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum 
{ + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index fcc89706745b..7af246764a35 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -295,6 +295,17 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_CAKE + tristate "Common Applications Kept Enhanced (CAKE)" + help + Say Y here if you want to use the Common Applications Kept Enhanced + (CAKE) queue management algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_cake. + + If unsure, say N. + config NET_SCH_FQ tristate "Fair Queue" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 9a5a7077d217..673ee7d26ff2 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c new file mode 100644 index 000000000000..ea0272615d63 --- /dev/null +++ b/net/sched/sch_cake.c @@ -0,0 +1,1867 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* COMMON Applications Kept Enhanced (CAKE) discipline + * + * Copyright (C) 2014-2018 Jonathan Morton + * Copyright (C) 2015-2018 Toke Høiland-Jørgensen + * Copyright (C) 2014-2018 Dave Täht + * Copyright (C) 2015-2018 Sebastian Moeller + * (C) 2015-2018 Kevin Darbyshire-Bryant + * Copyright (C) 2017-2018 Ryan Mounce + * + * The CAKE Principles: + * (or, how to have your 
cake and eat it too) + * + * This is a combination of several shaping, AQM and FQ techniques into one + * easy-to-use package: + * + * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE + * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), + * eliminating the need for any sort of burst parameter (eg. token bucket + * depth). Burst support is limited to that necessary to overcome scheduling + * latency. + * + * - A Diffserv-aware priority queue, giving more priority to certain classes, + * up to a specified fraction of bandwidth. Above that bandwidth threshold, + * the priority is reduced to avoid starving other tins. + * + * - Each priority tin has a separate Flow Queue system, to isolate traffic + * flows from each other. This prevents a burst on one flow from increasing + * the delay to another. Flows are distributed to queues using a + * set-associative hash function. + * + * - Each queue is actively managed by Cobalt, which is a combination of the + * Codel and Blue AQM algorithms. This serves flows fairly, and signals + * congestion early via ECN (if available) and/or packet drops, to keep + * latency low. The codel parameters are auto-tuned based on the bandwidth + * setting, as is necessary at low bandwidths. + * + * The configuration parameters are kept deliberately simple for ease of use. + * Everything has sane defaults. Complete generality of configuration is *not* + * a goal. + * + * The priority queue operates according to a weighted DRR scheme, combined with + * a bandwidth tracker which reuses the shaper logic to detect which side of the + * bandwidth sharing threshold the tin is operating. This determines whether a + * priority-based weight (high) or a bandwidth-based weight (low) is used for + * that tin in the current pass. + * + * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly + * granted us permission to leverage. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAKE_SET_WAYS (8) +#define CAKE_MAX_TINS (8) +#define CAKE_QUEUES (1024) +#define CAKE_FLOW_MASK 63 +#define CAKE_FLOW_NAT_FLAG 64 + +/* struct cobalt_params - contains codel and blue parameters + * @interval: codel initial drop rate + * @target: maximum persistent sojourn time & blue update rate + * @mtu_time: serialisation delay of maximum-size packet + * @p_inc: increment of blue drop probability (0.32 fxp) + * @p_dec: decrement of blue drop probability (0.32 fxp) + */ +struct cobalt_params { + u64 interval; + u64 target; + u64 mtu_time; + u32 p_inc; + u32 p_dec; +}; + +/* struct cobalt_vars - contains codel and blue variables + * @count: codel dropping frequency + * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 + * @drop_next: time to drop next packet, or when we dropped last + * @blue_timer: Blue time to next drop + * @p_drop: BLUE drop probability (0.32 fxp) + * @dropping: set if in dropping state + * @ecn_marked: set if marked + */ +struct cobalt_vars { + u32 count; + u32 rec_inv_sqrt; + ktime_t drop_next; + ktime_t blue_timer; + u32 p_drop; + bool dropping; + bool ecn_marked; +}; + +enum { + CAKE_SET_NONE = 0, + CAKE_SET_SPARSE, + CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ + CAKE_SET_BULK, + CAKE_SET_DECAYING +}; + +struct cake_flow { + /* this stuff is all needed per-flow at dequeue time */ + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + s32 deficit; + u32 dropped; + struct cobalt_vars cvars; + u16 srchost; /* index into cake_host table */ + u16 dsthost; + u8 set; +}; /* please try to keep this structure <= 64 bytes */ + +struct cake_host { + u32 srchost_tag; + u32 dsthost_tag; + u16 srchost_refcnt; + u16 dsthost_refcnt; +}; + +struct cake_heap_entry { + u16 t:3, b:10; +}; + +struct 
cake_tin_data { + struct cake_flow flows[CAKE_QUEUES]; + u32 backlogs[CAKE_QUEUES]; + u32 tags[CAKE_QUEUES]; /* for set association */ + u16 overflow_idx[CAKE_QUEUES]; + struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ + u16 flow_quantum; + + struct cobalt_params cparams; + u32 drop_overlimit; + u16 bulk_flow_count; + u16 sparse_flow_count; + u16 decaying_flow_count; + u16 unresponsive_flow_count; + + u32 max_skblen; + + struct list_head new_flows; + struct list_head old_flows; + struct list_head decaying_flows; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + ktime_t time_next_packet; + u64 tin_rate_ns; + u64 tin_rate_bps; + u16 tin_rate_shft; + + u16 tin_quantum_prio; + u16 tin_quantum_band; + s32 tin_deficit; + u32 tin_backlog; + u32 tin_dropped; + u32 tin_ecn_mark; + + u32 packets; + u64 bytes; + + u32 ack_drops; + + /* moving averages */ + u64 avge_delay; + u64 peak_delay; + u64 base_delay; + + /* hash function stats */ + u32 way_directs; + u32 way_hits; + u32 way_misses; + u32 way_collisions; +}; /* number of tins is small, so size of this struct doesn't matter much */ + +struct cake_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct cake_tin_data *tins; + + struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; + u16 overflow_timeout; + + u16 tin_cnt; + u8 tin_mode; + u8 flow_mode; + u8 ack_filter; + u8 atm_mode; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + u16 rate_shft; + ktime_t time_next_packet; + ktime_t failsafe_next_packet; + u64 rate_ns; + u64 rate_bps; + u16 rate_flags; + s16 rate_overhead; + u16 rate_mpu; + u64 interval; + u64 target; + + /* resource tracking */ + u32 buffer_used; + u32 buffer_max_used; + u32 buffer_limit; + u32 buffer_config_limit; + + /* indices for dequeue */ + u16 cur_tin; + u16 cur_flow; + + struct qdisc_watchdog watchdog; + const u8 *tin_index; + const u8 *tin_order; + + /* bandwidth 
capacity estimate */ + ktime_t last_packet_time; + ktime_t avg_window_begin; + u64 avg_packet_interval; + u64 avg_window_bytes; + u64 avg_peak_bandwidth; + ktime_t last_reconfig_time; + + /* packet length stats */ + u32 avg_netoff; + u16 max_netlen; + u16 max_adjlen; + u16 min_netlen; + u16 min_adjlen; +}; + +enum { + CAKE_FLAG_OVERHEAD = BIT(0), + CAKE_FLAG_AUTORATE_INGRESS = BIT(1), + CAKE_FLAG_INGRESS = BIT(2), + CAKE_FLAG_WASH = BIT(3), + CAKE_FLAG_SPLIT_GSO = BIT(4) +}; + +/* COBALT operates the Codel and BLUE algorithms in parallel, in order to + * obtain the best features of each. Codel is excellent on flows which + * respond to congestion signals in a TCP-like way. BLUE is more effective on + * unresponsive flows. + */ + +struct cobalt_skb_cb { + ktime_t enqueue_time; +}; + +static u64 us_to_ns(u64 us) +{ + return us * NSEC_PER_USEC; +} + +static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); + return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) +{ + return get_cobalt_cb(skb)->enqueue_time; +} + +static void cobalt_set_enqueue_time(struct sk_buff *skb, + ktime_t now) +{ + get_cobalt_cb(skb)->enqueue_time = now; +} + +static u16 quantum_div[CAKE_QUEUES + 1] = {0}; + +#define REC_INV_SQRT_CACHE (16) +static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; + +/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) + * + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 + */ + +static void cobalt_newton_step(struct cobalt_vars *vars) +{ + u32 invsqrt, invsqrt2; + u64 val; + + invsqrt = vars->rec_inv_sqrt; + invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; + val = (3LL << 32) - ((u64)vars->count * invsqrt2); + + val >>= 2; /* avoid overflow in following multiply */ + val = (val * invsqrt) >> (32 - 2 + 1); + + 
vars->rec_inv_sqrt = val; +} + +static void cobalt_invsqrt(struct cobalt_vars *vars) +{ + if (vars->count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; + else + cobalt_newton_step(vars); +} + +/* There is a big difference in timing between the accurate values placed in + * the cache and the approximations given by a single Newton step for small + * count values, particularly when stepping from count 1 to 2 or vice versa. + * Above 16, a single Newton step gives sufficient accuracy in either + * direction, given the precision stored. + * + * The magnitude of the error when stepping up to count 2 is such as to give + * the value that *should* have been produced at count 4. + */ + +static void cobalt_cache_init(void) +{ + struct cobalt_vars v; + + memset(&v, 0, sizeof(v)); + v.rec_inv_sqrt = ~0U; + cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; + + for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + + cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; + } +} + +static void cobalt_vars_init(struct cobalt_vars *vars) +{ + memset(vars, 0, sizeof(*vars)); + + if (!cobalt_rec_inv_sqrt_cache[0]) { + cobalt_cache_init(); + cobalt_rec_inv_sqrt_cache[0] = ~0; + } +} + +/* CoDel control_law is t + interval/sqrt(count) + * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid + * both sqrt() and divide operation. + */ +static ktime_t cobalt_control(ktime_t t, + u64 interval, + u32 rec_inv_sqrt) +{ + return ktime_add_ns(t, reciprocal_scale(interval, + rec_inv_sqrt)); +} + +/* Call this when a packet had to be dropped due to queue overflow. Returns + * true if the BLUE state was quiescent before but active after this call. 
+ */ +static bool cobalt_queue_full(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool up = false; + + if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + up = !vars->p_drop; + vars->p_drop += p->p_inc; + if (vars->p_drop < p->p_inc) + vars->p_drop = ~0; + vars->blue_timer = now; + } + vars->dropping = true; + vars->drop_next = now; + if (!vars->count) + vars->count = 1; + + return up; +} + +/* Call this when the queue was serviced but turned out to be empty. Returns + * true if the BLUE state was active before but quiescent after this call. + */ +static bool cobalt_queue_empty(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool down = false; + + if (vars->p_drop && + ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + if (vars->p_drop < p->p_dec) + vars->p_drop = 0; + else + vars->p_drop -= p->p_dec; + vars->blue_timer = now; + down = !vars->p_drop; + } + vars->dropping = false; + + if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + } + + return down; +} + +/* Call this with a freshly dequeued packet for possible congestion marking. + * Returns true as an instruction to drop the packet, false for delivery. + */ +static bool cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb) +{ + bool next_due, over_target, drop = false; + ktime_t schedule; + u64 sojourn; + +/* The 'schedule' variable records, in its sign, whether 'now' is before or + * after 'drop_next'. This allows 'drop_next' to be updated before the next + * scheduling decision is actually branched, without destroying that + * information. Similarly, the first 'schedule' value calculated is preserved + * in the boolean 'next_due'. 
+ * + * As for 'drop_next', we take advantage of the fact that 'interval' is both + * the delay between first exceeding 'target' and the first signalling event, + * *and* the scaling factor for the signalling frequency. It's therefore very + * natural to use a single mechanism for both purposes, and eliminates a + * significant amount of reference Codel's spaghetti code. To help with this, + * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close + * as possible to 1.0 in fixed-point. + */ + + sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + schedule = ktime_sub(now, vars->drop_next); + over_target = sojourn > p->target && + sojourn > p->mtu_time * 4; + next_due = vars->count && ktime_to_ns(schedule) >= 0; + + vars->ecn_marked = false; + + if (over_target) { + if (!vars->dropping) { + vars->dropping = true; + vars->drop_next = cobalt_control(now, + p->interval, + vars->rec_inv_sqrt); + } + if (!vars->count) + vars->count = 1; + } else if (vars->dropping) { + vars->dropping = false; + } + + if (next_due && vars->dropping) { + /* Use ECN mark if possible, otherwise drop */ + drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + + vars->count++; + if (!vars->count) + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + } else { + while (next_due) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + next_due = vars->count && ktime_to_ns(schedule) >= 0; + } + } + + /* Simple BLUE implementation. Lack of ECN is deliberate. 
*/ + if (vars->p_drop) + drop |= (prandom_u32() < vars->p_drop); + + /* Overload the drop_next field as an activity timeout */ + if (!vars->count) + vars->drop_next = ktime_add_ns(now, p->interval); + else if (ktime_to_ns(schedule) > 0 && !drop) + vars->drop_next = now; + + return drop; +} + +/* Cake has several subtle multiple bit settings. In these cases you + * would be matching triple isolate mode as well. + */ + +static bool cake_dsrc(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; +} + +static bool cake_ddst(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; +} + +static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, + int flow_mode) +{ + u32 flow_hash = 0, srchost_hash, dsthost_hash; + u16 reduced_hash, srchost_idx, dsthost_idx; + struct flow_keys keys, host_keys; + + if (unlikely(flow_mode == CAKE_FLOW_NONE)) + return 0; + + skb_flow_dissect_flow_keys(skb, &keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + /* flow_hash_from_keys() sorts the addresses by value, so we have + * to preserve their order in a separate data structure to treat + * src and dst host addresses as independently selectable. 
+ */ + host_keys = keys; + host_keys.ports.ports = 0; + host_keys.basic.ip_proto = 0; + host_keys.keyid.keyid = 0; + host_keys.tags.flow_label = 0; + + switch (host_keys.control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + host_keys.addrs.v4addrs.src = 0; + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + host_keys.addrs.v4addrs.dst = 0; + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + memset(&host_keys.addrs.v6addrs.src, 0, + sizeof(host_keys.addrs.v6addrs.src)); + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + memset(&host_keys.addrs.v6addrs.dst, 0, + sizeof(host_keys.addrs.v6addrs.dst)); + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + default: + dsthost_hash = 0; + srchost_hash = 0; + } + + /* This *must* be after the above switch, since as a + * side-effect it sorts the src and dst addresses. + */ + if (flow_mode & CAKE_FLOW_FLOWS) + flow_hash = flow_hash_from_keys(&keys); + + if (!(flow_mode & CAKE_FLOW_FLOWS)) { + if (flow_mode & CAKE_FLOW_SRC_IP) + flow_hash ^= srchost_hash; + + if (flow_mode & CAKE_FLOW_DST_IP) + flow_hash ^= dsthost_hash; + } + + reduced_hash = flow_hash % CAKE_QUEUES; + + /* set-associative hashing */ + /* fast path if no hash collision (direct lookup succeeds) */ + if (likely(q->tags[reduced_hash] == flow_hash && + q->flows[reduced_hash].set)) { + q->way_directs++; + } else { + u32 inner_hash = reduced_hash % CAKE_SET_WAYS; + u32 outer_hash = reduced_hash - inner_hash; + bool allocate_src = false; + bool allocate_dst = false; + u32 i, k; + + /* check if any active queue in the set is reserved for + * this flow. 
+ */ + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->tags[outer_hash + k] == flow_hash) { + if (i) + q->way_hits++; + + if (!q->flows[outer_hash + k].set) { + /* need to increment host refcnts */ + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + } + + goto found; + } + } + + /* no queue is reserved for this flow, look for an + * empty one. + */ + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->flows[outer_hash + k].set) { + q->way_misses++; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + goto found; + } + } + + /* With no empty queues, default to the original + * queue, accept the collision, update the host tags. + */ + q->way_collisions++; + q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--; + q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); +found: + /* reserve queue for future packets in same flow */ + reduced_hash = outer_hash + k; + q->tags[reduced_hash] = flow_hash; + + if (allocate_src) { + srchost_idx = srchost_hash % CAKE_QUEUES; + inner_hash = srchost_idx % CAKE_SET_WAYS; + outer_hash = srchost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].srchost_tag == + srchost_hash) + goto found_src; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].srchost_refcnt) + break; + } + q->hosts[outer_hash + k].srchost_tag = srchost_hash; +found_src: + srchost_idx = outer_hash + k; + q->hosts[srchost_idx].srchost_refcnt++; + q->flows[reduced_hash].srchost = srchost_idx; + } + + if (allocate_dst) { + dsthost_idx = dsthost_hash % CAKE_QUEUES; + inner_hash = dsthost_idx % CAKE_SET_WAYS; + outer_hash = dsthost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % 
CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].dsthost_tag == + dsthost_hash) + goto found_dst; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].dsthost_refcnt) + break; + } + q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; +found_dst: + dsthost_idx = outer_hash + k; + q->hosts[dsthost_idx].dsthost_refcnt++; + q->flows[reduced_hash].dsthost = dsthost_idx; + } + } + + return reduced_hash; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ +/* remove one skb from head of slot queue */ + +static struct sk_buff *dequeue_head(struct cake_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + } + + return skb; +} + +/* add skb to flow queue (tail add) */ + +static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static u64 cake_ewma(u64 avg, u64 sample, u32 shift) +{ + avg -= avg >> shift; + avg += sample >> shift; + return avg; +} + +static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + struct cake_heap_entry jj = q->overflow_heap[j]; + + q->overflow_heap[i] = jj; + q->overflow_heap[j] = ii; + + q->tins[ii.t].overflow_idx[ii.b] = j; + q->tins[jj.t].overflow_idx[jj.b] = i; +} + +static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + + return q->tins[ii.t].backlogs[ii.b]; +} + +static void cake_heapify(struct cake_sched_data *q, u16 i) +{ + static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; + u32 mb = cake_heap_get_backlog(q, i); + u32 m = i; + + while (m < a) { + u32 l = m + m + 1; + u32 r = l + 1; + + if (l < a) { + u32 lb = cake_heap_get_backlog(q, l); + + if (lb > mb) { + m = l; + mb = lb; + } + } + + if (r < a) { + u32 rb = 
cake_heap_get_backlog(q, r); + + if (rb > mb) { + m = r; + mb = rb; + } + } + + if (m != i) { + cake_heap_swap(q, i, m); + i = m; + } else { + break; + } + } +} + +static void cake_heapify_up(struct cake_sched_data *q, u16 i) +{ + while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { + u16 p = (i - 1) >> 1; + u32 ib = cake_heap_get_backlog(q, i); + u32 pb = cake_heap_get_backlog(q, p); + + if (ib > pb) { + cake_heap_swap(q, i, p); + i = p; + } else { + break; + } + } +} + +static int cake_advance_shaper(struct cake_sched_data *q, + struct cake_tin_data *b, + struct sk_buff *skb, + ktime_t now, bool drop) +{ + u32 len = qdisc_pkt_len(skb); + + /* charge packet bandwidth to this tin + * and to the global shaper. + */ + if (q->rate_ns) { + u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; + u64 global_dur = (len * q->rate_ns) >> q->rate_shft; + u64 failsafe_dur = global_dur + (global_dur >> 1); + + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = ktime_add_ns(b->time_next_packet, + tin_dur); + + else if (ktime_before(b->time_next_packet, + ktime_add_ns(now, tin_dur))) + b->time_next_packet = ktime_add_ns(now, tin_dur); + + q->time_next_packet = ktime_add_ns(q->time_next_packet, + global_dur); + if (!drop) + q->failsafe_next_packet = \ + ktime_add_ns(q->failsafe_next_packet, + failsafe_dur); + } + return len; +} + +static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + ktime_t now = ktime_get(); + u32 idx = 0, tin = 0, len; + struct cake_heap_entry qq; + struct cake_tin_data *b; + struct cake_flow *flow; + struct sk_buff *skb; + + if (!q->overflow_timeout) { + int i; + /* Build fresh max-heap */ + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + cake_heapify(q, i); + } + q->overflow_timeout = 65535; + + /* select longest queue for pruning */ + qq = q->overflow_heap[0]; + tin = qq.t; + idx = qq.b; + + b = &q->tins[tin]; + flow = &b->flows[idx]; + skb = 
dequeue_head(flow); + if (unlikely(!skb)) { + /* heap has gone wrong, rebuild it next time */ + q->overflow_timeout = 0; + return idx + (tin << 16); + } + + if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count++; + + len = qdisc_pkt_len(skb); + q->buffer_used -= skb->truesize; + b->backlogs[idx] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + qdisc_tree_reduce_backlog(sch, 1, len); + + flow->dropped++; + b->tin_dropped++; + sch->qstats.drops++; + + __qdisc_drop(skb, to_free); + sch->q.qlen--; + + cake_heapify(q, 0); + + return idx + (tin << 16); +} + +static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, + struct sk_buff *skb, int flow_mode, int *qerr) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + int result; + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + return cake_hash(t, skb, flow_mode) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= CAKE_QUEUES) + return TC_H_MIN(res.classid); + } + return 0; +} + +static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int len = qdisc_pkt_len(skb); + int uninitialized_var(ret); + ktime_t now = ktime_get(); + struct cake_tin_data *b; + struct cake_flow *flow; + u32 idx, tin; + + tin = 0; + b = &q->tins[tin]; + + /* choose flow to insert into */ + idx = cake_classify(sch, b, skb, q->flow_mode, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return ret; + } + idx--; + flow = &b->flows[idx]; + + /* ensure shaper 
state isn't stale */ + if (!b->tin_backlog) { + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = now; + + if (!sch->q.qlen) { + if (ktime_before(q->time_next_packet, now)) { + q->failsafe_next_packet = now; + q->time_next_packet = now; + } else if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = \ + min(ktime_to_ns(q->time_next_packet), + ktime_to_ns( + q->failsafe_next_packet)); + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } + } + } + + if (unlikely(len > b->max_skblen)) + b->max_skblen = len; + + cobalt_set_enqueue_time(skb, now); + flow_queue_add(flow, skb); + + sch->q.qlen++; + q->buffer_used += skb->truesize; + + /* stats */ + b->packets++; + b->bytes += len; + b->backlogs[idx] += len; + b->tin_backlog += len; + sch->qstats.backlog += len; + q->avg_window_bytes += len; + + if (q->overflow_timeout) + cake_heapify_up(q, b->overflow_idx[idx]); + + /* incoming bandwidth capacity estimate */ + q->avg_window_bytes = 0; + q->last_packet_time = now; + + /* flowchain */ + if (!flow->set || flow->set == CAKE_SET_DECAYING) { + struct cake_host *srchost = &b->hosts[flow->srchost]; + struct cake_host *dsthost = &b->hosts[flow->dsthost]; + u16 host_load = 1; + + if (!flow->set) { + list_add_tail(&flow->flowchain, &b->new_flows); + } else { + b->decaying_flow_count--; + list_move_tail(&flow->flowchain, &b->new_flows); + } + flow->set = CAKE_SET_SPARSE; + b->sparse_flow_count++; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + flow->deficit = (b->flow_quantum * + quantum_div[host_load]) >> 16; + } else if (flow->set == CAKE_SET_SPARSE_WAIT) { + /* this flow was empty, accounted as a sparse flow, but actually + * in the bulk rotation. 
+ */ + flow->set = CAKE_SET_BULK; + b->sparse_flow_count--; + b->bulk_flow_count++; + } + + if (q->buffer_used > q->buffer_max_used) + q->buffer_max_used = q->buffer_used; + + if (q->buffer_used > q->buffer_limit) { + u32 dropped = 0; + + while (q->buffer_used > q->buffer_limit) { + dropped++; + cake_drop(sch, to_free); + } + b->drop_overlimit += dropped; + } + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_flow *flow = &b->flows[q->cur_flow]; + struct sk_buff *skb = NULL; + u32 len; + + if (flow->head) { + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + b->backlogs[q->cur_flow] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + q->buffer_used -= skb->truesize; + sch->q.qlen--; + + if (q->overflow_timeout) + cake_heapify(q, b->overflow_idx[q->cur_flow]); + } + return skb; +} + +/* Discard leftover packets from a tin no longer in use. 
*/ +static void cake_clear_tin(struct Qdisc *sch, u16 tin) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + q->cur_tin = tin; + for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) + while (!!(skb = cake_dequeue_one(sch))) + kfree_skb(skb); +} + +static struct sk_buff *cake_dequeue(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_host *srchost, *dsthost; + ktime_t now = ktime_get(); + struct cake_flow *flow; + struct list_head *head; + bool first_flow = true; + struct sk_buff *skb; + u16 host_load; + u64 delay; + u32 len; + +begin: + if (!sch->q.qlen) + return NULL; + + /* global hard shaper */ + if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + return NULL; + } + + /* Choose a class to work on. */ + if (!q->rate_ns) { + /* In unlimited mode, can't rely on shaper timings, just balance + * with DRR + */ + bool wrapped = false, empty = true; + + while (b->tin_deficit < 0 || + !(b->sparse_flow_count + b->bulk_flow_count)) { + if (b->tin_deficit <= 0) + b->tin_deficit += b->tin_quantum_band; + if (b->sparse_flow_count + b->bulk_flow_count) + empty = false; + + q->cur_tin++; + b++; + if (q->cur_tin >= q->tin_cnt) { + q->cur_tin = 0; + b = q->tins; + + if (wrapped) { + /* It's possible for q->qlen to be + * nonzero when we actually have no + * packets anywhere. + */ + if (empty) + return NULL; + } else { + wrapped = true; + } + } + } + } else { + /* In shaped mode, choose: + * - Highest-priority tin with queue and meeting schedule, or + * - The earliest-scheduled tin with queue. 
+ */ + ktime_t best_time = KTIME_MAX; + int tin, best_tin = 0; + + for (tin = 0; tin < q->tin_cnt; tin++) { + b = q->tins + tin; + if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { + ktime_t time_to_pkt = \ + ktime_sub(b->time_next_packet, now); + + if (ktime_to_ns(time_to_pkt) <= 0 || + ktime_compare(time_to_pkt, + best_time) <= 0) { + best_time = time_to_pkt; + best_tin = tin; + } + } + } + + q->cur_tin = best_tin; + b = q->tins + best_tin; + + /* No point in going further if no packets to deliver. */ + if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) + return NULL; + } + +retry: + /* service this class */ + head = &b->decaying_flows; + if (!first_flow || list_empty(head)) { + head = &b->new_flows; + if (list_empty(head)) { + head = &b->old_flows; + if (unlikely(list_empty(head))) { + head = &b->decaying_flows; + if (unlikely(list_empty(head))) + goto begin; + } + } + } + flow = list_first_entry(head, struct cake_flow, flowchain); + q->cur_flow = flow - b->flows; + first_flow = false; + + /* triple isolation (modified DRR++) */ + srchost = &b->hosts[flow->srchost]; + dsthost = &b->hosts[flow->dsthost]; + host_load = 1; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + WARN_ON(host_load > CAKE_QUEUES); + + /* flow isolation (DRR++) */ + if (flow->deficit <= 0) { + /* The shifted prandom_u32() is a way to apply dithering to + * avoid accumulating roundoff errors + */ + flow->deficit += (b->flow_quantum * quantum_div[host_load] + + (prandom_u32() >> 16)) >> 16; + list_move_tail(&flow->flowchain, &b->old_flows); + + /* Keep all flows with deficits out of the sparse and decaying + * rotations. 
No non-empty flow can go into the decaying + * rotation, so they can't get deficits + */ + if (flow->set == CAKE_SET_SPARSE) { + if (flow->head) { + b->sparse_flow_count--; + b->bulk_flow_count++; + flow->set = CAKE_SET_BULK; + } else { + /* we've moved it to the bulk rotation for + * correct deficit accounting but we still want + * to count it as a sparse flow, not a bulk one. + */ + flow->set = CAKE_SET_SPARSE_WAIT; + } + } + goto retry; + } + + /* Retrieve a packet via the AQM */ + while (1) { + skb = cake_dequeue_one(sch); + if (!skb) { + /* this queue was actually empty */ + if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count--; + + if (flow->cvars.p_drop || flow->cvars.count || + ktime_before(now, flow->cvars.drop_next)) { + /* keep in the flowchain until the state has + * decayed to rest + */ + list_move_tail(&flow->flowchain, + &b->decaying_flows); + if (flow->set == CAKE_SET_BULK) { + b->bulk_flow_count--; + b->decaying_flow_count++; + } else if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) { + b->sparse_flow_count--; + b->decaying_flow_count++; + } + flow->set = CAKE_SET_DECAYING; + } else { + /* remove empty queue from the flowchain */ + list_del_init(&flow->flowchain); + if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) + b->sparse_flow_count--; + else if (flow->set == CAKE_SET_BULK) + b->bulk_flow_count--; + else + b->decaying_flow_count--; + + flow->set = CAKE_SET_NONE; + srchost->srchost_refcnt--; + dsthost->dsthost_refcnt--; + } + goto begin; + } + + /* Last packet in queue may be marked, shouldn't be dropped */ + if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb) || + !flow->head) + break; + + flow->dropped++; + b->tin_dropped++; + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); + qdisc_qstats_drop(sch); + kfree_skb(skb); + } + + b->tin_ecn_mark += !!flow->cvars.ecn_marked; + qdisc_bstats_update(sch, skb); + + /* collect delay stats */ + delay = 
ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + b->avge_delay = cake_ewma(b->avge_delay, delay, 8); + b->peak_delay = cake_ewma(b->peak_delay, delay, + delay > b->peak_delay ? 2 : 8); + b->base_delay = cake_ewma(b->base_delay, delay, + delay < b->base_delay ? 2 : 8); + + len = cake_advance_shaper(q, b, skb, now, false); + flow->deficit -= len; + b->tin_deficit -= len; + + if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } else if (!sch->q.qlen) { + int i; + + for (i = 0; i < q->tin_cnt; i++) { + if (q->tins[i].decaying_flow_count) { + ktime_t next = \ + ktime_add_ns(now, + q->tins[i].cparams.target); + + qdisc_watchdog_schedule_ns(&q->watchdog, + ktime_to_ns(next)); + break; + } + } + } + + if (q->overflow_timeout) + q->overflow_timeout--; + + return skb; +} + +static void cake_reset(struct Qdisc *sch) +{ + u32 c; + + for (c = 0; c < CAKE_MAX_TINS; c++) + cake_clear_tin(sch, c); +} + +static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { + [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, + [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_ATM] = { .type = NLA_U32 }, + [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, + [TCA_CAKE_RTT] = { .type = NLA_U32 }, + [TCA_CAKE_TARGET] = { .type = NLA_U32 }, + [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, + [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, + [TCA_CAKE_NAT] = { .type = NLA_U32 }, + [TCA_CAKE_RAW] = { .type = NLA_U32 }, + [TCA_CAKE_WASH] = { .type = NLA_U32 }, + [TCA_CAKE_MPU] = { .type = NLA_U32 }, + [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, + [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, +}; + +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, + u64 target_ns, u64 rtt_est_ns) +{ + /* convert byte-rate into time-per-byte + * so it will always unwedge in reasonable time. 
+ */ + static const u64 MIN_RATE = 64; + u32 byte_target = mtu; + u64 byte_target_ns; + u8 rate_shft = 0; + u64 rate_ns = 0; + + b->flow_quantum = 1514; + if (rate) { + b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); + rate_shft = 34; + rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; + rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); + while (!!(rate_ns >> 34)) { + rate_ns >>= 1; + rate_shft--; + } + } /* else unlimited, ie. zero delay */ + + b->tin_rate_bps = rate; + b->tin_rate_ns = rate_ns; + b->tin_rate_shft = rate_shft; + + byte_target_ns = (byte_target * rate_ns) >> rate_shft; + + b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); + b->cparams.interval = max(rtt_est_ns + + b->cparams.target - target_ns, + b->cparams.target * 2); + b->cparams.mtu_time = byte_target_ns; + b->cparams.p_inc = 1 << 24; /* 1/256 */ + b->cparams.p_dec = 1 << 20; /* 1/4096 */ +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[0]; + int c, ft = 0; + + q->tin_cnt = 1; + cake_set_rate(b, q->rate_bps, psched_mtu(qdisc_dev(sch)), + us_to_ns(q->target), us_to_ns(q->interval)); + b->tin_quantum_band = 65535; + b->tin_quantum_prio = 65535; + + for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { + cake_clear_tin(sch, c); + q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; + } + + q->rate_ns = q->tins[ft].tin_rate_ns; + q->rate_shft = q->tins[ft].tin_rate_shft; + + if (q->buffer_config_limit) { + q->buffer_limit = q->buffer_config_limit; + } else if (q->rate_bps) { + u64 t = q->rate_bps * q->interval; + + do_div(t, USEC_PER_SEC / 4); + q->buffer_limit = max_t(u32, t, 4U << 20); + } else { + q->buffer_limit = ~0; + } + + sch->flags &= ~TCQ_F_CAN_BYPASS; + + q->buffer_limit = min(q->buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit)); +} + +static int cake_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct 
cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CAKE_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CAKE_BASE_RATE64]) + q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + + if (tb[TCA_CAKE_FLOW_MODE]) + q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & + CAKE_FLOW_MASK); + + if (tb[TCA_CAKE_RTT]) { + q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + + if (!q->interval) + q->interval = 1; + } + + if (tb[TCA_CAKE_TARGET]) { + q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); + + if (!q->target) + q->target = 1; + } + + if (tb[TCA_CAKE_MEMORY]) + q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + + if (q->tins) { + sch_tree_lock(sch); + cake_reconfigure(sch); + sch_tree_unlock(sch); + } + + return 0; +} + +static void cake_destroy(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); + tcf_block_put(q->block); + kvfree(q->tins); +} + +static int cake_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int i, j, err; + + sch->limit = 10240; + q->tin_mode = CAKE_DIFFSERV_BESTEFFORT; + q->flow_mode = CAKE_FLOW_TRIPLE; + + q->rate_bps = 0; /* unlimited by default */ + + q->interval = 100000; /* 100ms default */ + q->target = 5000; /* 5ms: codel RFC argues + * for 5 to 10% of interval + */ + + q->cur_tin = 0; + q->cur_flow = 0; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) { + int err = cake_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + quantum_div[0] = ~0; + for (i = 1; i <= CAKE_QUEUES; i++) + quantum_div[i] = 65535 / i; + + q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), + GFP_KERNEL); + if (!q->tins) + goto nomem; + + for (i = 0; i < CAKE_MAX_TINS; i++) { + 
struct cake_tin_data *b = q->tins + i; + + INIT_LIST_HEAD(&b->new_flows); + INIT_LIST_HEAD(&b->old_flows); + INIT_LIST_HEAD(&b->decaying_flows); + b->sparse_flow_count = 0; + b->bulk_flow_count = 0; + b->decaying_flow_count = 0; + + for (j = 0; j < CAKE_QUEUES; j++) { + struct cake_flow *flow = b->flows + j; + u32 k = j * CAKE_MAX_TINS + i; + + INIT_LIST_HEAD(&flow->flowchain); + cobalt_vars_init(&flow->cvars); + + q->overflow_heap[k].t = i; + q->overflow_heap[k].b = j; + b->overflow_idx[j] = k; + } + } + + cake_reconfigure(sch); + q->avg_peak_bandwidth = q->rate_bps; + q->min_netlen = ~0; + q->min_adjlen = ~0; + return 0; + +nomem: + cake_destroy(sch); + return -ENOMEM; +} + +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, + TCA_CAKE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, + q->flow_mode & CAKE_FLOW_MASK)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tstats, *ts; + int i; + + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ + data, TCA_CAKE_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + 
PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); + PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); + PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); + PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); + PUT_STAT_U32(MAX_NETLEN, q->max_netlen); + PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); + PUT_STAT_U32(MIN_NETLEN, q->min_netlen); + PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + +#undef PUT_STAT_U32 +#undef PUT_STAT_U64 + + tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); + if (!tstats) + goto nla_put_failure; + +#define PUT_TSTAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_TSTAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ + data, TCA_CAKE_TIN_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + ts = nla_nest_start(d->skb, i + 1); + if (!ts) + goto nla_put_failure; + + PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); + PUT_TSTAT_U64(SENT_BYTES64, b->bytes); + PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); + + PUT_TSTAT_U32(TARGET_US, + ktime_to_us(ns_to_ktime(b->cparams.target))); + PUT_TSTAT_U32(INTERVAL_US, + ktime_to_us(ns_to_ktime(b->cparams.interval))); + + PUT_TSTAT_U32(SENT_PACKETS, b->packets); + PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); + PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); + PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); + + PUT_TSTAT_U32(PEAK_DELAY_US, + ktime_to_us(ns_to_ktime(b->peak_delay))); + PUT_TSTAT_U32(AVG_DELAY_US, + ktime_to_us(ns_to_ktime(b->avge_delay))); + PUT_TSTAT_U32(BASE_DELAY_US, + ktime_to_us(ns_to_ktime(b->base_delay))); + + PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); + PUT_TSTAT_U32(WAY_MISSES, b->way_misses); + PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); + + PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + + b->decaying_flow_count); + 
PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); + PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); + PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); + + PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); + nla_nest_end(d->skb, ts); + } + +#undef PUT_TSTAT_U32 +#undef PUT_TSTAT_U64 + + nla_nest_end(d->skb, tstats); + return nla_nest_end(d->skb, stats); + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long cake_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void cake_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->block; +} + +static int cake_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct cake_sched_data *q = qdisc_priv(sch); + const struct cake_flow *flow = NULL; + struct gnet_stats_queue qs = { 0 }; + struct nlattr *stats; + u32 idx = cl - 1; + + if (idx < CAKE_QUEUES * q->tin_cnt) { + const struct cake_tin_data *b = &q->tins[idx / CAKE_QUEUES]; + const struct sk_buff *skb; + + flow = &b->flows[idx % CAKE_QUEUES]; + + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); + } + qs.backlog = b->backlogs[idx % CAKE_QUEUES]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) + return -1; + if (flow) { + ktime_t now = ktime_get(); + + stats = nla_nest_start(d->skb, TCA_STATS_APP); + if (!stats) + 
return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_S32(attr, data) do { \ + if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_S32(DEFICIT, flow->deficit); + PUT_STAT_U32(DROPPING, flow->cvars.dropping); + PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); + PUT_STAT_U32(P_DROP, flow->cvars.p_drop); + if (flow->cvars.p_drop) { + PUT_STAT_S32(BLUE_TIMER_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.blue_timer))); + } + if (flow->cvars.dropping) { + PUT_STAT_S32(DROP_NEXT_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.drop_next))); + } + + if (nla_nest_end(d->skb, stats) < 0) + return -1; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cake_sched_data *q = qdisc_priv(sch); + unsigned int i, j; + + if (arg->stop) + return; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + for (j = 0; j < CAKE_QUEUES; j++) { + if (list_empty(&b->flows[j].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} + +static const struct Qdisc_class_ops cake_class_ops = { + .leaf = cake_leaf, + .find = cake_find, + .tcf_block = cake_tcf_block, + .bind_tcf = cake_bind, + .unbind_tcf = cake_unbind, + .dump = cake_dump_class, + .dump_stats = cake_dump_class_stats, + .walk = cake_walk, +}; + +static struct Qdisc_ops cake_qdisc_ops __read_mostly = { + .cl_ops = &cake_class_ops, + .id = "cake", + .priv_size = sizeof(struct cake_sched_data), + .enqueue = cake_enqueue, + .dequeue = cake_dequeue, + .peek = qdisc_peek_dequeued, + .init = cake_init, + .reset = cake_reset, + .destroy = cake_destroy, + .change = cake_change, + .dump = 
cake_dump, + .dump_stats = cake_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cake_module_init(void) +{ + return register_qdisc(&cake_qdisc_ops); +} + +static void __exit cake_module_exit(void) +{ + unregister_qdisc(&cake_qdisc_ops); +} + +module_init(cake_module_init) +module_exit(cake_module_exit) +MODULE_AUTHOR("Jonathan Morton"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("The CAKE shaper."); -- cgit v1.2.3 From 7298de9cd7255a783ba93533acbf1c2b0a9c582d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add ingress mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ingress mode is meant to be enabled when CAKE runs downlink of the actual bottleneck (such as on an IFB device). The mode changes the shaper to also account dropped packets to the shaped rate, as these have already traversed the bottleneck. Enabling ingress mode will also tune the AQM to always keep at least two packets queued *for each flow*. This is done by scaling the minimum queue occupancy level that will disable the AQM by the number of active bulk flows. The rationale for this is that retransmits are more expensive in ingress mode, since dropped packets have to traverse the bottleneck again when they are retransmitted; thus, being more lenient and keeping a minimum number of packets queued will improve throughput in cases where the number of active flows are so large that they saturate the bottleneck even at their minimum window size. This commit also adds a separate switch to enable ingress mode rate autoscaling. If enabled, the autoscaling code will observe the actual traffic rate and adjust the shaper rate to match it. This can help avoid latency increases in the case where the actual bottleneck rate decreases below the shaped rate. The scaling filters out spikes by an EWMA filter. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ea0272615d63..2950a8d07887 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -435,7 +435,8 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, static bool cobalt_should_drop(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now, - struct sk_buff *skb) + struct sk_buff *skb, + u32 bulk_flows) { bool next_due, over_target, drop = false; ktime_t schedule; @@ -459,6 +460,7 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); schedule = ktime_sub(now, vars->drop_next); over_target = sojourn > p->target && + sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; next_due = vars->count && ktime_to_ns(schedule) >= 0; @@ -881,6 +883,9 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) b->tin_dropped++; sch->qstats.drops++; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, skb, now, true); + __qdisc_drop(skb, to_free); sch->q.qlen--; @@ -921,6 +926,8 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, return 0; } +static void cake_reconfigure(struct Qdisc *sch); + static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -988,8 +995,46 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, cake_heapify_up(q, b->overflow_idx[idx]); /* incoming bandwidth capacity estimate */ - q->avg_window_bytes = 0; - q->last_packet_time = now; + if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { + u64 packet_interval = \ + ktime_to_ns(ktime_sub(now, q->last_packet_time)); + + if (packet_interval > NSEC_PER_SEC) + packet_interval = NSEC_PER_SEC; + + /* filter out short-term bursts, eg. 
wifi aggregation */ + q->avg_packet_interval = \ + cake_ewma(q->avg_packet_interval, + packet_interval, + (packet_interval > q->avg_packet_interval ? + 2 : 8)); + + q->last_packet_time = now; + + if (packet_interval > q->avg_packet_interval) { + u64 window_interval = \ + ktime_to_ns(ktime_sub(now, + q->avg_window_begin)); + u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; + + do_div(b, window_interval); + q->avg_peak_bandwidth = + cake_ewma(q->avg_peak_bandwidth, b, + b > q->avg_peak_bandwidth ? 2 : 8); + q->avg_window_bytes = 0; + q->avg_window_begin = now; + + if (ktime_after(now, + ktime_add_ms(q->last_reconfig_time, + 250))) { + q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; + cake_reconfigure(sch); + } + } + } else { + q->avg_window_bytes = 0; + q->last_packet_time = now; + } /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { @@ -1268,15 +1313,27 @@ retry: } /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb) || + if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))) || !flow->head) break; + /* drop this packet, get another one */ + if (q->rate_flags & CAKE_FLAG_INGRESS) { + len = cake_advance_shaper(q, b, skb, + now, true); + flow->deficit -= len; + b->tin_deficit -= len; + } flow->dropped++; b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); kfree_skb(skb); + if (q->rate_flags & CAKE_FLAG_INGRESS) + goto retry; } b->tin_ecn_mark += !!flow->cvars.ecn_marked; @@ -1459,6 +1516,20 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, q->target = 1; } + if (tb[TCA_CAKE_AUTORATE]) { + if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) + q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + } + + if (tb[TCA_CAKE_INGRESS]) { + if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) + q->rate_flags |= CAKE_FLAG_INGRESS; + 
else + q->rate_flags &= ~CAKE_FLAG_INGRESS; + } + if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); @@ -1582,6 +1653,14 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_AUTORATE, + !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_INGRESS, + !!(q->rate_flags & CAKE_FLAG_INGRESS))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 8b7138814f29933898ecd31dfc83e35a30ee69f5 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add optional ACK filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ACK filter is an optional feature of CAKE which is designed to improve performance on links with very asymmetrical rate limits. On such links (which are unfortunately quite prevalent, especially for DSL and cable subscribers), the downstream throughput can be limited by the number of ACKs capable of being transmitted in the *upstream* direction. Filtering ACKs can, in general, have adverse effects on TCP performance because it interferes with ACK clocking (especially in slow start), and it reduces the flow's resiliency to ACKs being dropped further along the path. To alleviate these drawbacks, the ACK filter in CAKE tries its best to always keep enough ACKs queued to ensure forward progress in the TCP flow being filtered. It does this by only filtering redundant ACKs. In its default 'conservative' mode, the filter will always keep at least two redundant ACKs in the queue, while in 'aggressive' mode, it will filter down to a single ACK. The ACK filter works by inspecting the per-flow queue on every packet enqueue. 
Starting at the head of the queue, the filter looks for another eligible packet to drop (so the ACK being dropped is always closer to the head of the queue than the packet being enqueued). An ACK is eligible only if it ACKs *fewer* bytes than the new packet being enqueued, including any SACK options. This prevents duplicate ACKs from being filtered, to avoid interfering with retransmission logic. In addition, we check TCP header options and only drop ACKs whose options are known not to interfere with sender state. In particular, packets with unknown option codes are never dropped. In aggressive mode, an eligible packet is always dropped, while in conservative mode, at least two ACKs are kept in the queue. Only pure ACKs (with no data segments) are considered eligible for dropping, but when an ACK with data segments is enqueued, this can cause another pure ACK to become eligible for dropping. The approach described above ensures that this ACK filter avoids most of the drawbacks of a naive filtering mechanism that only keeps flow state but does not inspect the queue. This is the rationale for including the ACK filter in CAKE itself rather than as a separate module (as the TC filter, for instance). Our performance evaluation has shown that on a 30/1 Mbps link with a bidirectional traffic test (RRUL), turning on the ACK filter on the upstream link improves downstream throughput by ~20% (both modes) and upstream throughput by ~12% in conservative mode and ~40% in aggressive mode, at the cost of ~5ms of inter-flow latency due to the increased congestion. In *really* pathological cases, the effect can be a lot more; for instance, the ACK filter increases the achievable downstream throughput on a link with 100 Kbps in the upstream direction by an order of magnitude (from ~2.5 Mbps to ~25 Mbps). Finally, even though we consider the ACK filter to be safer than most, we do not recommend turning it on everywhere: on more symmetrical link bandwidths the effect is negligible at best.
Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 454 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 452 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 2950a8d07887..930096d46c4f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -725,6 +725,433 @@ static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) skb->next = NULL; } +static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, + struct ipv6hdr *buf) +{ + unsigned int offset = skb_network_offset(skb); + struct iphdr *iph; + + iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); + + if (!iph) + return NULL; + + if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) + return skb_header_pointer(skb, offset + iph->ihl * 4, + sizeof(struct ipv6hdr), buf); + + else if (iph->version == 4) + return iph; + + else if (iph->version == 6) + return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), + buf); + + return NULL; +} + +static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, + void *buf, unsigned int bufsize) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct tcphdr *tcph; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; + struct tcphdr _tcph; + + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h) + return NULL; + + if (ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + + } else if (iph->protocol != IPPROTO_TCP) { + return NULL; + } + + } else if (ipv6h->version == 6) { 
+ if (ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + } else { + return NULL; + } + + tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!tcph) + return NULL; + + return skb_header_pointer(skb, offset, + min(__tcp_hdrlen(tcph), bufsize), buf); +} + +static const void *cake_get_tcpopt(const struct tcphdr *tcph, + int code, int *oplen) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + if (opcode == code) { + *oplen = opsize; + return ptr; + } + + ptr += opsize - 2; + length -= opsize; + } + + return NULL; +} + +/* Compare two SACK sequences. A sequence is considered greater if it SACKs more + * bytes than the other. In the case where both sequences ACKs bytes that the + * other doesn't, A is considered greater. DSACKs in A also makes A be + * considered greater. 
+ * + * @return -1, 0 or 1 as normal compare functions + */ +static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, + const struct tcphdr *tcph_b) +{ + const struct tcp_sack_block_wire *sack_a, *sack_b; + u32 ack_seq_a = ntohl(tcph_a->ack_seq); + u32 bytes_a = 0, bytes_b = 0; + int oplen_a, oplen_b; + bool first = true; + + sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); + sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); + + /* pointers point to option contents */ + oplen_a -= TCPOLEN_SACK_BASE; + oplen_b -= TCPOLEN_SACK_BASE; + + if (sack_a && oplen_a >= sizeof(*sack_a) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return -1; + else if (sack_b && oplen_b >= sizeof(*sack_b) && + (!sack_a || oplen_a < sizeof(*sack_a))) + return 1; + else if ((!sack_a || oplen_a < sizeof(*sack_a)) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return 0; + + while (oplen_a >= sizeof(*sack_a)) { + const struct tcp_sack_block_wire *sack_tmp = sack_b; + u32 start_a = get_unaligned_be32(&sack_a->start_seq); + u32 end_a = get_unaligned_be32(&sack_a->end_seq); + int oplen_tmp = oplen_b; + bool found = false; + + /* DSACK; always considered greater to prevent dropping */ + if (before(start_a, ack_seq_a)) + return -1; + + bytes_a += end_a - start_a; + + while (oplen_tmp >= sizeof(*sack_tmp)) { + u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); + u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); + + /* first time through we count the total size */ + if (first) + bytes_b += end_b - start_b; + + if (!after(start_b, start_a) && !before(end_b, end_a)) { + found = true; + if (!first) + break; + } + oplen_tmp -= sizeof(*sack_tmp); + sack_tmp++; + } + + if (!found) + return -1; + + oplen_a -= sizeof(*sack_a); + sack_a++; + first = false; + } + + /* If we made it this far, all ranges SACKed by A are covered by B, so + * either the SACKs are equal, or B SACKs more bytes. + */ + return bytes_b > bytes_a ? 
1 : 0; +} + +static void cake_tcph_get_tstamp(const struct tcphdr *tcph, + u32 *tsval, u32 *tsecr) +{ + const u8 *ptr; + int opsize; + + ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); + + if (ptr && opsize == TCPOLEN_TIMESTAMP) { + *tsval = get_unaligned_be32(ptr); + *tsecr = get_unaligned_be32(ptr + 4); + } +} + +static bool cake_tcph_may_drop(const struct tcphdr *tcph, + u32 tstamp_new, u32 tsecr_new) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + u32 tstamp, tsecr; + + /* 3 reserved flags must be unset to avoid future breakage + * ACK must be set + * ECE/CWR are handled separately + * All other flags URG/PSH/RST/SYN/FIN must be unset + * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) + * 0x00C00000 = CWR/ECE (handled separately) + * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 + */ + if (((tcp_flag_word(tcph) & + cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) + return false; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + switch (opcode) { + case TCPOPT_MD5SIG: /* doesn't influence state */ + break; + + case TCPOPT_SACK: /* stricter checking performed later */ + if (opsize % 8 != 2) + return false; + break; + + case TCPOPT_TIMESTAMP: + /* only drop timestamps lower than new */ + if (opsize != TCPOLEN_TIMESTAMP) + return false; + tstamp = get_unaligned_be32(ptr); + tsecr = get_unaligned_be32(ptr + 4); + if (after(tstamp, tstamp_new) || + after(tsecr, tsecr_new)) + return false; + break; + + case TCPOPT_MSS: /* these should only be set on SYN */ + case TCPOPT_WINDOW: + case TCPOPT_SACK_PERM: + case TCPOPT_FASTOPEN: + case TCPOPT_EXP: + default: /* don't drop if any unknown options are present */ + return false; + } + + ptr += opsize - 2; + length -= opsize; + } + + 
return true; +} + +static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, + struct cake_flow *flow) +{ + bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; + struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; + struct sk_buff *skb_check, *skb_prev = NULL; + const struct ipv6hdr *ipv6h, *ipv6h_check; + unsigned char _tcph[64], _tcph_check[64]; + const struct tcphdr *tcph, *tcph_check; + const struct iphdr *iph, *iph_check; + struct ipv6hdr _iph, _iph_check; + const struct sk_buff *skb; + int seglen, num_found = 0; + u32 tstamp = 0, tsecr = 0; + __be32 elig_flags = 0; + int sack_comp; + + /* no other possible ACKs to filter */ + if (flow->head == flow->tail) + return NULL; + + skb = flow->tail; + tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); + iph = cake_get_iphdr(skb, &_iph); + if (!tcph) + return NULL; + + cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); + + /* the 'triggering' packet need only have the ACK flag set. + * also check that SYN is not set, as there won't be any previous ACKs. + */ + if ((tcp_flag_word(tcph) & + (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) + return NULL; + + /* the 'triggering' ACK is at the tail of the queue, we have already + * returned if it is the only packet in the flow. loop through the rest + * of the queue looking for pure ACKs with the same 5-tuple as the + * triggering one. 
+ */ + for (skb_check = flow->head; + skb_check && skb_check != skb; + skb_prev = skb_check, skb_check = skb_check->next) { + iph_check = cake_get_iphdr(skb_check, &_iph_check); + tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, + sizeof(_tcph_check)); + + /* only TCP packets with matching 5-tuple are eligible, and only + * drop safe headers + */ + if (!tcph_check || iph->version != iph_check->version || + tcph_check->source != tcph->source || + tcph_check->dest != tcph->dest) + continue; + + if (iph_check->version == 4) { + if (iph_check->saddr != iph->saddr || + iph_check->daddr != iph->daddr) + continue; + + seglen = ntohs(iph_check->tot_len) - + (4 * iph_check->ihl); + } else if (iph_check->version == 6) { + ipv6h = (struct ipv6hdr *)iph; + ipv6h_check = (struct ipv6hdr *)iph_check; + + if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || + ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) + continue; + + seglen = ntohs(ipv6h_check->payload_len); + } else { + WARN_ON(1); /* shouldn't happen */ + continue; + } + + /* If the ECE/CWR flags changed from the previous eligible + * packet in the same flow, we should no longer be dropping that + * previous packet as this would lose information. + */ + if (elig_ack && (tcp_flag_word(tcph_check) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { + elig_ack = NULL; + elig_ack_prev = NULL; + num_found--; + } + + /* Check TCP options and flags, don't drop ACKs with segment + * data, and don't drop ACKs with a higher cumulative ACK + * counter than the triggering packet. Check ACK seqno here to + * avoid parsing SACK options of packets we are going to exclude + * anyway. + */ + if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || + (seglen - __tcp_hdrlen(tcph_check)) != 0 || + after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) + continue; + + /* Check SACK options. 
The triggering packet must SACK more data + * than the ACK under consideration, or SACK the same range but + * have a larger cumulative ACK counter. The latter is a + * pathological case, but is contained in the following check + * anyway, just to be safe. + */ + sack_comp = cake_tcph_sack_compare(tcph_check, tcph); + + if (sack_comp < 0 || + (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && + sack_comp == 0)) + continue; + + /* At this point we have found an eligible pure ACK to drop; if + * we are in aggressive mode, we are done. Otherwise, keep + * searching unless this is the second eligible ACK we + * found. + * + * Since we want to drop ACK closest to the head of the queue, + * save the first eligible ACK we find, even if we need to loop + * again. + */ + if (!elig_ack) { + elig_ack = skb_check; + elig_ack_prev = skb_prev; + elig_flags = (tcp_flag_word(tcph_check) + & (TCP_FLAG_ECE | TCP_FLAG_CWR)); + } + + if (num_found++ > 0) + goto found; + } + + /* We made it through the queue without finding two eligible ACKs . If + * we found a single eligible ACK we can drop it in aggressive mode if + * we can guarantee that this does not interfere with ECN flag + * information. We ensure this by dropping it only if the enqueued + * packet is consecutive with the eligible ACK, and their flags match. 
+ */ + if (elig_ack && aggressive && elig_ack->next == skb && + (elig_flags == (tcp_flag_word(tcph) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)))) + goto found; + + return NULL; + +found: + if (elig_ack_prev) + elig_ack_prev->next = elig_ack->next; + else + flow->head = elig_ack->next; + + elig_ack->next = NULL; + + return elig_ack; +} + static u64 cake_ewma(u64 avg, u64 sample, u32 shift) { avg -= avg >> shift; @@ -934,6 +1361,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct cake_sched_data *q = qdisc_priv(sch); int len = qdisc_pkt_len(skb); int uninitialized_var(ret); + struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; @@ -980,8 +1408,24 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, cobalt_set_enqueue_time(skb, now); flow_queue_add(flow, skb); - sch->q.qlen++; - q->buffer_used += skb->truesize; + if (q->ack_filter) + ack = cake_ack_filter(q, flow); + + if (ack) { + b->ack_drops++; + sch->qstats.drops++; + b->bytes += qdisc_pkt_len(ack); + len -= qdisc_pkt_len(ack); + q->buffer_used += skb->truesize - ack->truesize; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, ack, now, true); + + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + consume_skb(ack); + } else { + sch->q.qlen++; + q->buffer_used += skb->truesize; + } /* stats */ b->packets++; @@ -1530,6 +1974,9 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, q->rate_flags &= ~CAKE_FLAG_INGRESS; } + if (tb[TCA_CAKE_ACK_FILTER]) + q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); @@ -1661,6 +2108,9 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 
b60a60405fb95a688eb2ef4ef20f5fcaa7b64f68 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: netfilter: Add nf_ct_get_tuple_skb global lookup function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a global netfilter function to extract a conntrack tuple from an skb. The function uses a new function added to nf_ct_hook, which will try to get the tuple from skb->_nfct, and do a full lookup if that fails. This makes it possible to use the lookup function before the skb has passed through the conntrack init hooks (e.g., in an ingress qdisc). The tuple is copied to the caller to avoid issues with reference counting. The function returns false if conntrack is not loaded, allowing it to be used without incurring a module dependency on conntrack. This is used by the NAT mode in sch_cake. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/netfilter.h | 11 +++++++++++ net/netfilter/core.c | 15 +++++++++++++++ net/netfilter/nf_conntrack_core.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 23b48de8c2e2..07efffd0c759 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -414,8 +414,17 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); +struct nf_conntrack_tuple; +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +struct nf_conntrack_tuple; +static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + return false; +} #endif struct 
nf_conn; @@ -424,6 +433,8 @@ enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); + bool (*get_tuple_skb)(struct nf_conntrack_tuple *, + const struct sk_buff *); }; extern struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 168af54db975..dc240cb47ddf 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + struct nf_ct_hook *ct_hook; + bool ret = false; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ret = ct_hook->get_tuple_skb(dst_tuple, skb); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_ct_get_tuple_skb); + /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3465da2a98bd..85ab2fd6a665 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1683,6 +1683,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; } +static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + const struct nf_conntrack_tuple *src_tuple; + const struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple srctuple; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + NFPROTO_IPV4, dev_net(skb->dev), + &srctuple)) + return false; + + hash = nf_conntrack_find_get(dev_net(skb->dev), + &nf_ct_zone_dflt, 
+ &srctuple); + if (!hash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(hash); + src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + nf_ct_put(ct); + + return true; +} + /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), @@ -2204,6 +2239,7 @@ err_cachep: static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = destroy_conntrack, + .get_tuple_skb = nf_conntrack_get_tuple_skb, }; void nf_conntrack_init_end(void) -- cgit v1.2.3 From ea82511518f4f2e5fe83d2fe1884ef5fc6be6204 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add NAT awareness to packet classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CAKE is deployed on a gateway that also performs NAT (which is a common deployment mode), the host fairness mechanism cannot distinguish internal hosts from each other, and so fails to work correctly. To fix this, we add an optional NAT awareness mode, which will query the kernel conntrack mechanism to obtain the pre-NAT addresses for each packet and use that in the flow and host hashing. When the shaper is enabled and the host is already performing NAT, the cost of this lookup is negligible. However, in unlimited mode with no NAT being performed, there is a significant CPU cost at higher bandwidths. For this reason, the feature is turned off by default. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 930096d46c4f..633ca1578114 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -71,6 +71,10 @@ #include #include +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + #define CAKE_SET_WAYS (8) #define CAKE_MAX_TINS (8) #define CAKE_QUEUES (1024) @@ -516,6 +520,29 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } +static void cake_update_flowkeys(struct flow_keys *keys, + const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct nf_conntrack_tuple tuple = {}; + bool rev = !skb->_nfct; + + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + return; + + if (!nf_ct_get_tuple_skb(&tuple, skb)) + return; + + keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + + if (keys->ports.ports) { + keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; + keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + } +#endif +} + /* Cake has several subtle multiple bit settings. In these cases you * would be matching triple isolate mode as well. */ @@ -543,6 +570,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + if (flow_mode & CAKE_FLOW_NAT_FLAG) + cake_update_flowkeys(&keys, skb); + /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat * src and dst host addresses as independently selectable. 
@@ -1939,12 +1969,25 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + if (tb[TCA_CAKE_NAT]) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; + q->flow_mode |= CAKE_FLOW_NAT_FLAG * + !!nla_get_u32(tb[TCA_CAKE_NAT]); +#else + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], + "No conntrack support in kernel"); + return -EOPNOTSUPP; +#endif + } + if (tb[TCA_CAKE_BASE_RATE64]) q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); if (tb[TCA_CAKE_FLOW_MODE]) - q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & - CAKE_FLOW_MASK); + q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & + CAKE_FLOW_MASK)); if (tb[TCA_CAKE_RTT]) { q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); @@ -2111,6 +2154,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_NAT, + !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 83f8fd69af4f62136765b60fd0efa1c9167917c5 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add DiffServ handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for DiffServ-based priority queueing to CAKE. If the shaper is in use, each priority tier gets its own virtual clock, which limits that tier's rate to a fraction of the overall shaped rate, to discourage trying to game the priority mechanism. 
CAKE defaults to a simple, three-tier mode that interprets most code points as "best effort", but places CS1 traffic into a low-priority "bulk" tier which is assigned 1/16 of the total rate, and a few code points indicating latency-sensitive or control traffic (specifically TOS4, VA, EF, CS6, CS7) into a "latency sensitive" high-priority tier, which is assigned 1/4 rate. The other supported DiffServ modes are a 4-tier mode matching the 802.11e precedence rules, as well as two 8-tier modes, one of which implements strict precedence of the eight priority levels. This commit also adds an optional DiffServ 'wash' mode, which will zero out the DSCP fields of any packet passing through CAKE. While this can technically be done with other mechanisms in the kernel, having the feature available in CAKE significantly decreases configuration complexity; and the implementation cost is low on top of the other DiffServ-handling code. Filters and applications can set the skb->priority field to override the DSCP-based classification into tiers. If TC_H_MAJ(skb->priority) matches CAKE's qdisc handle, the minor number will be interpreted as a priority tier if it is less than or equal to the number of configured priority tiers. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 439 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 423 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 633ca1578114..43eeca81b247 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -296,6 +296,68 @@ static void cobalt_set_enqueue_time(struct sk_buff *skb, static u16 quantum_div[CAKE_QUEUES + 1] = {0}; +/* Diffserv lookup tables */ + +static const u8 precedence[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static const u8 diffserv8[] = { + 2, 5, 1, 2, 4, 2, 2, 2, + 0, 2, 1, 2, 1, 2, 1, 2, + 5, 2, 4, 2, 4, 2, 4, 2, + 3, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 2, 2, 6, 2, 6, 2, + 7, 2, 2, 2, 2, 2, 2, 2, + 7, 2, 2, 2, 2, 2, 2, 2, +}; + +static const u8 diffserv4[] = { + 0, 2, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 0, 0, 3, 0, 3, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 diffserv3[] = { + 0, 0, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 2, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 besteffort[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* tin priority order for stats dumping */ + +static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; +static const u8 bulk_order[] = {1, 0, 2, 3}; + #define REC_INV_SQRT_CACHE (16) static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; @@ -1351,20 +1413,91 @@ static unsigned int 
cake_drop(struct Qdisc *sch, struct sk_buff **to_free) return idx + (tin << 16); } -static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, +static void cake_wash_diffserv(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + break; + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + break; + default: + break; + } +} + +static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +{ + u8 dscp; + + switch (skb->protocol) { + case htons(ETH_P_IP): + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + if (wash && dscp) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_IPV6): + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + if (wash && dscp) + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_ARP): + return 0x38; /* CS7 - Net Control */ + + default: + /* If there is no Diffserv field, treat as best-effort */ + return 0; + } +} + +static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, + struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + u32 tin; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->tin_cnt) { + tin = TC_H_MIN(skb->priority) - 1; + + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { + /* extract the Diffserv Precedence field, if it exists */ + /* and clear DSCP bits if washing */ + tin = q->tin_index[cake_handle_diffserv(skb, + q->rate_flags & CAKE_FLAG_WASH)]; + if (unlikely(tin >= q->tin_cnt)) + tin = 0; + } else { + tin = 0; + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } + + return &q->tins[tin]; +} + +static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, struct sk_buff *skb, int flow_mode, int *qerr) { struct cake_sched_data *q = qdisc_priv(sch); struct tcf_proto 
*filter; struct tcf_result res; + u32 flow = 0; int result; filter = rcu_dereference_bh(q->filter_list); if (!filter) - return cake_hash(t, skb, flow_mode) + 1; + goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1378,9 +1511,11 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, } #endif if (TC_H_MIN(res.classid) <= CAKE_QUEUES) - return TC_H_MIN(res.classid); + flow = TC_H_MIN(res.classid); } - return 0; +hash: + *t = cake_select_tin(sch, skb); + return flow ?: cake_hash(*t, skb, flow_mode) + 1; } static void cake_reconfigure(struct Qdisc *sch); @@ -1395,13 +1530,10 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx, tin; - - tin = 0; - b = &q->tins[tin]; + u32 idx; /* choose flow to insert into */ - idx = cake_classify(sch, b, skb, q->flow_mode, &ret); + idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); @@ -1917,18 +2049,275 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->cparams.p_dec = 1 << 20; /* 1/4096 */ } -static void cake_reconfigure(struct Qdisc *sch) +static int cake_config_besteffort(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; - int c, ft = 0; + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; q->tin_cnt = 1; - cake_set_rate(b, q->rate_bps, psched_mtu(qdisc_dev(sch)), + + q->tin_index = besteffort; + q->tin_order = normal_order; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum_band = 65535; b->tin_quantum_prio = 65535; + return 0; +} + +static int cake_config_precedence(struct Qdisc *sch) +{ + /* convert high-level (user visible) parameters into internal format */ + struct cake_sched_data *q = 
qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + q->tin_index = precedence; + q->tin_order = normal_order; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +/* List of known Diffserv codepoints: + * + * Least Effort (CS1) + * Best Effort (CS0) + * Max Reliability & LLT "Lo" (TOS1) + * Max Throughput (TOS2) + * Min Delay (TOS4) + * LLT "La" (TOS5) + * Assured Forwarding 1 (AF1x) - x3 + * Assured Forwarding 2 (AF2x) - x3 + * Assured Forwarding 3 (AF3x) - x3 + * Assured Forwarding 4 (AF4x) - x3 + * Precedence Class 2 (CS2) + * Precedence Class 3 (CS3) + * Precedence Class 4 (CS4) + * Precedence Class 5 (CS5) + * Precedence Class 6 (CS6) + * Precedence Class 7 (CS7) + * Voice Admit (VA) + * Expedited Forwarding (EF) + + * Total 25 codepoints. + */ + +/* List of traffic classes in RFC 4594: + * (roughly descending order of contended priority) + * (roughly ascending order of uncontended throughput) + * + * Network Control (CS6,CS7) - routing traffic + * Telephony (EF,VA) - aka. VoIP streams + * Signalling (CS5) - VoIP setup + * Multimedia Conferencing (AF4x) - aka. video calls + * Realtime Interactive (CS4) - eg. games + * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch + * Broadcast Video (CS3) + * Low Latency Data (AF2x,TOS4) - eg. database + * Ops, Admin, Management (CS2,TOS1) - eg. ssh + * Standard Service (CS0 & unrecognised codepoints) + * High Throughput Data (AF1x,TOS2) - eg. web traffic + * Low Priority Data (CS1) - eg. BitTorrent + + * Total 12 traffic classes. 
+ */ + +static int cake_config_diffserv8(struct Qdisc *sch) +{ +/* Pruned list of traffic classes for typical applications: + * + * Network Control (CS6, CS7) + * Minimum Latency (EF, VA, CS5, CS4) + * Interactive Shell (CS2, TOS1) + * Low Latency Transactions (AF2x, TOS4) + * Video Streaming (AF4x, AF3x, CS3) + * Bog Standard (CS0 etc.) + * High Throughput (AF1x, TOS2) + * Background Traffic (CS1) + * + * Total 8 traffic classes. + */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + + /* codepoint to class mapping */ + q->tin_index = diffserv8; + q->tin_order = normal_order; + + /* class characteristics */ + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +static int cake_config_diffserv4(struct Qdisc *sch) +{ +/* Further pruned list of traffic classes for four-class system: + * + * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) + * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) + * Best Effort (CS0, AF1x, TOS2, and those not specified) + * Background Traffic (CS1) + * + * Total 4 traffic classes. 
+ */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 4; + + /* codepoint to class mapping */ + q->tin_index = diffserv4; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 1, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[3], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 2; + q->tins[3].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 1; + q->tins[3].tin_quantum_band = quantum >> 2; + + return 0; +} + +static int cake_config_diffserv3(struct Qdisc *sch) +{ +/* Simplified Diffserv structure with 3 tins. 
+ * Low Priority (CS1) + * Best Effort + * Latency Sensitive (TOS4, VA, EF, CS6, CS7) + */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 3; + + /* codepoint to class mapping */ + q->tin_index = diffserv3; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 2; + + return 0; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int c, ft; + + switch (q->tin_mode) { + case CAKE_DIFFSERV_BESTEFFORT: + ft = cake_config_besteffort(sch); + break; + + case CAKE_DIFFSERV_PRECEDENCE: + ft = cake_config_precedence(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV8: + ft = cake_config_diffserv8(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV4: + ft = cake_config_diffserv4(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV3: + default: + ft = cake_config_diffserv3(sch); + break; + } + for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { cake_clear_tin(sch, c); q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; @@ -1984,6 +2373,16 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CAKE_BASE_RATE64]) q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + if (tb[TCA_CAKE_DIFFSERV_MODE]) + q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + + if (tb[TCA_CAKE_WASH]) { + if (!!nla_get_u32(tb[TCA_CAKE_WASH])) + 
q->rate_flags |= CAKE_FLAG_WASH; + else + q->rate_flags &= ~CAKE_FLAG_WASH; + } + if (tb[TCA_CAKE_FLOW_MODE]) q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & @@ -2048,7 +2447,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; - q->tin_mode = CAKE_DIFFSERV_BESTEFFORT; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; q->rate_bps = 0; /* unlimited by default */ @@ -2158,6 +2557,13 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_WASH, + !!(q->rate_flags & CAKE_FLAG_WASH))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -2211,7 +2617,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) } while (0) for (i = 0; i < q->tin_cnt; i++) { - struct cake_tin_data *b = &q->tins[i]; + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; ts = nla_nest_start(d->skb, i + 1); if (!ts) @@ -2310,7 +2716,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, u32 idx = cl - 1; if (idx < CAKE_QUEUES * q->tin_cnt) { - const struct cake_tin_data *b = &q->tins[idx / CAKE_QUEUES]; + const struct cake_tin_data *b = \ + &q->tins[q->tin_order[idx / CAKE_QUEUES]]; const struct sk_buff *skb; flow = &b->flows[idx % CAKE_QUEUES]; @@ -2382,7 +2789,7 @@ static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->tin_cnt; i++) { - struct cake_tin_data *b = &q->tins[i]; + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; for (j = 0; j < CAKE_QUEUES; j++) { if (list_empty(&b->flows[j].flowchain) || -- cgit v1.2.3 From a729b7f0bd5bf4919306556aed614438f5174537 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add overhead 
compensation support to the rate shaper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds configurable overhead compensation support to the rate shaper. With this feature, userspace can configure the actual bottleneck link overhead and encapsulation mode used, which will be used by the shaper to calculate the precise duration of each packet on the wire. This feature is needed because CAKE is often deployed one or two hops upstream of the actual bottleneck (which can be, e.g., inside a DSL or cable modem). In this case, the link layer characteristics and overhead reported by the kernel do not match the actual bottleneck. Being able to set the actual values in use makes it possible to configure the shaper rate much closer to the actual bottleneck rate (our experience shows it is possible to get within 0.1% of the actual physical bottleneck rate), thus keeping latency low without sacrificing bandwidth. The overhead compensation has three tunables: A fixed per-packet overhead size (which, if set, will be accounted from the IP packet header), a minimum packet size (MPU) and a framing mode supporting either ATM or PTM framing. We include a set of common keywords in TC to help users configure the right parameters. If no overhead value is set, the value reported by the kernel is used. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 43eeca81b247..199670e1eb94 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -270,6 +270,7 @@ enum { struct cobalt_skb_cb { ktime_t enqueue_time; + u32 adjusted_len; }; static u64 us_to_ns(u64 us) @@ -1251,6 +1252,88 @@ static u64 cake_ewma(u64 avg, u64 sample, u32 shift) return avg; } +static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) +{ + if (q->rate_flags & CAKE_FLAG_OVERHEAD) + len -= off; + + if (q->max_netlen < len) + q->max_netlen = len; + if (q->min_netlen > len) + q->min_netlen = len; + + len += q->rate_overhead; + + if (len < q->rate_mpu) + len = q->rate_mpu; + + if (q->atm_mode == CAKE_ATM_ATM) { + len += 47; + len /= 48; + len *= 53; + } else if (q->atm_mode == CAKE_ATM_PTM) { + /* Add one byte per 64 bytes or part thereof. + * This is conservative and easier to calculate than the + * precise value. 
+ */ + len += (len + 63) / 64; + } + + if (q->max_adjlen < len) + q->max_adjlen = len; + if (q->min_adjlen > len) + q->min_adjlen = len; + + return len; +} + +static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int hdr_len, last_len = 0; + u32 off = skb_network_offset(skb); + u32 len = qdisc_pkt_len(skb); + u16 segs = 1; + + q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); + + if (!shinfo->gso_size) + return cake_calc_overhead(q, len, off); + + /* borrowed from qdisc_pkt_len_init() */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | + SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } + + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) + segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); + else + segs = shinfo->gso_segs; + + len = shinfo->gso_size + hdr_len; + last_len = skb->len - shinfo->gso_size * (segs - 1); + + return (cake_calc_overhead(q, len, off) * (segs - 1) + + cake_calc_overhead(q, last_len, off)); +} + static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) { struct cake_heap_entry ii = q->overflow_heap[i]; @@ -1328,7 +1411,7 @@ static int cake_advance_shaper(struct cake_sched_data *q, struct sk_buff *skb, ktime_t now, bool drop) { - u32 len = qdisc_pkt_len(skb); + u32 len = get_cobalt_cb(skb)->adjusted_len; /* charge packet bandwidth to this tin * and to the global shaper. 
@@ -1568,6 +1651,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->max_skblen = len; cobalt_set_enqueue_time(skb, now); + get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); if (q->ack_filter) @@ -2388,6 +2472,31 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); + if (tb[TCA_CAKE_ATM]) + q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + + if (tb[TCA_CAKE_OVERHEAD]) { + q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); + q->rate_flags |= CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_RAW]) { + q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_MPU]) + q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + if (tb[TCA_CAKE_RTT]) { q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); @@ -2564,6 +2673,19 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + goto nla_put_failure; + + if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 0c850344d3882886f842bf0b50a9ff23001adb7e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Conditionally split GSO segments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At lower bandwidths, the transmission time of a single GSO segment can add an unacceptable amount of latency due to HOL blocking. 
Furthermore, with a software shaper, any tuning mechanism employed by the kernel to control the maximum size of GSO segments is thrown off by the artificial limit on bandwidth. For this reason, we split GSO segments into their individual packets iff the shaper is active and configured to a bandwidth <= 1 Gbps. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 99 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 199670e1eb94..30695691e9ff 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -80,6 +80,7 @@ #define CAKE_QUEUES (1024) #define CAKE_FLOW_MASK 63 #define CAKE_FLOW_NAT_FLAG 64 +#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */ /* struct cobalt_params - contains codel and blue parameters * @interval: codel initial drop rate @@ -1650,36 +1651,73 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - cobalt_set_enqueue_time(skb, now); - get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); - flow_queue_add(flow, skb); - - if (q->ack_filter) - ack = cake_ack_filter(q, flow); + if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + unsigned int slen = 0; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + cobalt_set_enqueue_time(segs, now); + get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, + segs); + flow_queue_add(flow, segs); + + sch->q.qlen++; + slen += segs->len; + q->buffer_used += segs->truesize; + b->packets++; + segs = nskb; + } - if (ack) { - b->ack_drops++; - sch->qstats.drops++; - b->bytes += 
qdisc_pkt_len(ack); - len -= qdisc_pkt_len(ack); - q->buffer_used += skb->truesize - ack->truesize; - if (q->rate_flags & CAKE_FLAG_INGRESS) - cake_advance_shaper(q, b, ack, now, true); + /* stats */ + b->bytes += slen; + b->backlogs[idx] += slen; + b->tin_backlog += slen; + sch->qstats.backlog += slen; + q->avg_window_bytes += slen; - qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); - consume_skb(ack); + qdisc_tree_reduce_backlog(sch, 1, len); + consume_skb(skb); } else { - sch->q.qlen++; - q->buffer_used += skb->truesize; - } + /* not splitting */ + cobalt_set_enqueue_time(skb, now); + get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); + flow_queue_add(flow, skb); + + if (q->ack_filter) + ack = cake_ack_filter(q, flow); + + if (ack) { + b->ack_drops++; + sch->qstats.drops++; + b->bytes += qdisc_pkt_len(ack); + len -= qdisc_pkt_len(ack); + q->buffer_used += skb->truesize - ack->truesize; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, ack, now, true); + + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + consume_skb(ack); + } else { + sch->q.qlen++; + q->buffer_used += skb->truesize; + } - /* stats */ - b->packets++; - b->bytes += len; - b->backlogs[idx] += len; - b->tin_backlog += len; - sch->qstats.backlog += len; - q->avg_window_bytes += len; + /* stats */ + b->packets++; + b->bytes += len; + b->backlogs[idx] += len; + b->tin_backlog += len; + sch->qstats.backlog += len; + q->avg_window_bytes += len; + } if (q->overflow_timeout) cake_heapify_up(q, b->overflow_idx[idx]); @@ -2531,6 +2569,11 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + else + q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2686,6 +2729,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff 
*skb) if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, + !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 386c5680e2e80b012de557cf8326962070e0897b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:19:13 +0200 Subject: xfrm: use time64_t for in-kernel timestamps The lifetime management uses '__u64' timestamps on the user space interface, but 'unsigned long' for reading the current time in the kernel with get_seconds(). While this is probably safe beyond y2038, it will still overflow in 2106, and the get_seconds() call is deprecated because of that. This changes the xfrm time handling to use time64_t consistently, along with reading the time using the safer ktime_get_real_seconds(). It still suffers from problems that can happen from a concurrent settimeofday() call or (to a lesser degree) a leap second update, but since the time stamps are part of the user API, there is nothing we can do to prevent that. 
Signed-off-by: Arnd Bergmann Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 24 ++++++++++++------------ net/xfrm/xfrm_state.c | 10 +++++----- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ef75891450e7..5d2f734f4309 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -189,8 +189,8 @@ static inline unsigned long make_jiffies(long secs) static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = from_timer(xp, t, timer); - unsigned long now = get_seconds(); - long next = LONG_MAX; + time64_t now = ktime_get_real_seconds(); + time64_t next = TIME64_MAX; int warn = 0; int dir; @@ -202,7 +202,7 @@ static void xfrm_policy_timer(struct timer_list *t) dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { - long tmo = xp->lft.hard_add_expires_seconds + + time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; @@ -210,7 +210,7 @@ static void xfrm_policy_timer(struct timer_list *t) next = tmo; } if (xp->lft.hard_use_expires_seconds) { - long tmo = xp->lft.hard_use_expires_seconds + + time64_t tmo = xp->lft.hard_use_expires_seconds + (xp->curlft.use_time ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; @@ -218,7 +218,7 @@ static void xfrm_policy_timer(struct timer_list *t) next = tmo; } if (xp->lft.soft_add_expires_seconds) { - long tmo = xp->lft.soft_add_expires_seconds + + time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; @@ -228,7 +228,7 @@ static void xfrm_policy_timer(struct timer_list *t) next = tmo; } if (xp->lft.soft_use_expires_seconds) { - long tmo = xp->lft.soft_use_expires_seconds + + time64_t tmo = xp->lft.soft_use_expires_seconds + (xp->curlft.use_time ? 
: xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; @@ -240,7 +240,7 @@ static void xfrm_policy_timer(struct timer_list *t) if (warn) km_policy_expired(xp, dir, 0, 0); - if (next != LONG_MAX && + if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); @@ -791,7 +791,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); - policy->curlft.add_time = get_seconds(); + policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); @@ -1282,7 +1282,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { - pol->curlft.add_time = get_seconds(); + pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } @@ -2132,7 +2132,7 @@ no_transform: } for (i = 0; i < num_pols; i++) - pols[i]->curlft.use_time = get_seconds(); + pols[i]->curlft.use_time = ktime_get_real_seconds(); if (num_xfrms < 0) { /* Prohibit the flow */ @@ -2352,7 +2352,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = get_seconds(); + pol->curlft.use_time = ktime_get_real_seconds(); pols[0] = pol; npols++; @@ -2366,7 +2366,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } - pols[1]->curlft.use_time = get_seconds(); + pols[1]->curlft.use_time = ktime_get_real_seconds(); npols++; } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c9ffcdfa89f6..27c84e63c7ff 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c 
@@ -475,8 +475,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) { struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer); struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer); - unsigned long now = get_seconds(); - long next = LONG_MAX; + time64_t now = ktime_get_real_seconds(); + time64_t next = TIME64_MAX; int warn = 0; int err = 0; @@ -537,7 +537,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (warn) km_state_expired(x, 0, 0); resched: - if (next != LONG_MAX) { + if (next != TIME64_MAX) { tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL); } @@ -577,7 +577,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME, HRTIMER_MODE_ABS); timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); - x->curlft.add_time = get_seconds(); + x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; x->lft.hard_byte_limit = XFRM_INF; @@ -1588,7 +1588,7 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { if (!x->curlft.use_time) - x->curlft.use_time = get_seconds(); + x->curlft.use_time = ktime_get_real_seconds(); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { -- cgit v1.2.3 From 03dc7a35fcc83a199121a5156c4a7a976b836682 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:19:14 +0200 Subject: ipv6: xfrm: use 64-bit timestamps get_seconds() is deprecated because it can overflow on 32-bit architectures. For the xfrm_state->lastused member, we treat the data as a 64-bit number already, so we just need to use the right accessor that works on both 32-bit and 64-bit machines. 
Signed-off-by: Arnd Bergmann Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/ipv6/xfrm6_mode_ro.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a5378613a49c..1350e2cf0749 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -227,7 +227,7 @@ struct xfrm_state { long saved_tmo; /* Last used time */ - unsigned long lastused; + time64_t lastused; struct page_frag xfrag; diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 07d36573f50b..da28e4407b8f 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -55,7 +55,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) __skb_pull(skb, hdr_len); memmove(ipv6_hdr(skb), iph, hdr_len); - x->lastused = get_seconds(); + x->lastused = ktime_get_real_seconds(); return 0; } -- cgit v1.2.3 From 5e9a0fe492f89ff1c7583ee6ea89dc37b8c2e5c2 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 9 Jul 2018 02:26:20 +0000 Subject: net/sched: flower: Fix null pointer dereference when run tc vlan command Zahari issued a tc vlan command without setting vlan_ethtype, which will crash the kernel. To avoid this, we must check that tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE] is not null before using it. Also we don't need to dump vlan_ethtype or cvlan_ethtype in this case. Fixes: d64efd0926ba ('net/sched: flower: Add supprt for matching on QinQ vlan headers') Signed-off-by: Jianbo Liu Reported-by: Zahari Doychev Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 487a152a852c..8b2474293db1 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -605,20 +605,22 @@ static int fl_set_key(struct net *net, struct nlattr **tb, TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan); - ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); - if (eth_type_vlan(ethertype)) { - fl_set_key_vlan(tb, ethertype, - TCA_FLOWER_KEY_CVLAN_ID, - TCA_FLOWER_KEY_CVLAN_PRIO, - &key->cvlan, &mask->cvlan); - fl_set_key_val(tb, &key->basic.n_proto, - TCA_FLOWER_KEY_CVLAN_ETH_TYPE, - &mask->basic.n_proto, - TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); - } else { - key->basic.n_proto = ethertype; - mask->basic.n_proto = cpu_to_be16(~0); + if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) { + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, + TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + &mask->basic.n_proto, + TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } } } else { key->basic.n_proto = ethertype; @@ -1344,14 +1346,16 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, key->cvlan.vlan_tpid))) goto nla_put_failure; - if (mask->cvlan.vlan_tpid) { - if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, - key->basic.n_proto)) - goto nla_put_failure; - } else if (mask->vlan.vlan_tpid) { - if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, - key->basic.n_proto)) - goto nla_put_failure; + if (mask->basic.n_proto) { + if (mask->cvlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + key->basic.n_proto)) + goto 
nla_put_failure; + } else if (mask->vlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } } if ((key->basic.n_proto == htons(ETH_P_IP) || -- cgit v1.2.3 From c6dbf7aaa48289d2eeacbef06785c069869ed0c0 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Jul 2018 12:25:14 +0200 Subject: net/ipv6: fix addrconf_sysctl_addr_gen_mode addrconf_sysctl_addr_gen_mode() has multiple problems. First, it ignores the errors returned by proc_dointvec(). addrconf_sysctl_addr_gen_mode() calls proc_dointvec() directly, which writes the value to memory, and then checks if it's valid and may return EINVAL. If a bad value is given, the value displayed when reading net.ipv6.conf.foo.addr_gen_mode next time will be invalid. In case the value provided by the user was valid, addrconf_dev_config() won't be called since idev->cnf.addr_gen_mode has already been updated. Fix this in the usual way we deal with values that need to be checked after the proc_do*() helper has returned: define a local ctl_table and storage, call proc_dointvec() on that temporary area, then check and store. addrconf_sysctl_addr_gen_mode() also writes the new value to the global ipv6_devconf_dflt, when we're writing to some netns's default, so that new netns will inherit the value that was set by the change occurring in any netns. That doesn't make any sense, so let's drop this assignment. Finally, since addr_gen_mode is a __u32, switch to proc_douintvec(). Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address generation mode") Signed-off-by: Sabrina Dubroca Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 91580c62bb86..e9ba53d2a147 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5892,32 +5892,31 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, loff_t *ppos) { int ret = 0; - int new_val; + u32 new_val; struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; struct net *net = (struct net *)ctl->extra2; + struct ctl_table tmp = { + .data = &new_val, + .maxlen = sizeof(new_val), + .mode = ctl->mode, + }; if (!rtnl_trylock()) return restart_syscall(); - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + new_val = *((u32 *)ctl->data); - if (write) { - new_val = *((int *)ctl->data); + ret = proc_douintvec(&tmp, write, buffer, lenp, ppos); + if (ret != 0) + goto out; + if (write) { if (check_addr_gen_mode(new_val) < 0) { ret = -EINVAL; goto out; } - /* request for default */ - if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { - ipv6_devconf_dflt.addr_gen_mode = new_val; - - /* request for individual net device */ - } else { - if (!idev) - goto out; - + if (idev) { if (check_stable_privacy(idev, net, new_val) < 0) { ret = -EINVAL; goto out; @@ -5928,6 +5927,8 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, addrconf_dev_config(idev->dev); } } + + *((u32 *)ctl->data) = new_val; } out: -- cgit v1.2.3 From 70c30d76e580fe4aefe6facdf0f1edb1aa9a0e7a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Jul 2018 12:25:15 +0200 Subject: net/ipv6: don't reinitialize ndev->cnf.addr_gen_mode on new inet6_dev The value has already been copied from this netns's devconf_dflt, it shouldn't be reset to the global kernel default. Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address generation mode") Signed-off-by: Sabrina Dubroca Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e9ba53d2a147..e20f8a1d8cdb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -385,8 +385,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (ndev->cnf.stable_secret.initialized) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; - else - ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode; ndev->cnf.mtu6 = dev->mtu; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); -- cgit v1.2.3 From bdd72f41333d9f61a22e4c4494e95782e9731fdb Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Jul 2018 12:25:16 +0200 Subject: net/ipv6: reserve room for IFLA_INET6_ADDR_GEN_MODE inet6_ifla6_size() is called to check how much space is needed by inet6_fill_link_af() and inet6_fill_ifinfo(), both of which include the IFLA_INET6_ADDR_GEN_MODE attribute. Reserve some room for it. Fixes: bc91b0f07ada ("ipv6: addrconf: implement address generation modes") Signed-off-by: Sabrina Dubroca Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e20f8a1d8cdb..e89bca83e0e4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5208,7 +5208,9 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ - + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ + + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + 0; } static inline size_t inet6_if_nlmsg_size(void) -- cgit v1.2.3 From f24c5987dddd28b23443e7b21b55d47549207755 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Jul 2018 12:25:17 +0200 Subject: net/ipv6: propagate net.ipv6.conf.all.addr_gen_mode to devices This aligns the addr_gen_mode sysctl with the expected behavior of the "all" variant. Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address generation mode") Suggested-by: David Ahern Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e89bca83e0e4..1659a6b3cf42 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5926,6 +5926,18 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, idev->cnf.addr_gen_mode = new_val; addrconf_dev_config(idev->dev); } + } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { + struct net_device *dev; + + net->ipv6.devconf_dflt->addr_gen_mode = new_val; + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev && + idev->cnf.addr_gen_mode != new_val) { + idev->cnf.addr_gen_mode = new_val; + addrconf_dev_config(idev->dev); + } + } } *((u32 *)ctl->data) = new_val; -- cgit v1.2.3 From 01e866bf07fbb10e96bff46ea1e5e0410d6e40b9 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 14:33:26 +0300 Subject: net: sched: act_ife: fix memory leak in ife init Free params if tcf_idr_check_alloc() returned error. Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Reported-by: Dan Carpenter Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_ife.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a3eef00cd711..3d6e265758c0 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -485,8 +485,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return -ENOMEM; err = tcf_idr_check_alloc(tn, &parm->index, a, bind); - if (err < 0) + if (err < 0) { + kfree(p); return err; + } exists = err; if (exists && bind) { kfree(p); -- cgit v1.2.3 From e0479b670d394d478907bd4fc22daab6516953c7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 20:26:47 +0300 Subject: net: sched: fix unprotected access to rcu cookie pointer Fix action attribute size calculation function to take rcu read lock and access act_cookie pointer with rcu dereference. Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update") Reported-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 66dc19746c63..148a89ab789b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -149,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { + struct tc_cookie *act_cookie; u32 cookie_len = 0; - if (act->act_cookie) - cookie_len = nla_total_size(act->act_cookie->len); + rcu_read_lock(); + act_cookie = rcu_dereference(act->act_cookie); + + if (act_cookie) + cookie_len = nla_total_size(act_cookie->len); + rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ -- cgit v1.2.3 From 4929c9428a171145f82f81aae0c3c25ef7d82837 Mon Sep 17 00:00:00 2001 From: Deepti Raghavan Date: Mon, 9 Jul 2018 17:53:39 +0000 Subject: tcp: expose both send and receive intervals for rate sample Congestion control algorithms, which access the rate sample through the tcp_cong_control function, only have access to the maximum of the send and receive interval, for cases where the acknowledgment rate may be inaccurate due to ACK compression or decimation. Algorithms may want to use send rates and receive rates as separate signals. Signed-off-by: Deepti Raghavan Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_rate.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index cce37694776e..f6cb20e6e524 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -954,6 +954,8 @@ struct rate_sample { u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c61240e43923..4dff40dad4dc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + /* Normally we expect interval_us >= min-rtt. * Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" -- cgit v1.2.3 From 9012de5089560136b849b920ad038b96160ed8f6 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 10 Jul 2018 01:07:35 +0200 Subject: tipc: add sequence number check for link STATE messages Some switch infrastructures produce huge amounts of packet duplicates. This becomes a problem if those messages are STATE/NACK protocol messages, causing unnecessary retransmissions of already accepted packets. We now introduce a unique sequence number per STATE protocol message so that duplicates can be identified and ignored. 
This will also be useful when tracing such cases, and to avert replay attacks when TIPC is encrypted. For compatibility reasons we have to introduce a new capability flag TIPC_LINK_PROTO_SEQNO to handle this new feature. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 16 ++++++++++++++++ net/tipc/link.h | 1 + net/tipc/node.c | 7 +++++++ net/tipc/node.h | 14 ++++++++------ 4 files changed, 32 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index ec4d28328652..065e9e67da5d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -130,6 +130,8 @@ struct tipc_link { /* Management and link supervision data */ u32 peer_session; u32 session; + u16 snd_nxt_state; + u16 rcv_nxt_state; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; @@ -339,6 +341,11 @@ char tipc_link_plane(struct tipc_link *l) return l->net_plane; } +void tipc_link_update_caps(struct tipc_link *l, u16 capabilities) +{ + l->peer_caps = capabilities; +} + void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq) @@ -859,6 +866,8 @@ void tipc_link_reset(struct tipc_link *l) l->rcv_unacked = 0; l->snd_nxt = 1; l->rcv_nxt = 1; + l->snd_nxt_state = 1; + l->rcv_nxt_state = 1; l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; @@ -1353,6 +1362,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { + if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) + msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); @@ -1522,6 +1533,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, case STATE_MSG: + if (l->peer_caps & TIPC_LINK_PROTO_SEQNO && + less(msg_seqno(hdr), l->rcv_nxt_state)) + break; + l->rcv_nxt_state = msg_seqno(hdr) + 1; + /* Update own tolerance if peer indicates a non-zero value */ if 
(in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) l->tolerance = peers_tol; diff --git a/net/tipc/link.h b/net/tipc/link.h index ec59348a81e8..d56f9c9e5000 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -110,6 +110,7 @@ char *tipc_link_name(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); +void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); unsigned long tipc_link_tolerance(struct tipc_link *l); void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq); diff --git a/net/tipc/node.c b/net/tipc/node.c index cfdbaf479fd1..1cdb176798f7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -363,6 +363,8 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; + struct tipc_link *l; + int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); @@ -370,6 +372,11 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, if (n) { /* Same node may come back with new capabilities */ n->capabilities = capabilities; + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + l = n->links[bearer_id].link; + if (l) + tipc_link_update_caps(l, capabilities); + } goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); diff --git a/net/tipc/node.h b/net/tipc/node.h index 846c8f240872..48b3298a248d 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,14 +49,16 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_NODE_ID128 = (1 << 5) + TIPC_NODE_ID128 = (1 << 5), + TIPC_LINK_PROTO_SEQNO = (1 << 6) }; -#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ - TIPC_BCAST_STATE_NACK | \ - TIPC_BCAST_RCAST | \ - TIPC_BLOCK_FLOWCTL | \ - TIPC_NODE_ID128) +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ + TIPC_BCAST_STATE_NACK | \ + TIPC_BCAST_RCAST | \ + 
TIPC_BLOCK_FLOWCTL | \ + TIPC_NODE_ID128 | \ + TIPC_LINK_PROTO_SEQNO) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -- cgit v1.2.3 From 7ea817f4e8322fa27fb860d15025bf72f68b179f Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 10 Jul 2018 01:07:36 +0200 Subject: tipc: check session number before accepting link protocol messages In some virtual environments we observe a significantly higher number of packet reorderings and delays than we have been used to traditionally. This makes stricter checks on incoming link protocol messages' session number necessary, which until now has only been validated for RESET messages. Since the other two message types, ACTIVATE and STATE messages also carry this number, it is easy to extend the validation check to those messages. We also introduce a flag indicating if a link has a valid peer session number or not. This eliminates the mixing of 32- and 16-bit arithmetic we are currently using to achieve this. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 68 +++++++++++++++++++++++++++++++++++++++------------------ net/tipc/link.h | 1 + net/tipc/node.c | 5 ++++- 3 files changed, 52 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 065e9e67da5d..df763be38541 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -128,8 +128,8 @@ struct tipc_link { struct net *net; /* Management and link supervision data */ - u32 peer_session; - u32 session; + u16 peer_session; + u16 session; u16 snd_nxt_state; u16 rcv_nxt_state; u32 peer_bearer_id; @@ -138,6 +138,7 @@ struct tipc_link { u32 abort_limit; u32 state; u16 peer_caps; + bool in_session; bool active; u32 silent_intv_cnt; char if_name[TIPC_MAX_IF_NAME]; @@ -216,11 +217,6 @@ enum { */ #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) -/* Wildcard value for link session numbers. When it is known that - * peer endpoint is down, any session number must be accepted. 
- */ -#define ANY_SESSION 0x10000 - /* Link FSM states: */ enum { @@ -478,7 +474,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->addr = peer; l->peer_caps = peer_caps; l->net = net; - l->peer_session = ANY_SESSION; + l->in_session = false; l->bearer_id = bearer_id; l->tolerance = tolerance; l->net_plane = net_plane; @@ -847,7 +843,7 @@ void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { - l->peer_session = ANY_SESSION; + l->in_session = false; l->session++; l->mtu = l->advertised_mtu; __skb_queue_purge(&l->transmq); @@ -1455,6 +1451,44 @@ tnl: } } +/* tipc_link_validate_msg(): validate message against current link state + * Returns true if message should be accepted, otherwise false + */ +bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) +{ + u16 curr_session = l->peer_session; + u16 session = msg_session(hdr); + int mtyp = msg_type(hdr); + + if (msg_user(hdr) != LINK_PROTOCOL) + return true; + + switch (mtyp) { + case RESET_MSG: + if (!l->in_session) + return true; + /* Accept only RESET with new session number */ + return more(session, curr_session); + case ACTIVATE_MSG: + if (!l->in_session) + return true; + /* Accept only ACTIVATE with new or current session number */ + return !less(session, curr_session); + case STATE_MSG: + /* Accept only STATE with current session number */ + if (!l->in_session) + return false; + if (session != curr_session) + return false; + if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) + return true; + /* Accept only STATE with new sequence number */ + return !less(msg_seqno(hdr), l->rcv_nxt_state); + default: + return false; + } +} + /* tipc_link_proto_rcv(): receive link level protocol message : * Note that network plane id propagates through the network, and may * change at any time. 
The node with lowest numerical id determines @@ -1488,17 +1522,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, hdr = buf_msg(skb); data = msg_data(hdr); + if (!tipc_link_validate_msg(l, hdr)) + goto exit; + switch (mtyp) { case RESET_MSG: - - /* Ignore duplicate RESET with old session number */ - if ((less_eq(msg_session(hdr), l->peer_session)) && - (l->peer_session != ANY_SESSION)) - break; - /* fall thru' */ - case ACTIVATE_MSG: - /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) @@ -1526,16 +1555,13 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); + l->in_session = true; l->peer_bearer_id = msg_bearer_id(hdr); if (l->mtu > msg_max_pkt(hdr)) l->mtu = msg_max_pkt(hdr); break; case STATE_MSG: - - if (l->peer_caps & TIPC_LINK_PROTO_SEQNO && - less(msg_seqno(hdr), l->rcv_nxt_state)) - break; l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ diff --git a/net/tipc/link.h b/net/tipc/link.h index d56f9c9e5000..7bc494a33fdf 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -111,6 +111,7 @@ char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); +bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq); diff --git a/net/tipc/node.c b/net/tipc/node.c index 1cdb176798f7..52fd80b0e728 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1540,7 +1540,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id * tipc_node_check_state - check and if necessary update node state * @skb: 
TIPC packet * @bearer_id: identity of bearer delivering the packet - * Returns true if state is ok, otherwise consumes buffer and returns false + * Returns true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) @@ -1574,6 +1574,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, } } + if (!tipc_link_validate_msg(l, hdr)) + return false; + /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) -- cgit v1.2.3 From d2bdd2681278d66fd34cd8e0cf724de918f429b2 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 11 Jul 2018 14:32:20 +0530 Subject: net/tls: Use aead_request_alloc/free for request alloc/free Instead of kzalloc/free for aead_request allocation and free, use functions aead_request_alloc(), aead_request_free(). It ensures that any sensitive crypto material held in crypto transforms is securely erased from memory. Signed-off-by: Vakul Garg Acked-by: Dave Watson Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0d670c8adf18..7453f5ae0819 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -57,14 +57,11 @@ static int tls_do_decryption(struct sock *sk, struct aead_request *aead_req; int ret; - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_recv); - aead_req = kzalloc(req_size, flags); + aead_req = aead_request_alloc(ctx->aead_recv, flags); if (!aead_req) return -ENOMEM; - aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -86,7 +83,7 @@ static int tls_do_decryption(struct sock *sk, ctx->saved_data_ready(sk); out: - kfree(aead_req); + aead_request_free(aead_req); return ret; } @@ -224,8 +221,7 @@ static int tls_push_record(struct sock *sk, int flags, struct aead_request *req; int rc; - req = kzalloc(sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); if (!req) return -ENOMEM; @@ -267,7 +263,7 @@ static int tls_push_record(struct sock *sk, int flags, tls_advance_record_sn(sk, &tls_ctx->tx); out_req: - kfree(req); + aead_request_free(req); return rc; } -- cgit v1.2.3 From cca9bab1b72cd2296097c75f59ef11ef80461279 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:16:12 +0200 Subject: tcp: use monotonic timestamps for PAWS Using get_seconds() for timestamps is deprecated since it can lead to overflows on 32-bit systems. While the interface generally doesn't overflow until year 2106, the specific implementation of the TCP PAWS algorithm breaks in 2038 when the intermediate signed 32-bit timestamps overflow. 
A related problem is that the local timestamps in CLOCK_REALTIME form lead to unexpected behavior when settimeofday is called to set the system clock backwards or forwards by more than 24 days. While the first problem could be solved by using an overflow-safe method of comparing the timestamps, a nicer solution is to use a monotonic clocksource with ktime_get_seconds() that simply doesn't overflow (at least not until 136 years after boot) and that doesn't change during settimeofday(). To make 32-bit and 64-bit architectures behave the same way here, and also save a few bytes in the tcp_options_received structure, I'm changing the type to a 32-bit integer, which is now safe on all architectures. Finally, the ts_recent_stamp field also (confusingly) gets used to store a jiffies value in tcp_synq_overflow()/tcp_synq_no_recent_overflow(). This is currently safe, but changing the type to 32-bit requires some small changes there to keep it working. Signed-off-by: Arnd Bergmann Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 2 +- include/linux/tcp.h | 4 ++-- include/net/tcp.h | 17 ++++++++++------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 8 ++++---- 6 files changed, 20 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index 2bb6f0380758..0997e166ea57 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -1673,7 +1673,7 @@ static void chtls_timewait(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->rcv_nxt++; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); tp->srtt_us = 0; tcp_time_wait(sk, TCP_TIME_WAIT, 0); } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dbea6610304..58a8d7d71354 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -89,7 +89,7 @@ struct tcp_sack_block { struct tcp_options_received { /* PAWS/RTTM data */ - long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ + int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ @@ -426,7 +426,7 @@ struct tcp_timewait_sock { /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; - long tw_ts_recent_stamp; + int tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index f6cb20e6e524..582304955087 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -472,19 +472,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - unsigned long now = jiffies; + unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned 
int now = jiffies; - if (time_after(now, last_overflow + HZ)) + if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int now = jiffies; - return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID); + return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) @@ -1375,7 +1376,8 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; - if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)) + if (unlikely(!time_before32(ktime_get_seconds(), + rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, @@ -1405,7 +1407,8 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, However, we can relax time bounds for RST segments to MSL. 
*/ - if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL) + if (rst && !time_before32(ktime_get_seconds(), + rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 814ea43dd12f..d3b6390ecf23 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3462,7 +3462,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..dc415c66a33a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dac5893a52b4..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } 
inet_twsk_put(tw); @@ -537,7 +537,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; @@ -603,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } -- cgit v1.2.3 From 6f3dfb0dc831953187fea8e3b798768611441321 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 11 Jul 2018 16:04:49 +0200 Subject: net/sched: skbedit: use per-cpu counters use per-CPU counters, instead of sharing a single set of stats with all cores: this removes the need of spinlocks when stats are read/updated. Signed-off-by: Davide Caratti Signed-off-by: David S.
Miller --- net/sched/act_skbedit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 86521a74ecdd..8651b5bd6b59 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -38,10 +38,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, { struct tcf_skbedit *d = to_skbedit(a); - spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); - bstats_update(&d->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + spin_lock(&d->tcf_lock); if (d->flags & SKBEDIT_F_PRIORITY) skb->priority = d->priority; if (d->flags & SKBEDIT_F_INHERITDSFIELD) { @@ -77,8 +77,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, return d->tcf_action; err: - d->tcf_qstats.drops++; spin_unlock(&d->tcf_lock); + qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -169,7 +169,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_skbedit_ops, bind, false); + &act_skbedit_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, parm->index); return ret; -- cgit v1.2.3 From c749cdda9089eb1fdb6a9ab98f945124d12f2595 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 11 Jul 2018 16:04:50 +0200 Subject: net/sched: act_skbedit: don't use spinlock in the data path use RCU instead of spin_{,un}lock_bh, to protect concurrent read/write on act_skbedit configuration. This reduces the effects of contention in the data path, in case multiple readers are present. Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- include/net/tc_act/tc_skbedit.h | 37 ++++++++++---- net/sched/act_skbedit.c | 107 +++++++++++++++++++++++++--------------- 2 files changed, 95 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 19cd3d345804..911bbac838a2 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -22,14 +22,19 @@ #include #include +struct tcf_skbedit_params { + u32 flags; + u32 priority; + u32 mark; + u32 mask; + u16 queue_mapping; + u16 ptype; + struct rcu_head rcu; +}; + struct tcf_skbedit { - struct tc_action common; - u32 flags; - u32 priority; - u32 mark; - u32 mask; - u16 queue_mapping; - u16 ptype; + struct tc_action common; + struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) @@ -37,15 +42,27 @@ struct tcf_skbedit { static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) - return to_skbedit(a)->flags == SKBEDIT_F_MARK; + u32 flags; + + if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) { + rcu_read_lock(); + flags = rcu_dereference(to_skbedit(a)->params)->flags; + rcu_read_unlock(); + return flags == SKBEDIT_F_MARK; + } #endif return false; } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { - return to_skbedit(a)->mark; + u32 mark; + + rcu_read_lock(); + mark = rcu_dereference(to_skbedit(a)->params)->mark; + rcu_read_unlock(); + + return mark; } #endif /* __NET_TC_SKBEDIT_H */ diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8651b5bd6b59..da56e6938c9e 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -37,14 +37,19 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + int action; tcf_lastuse_update(&d->tcf_tm); 
bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - spin_lock(&d->tcf_lock); - if (d->flags & SKBEDIT_F_PRIORITY) - skb->priority = d->priority; - if (d->flags & SKBEDIT_F_INHERITDSFIELD) { + rcu_read_lock(); + params = rcu_dereference(d->params); + action = READ_ONCE(d->tcf_action); + + if (params->flags & SKBEDIT_F_PRIORITY) + skb->priority = params->priority; + if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); switch (tc_skb_protocol(skb)) { @@ -63,23 +68,23 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, break; } } - if (d->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > d->queue_mapping) - skb_set_queue_mapping(skb, d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) { - skb->mark &= ~d->mask; - skb->mark |= d->mark & d->mask; + if (params->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > params->queue_mapping) + skb_set_queue_mapping(skb, params->queue_mapping); + if (params->flags & SKBEDIT_F_MARK) { + skb->mark &= ~params->mask; + skb->mark |= params->mark & params->mask; } - if (d->flags & SKBEDIT_F_PTYPE) - skb->pkt_type = d->ptype; - - spin_unlock(&d->tcf_lock); - return d->tcf_action; + if (params->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = params->ptype; +unlock: + rcu_read_unlock(); + return action; err: - spin_unlock(&d->tcf_lock); qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); - return TC_ACT_SHOT; + action = TC_ACT_SHOT; + goto unlock; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -98,6 +103,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tcf_skbedit_params *params_old, *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -185,25 +191,34 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } } - 
spin_lock_bh(&d->tcf_lock); + ASSERT_RTNL(); - d->flags = flags; + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_idr_release(*a, bind); + return -ENOMEM; + } + + params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) - d->priority = *priority; + params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) - d->queue_mapping = *queue_mapping; + params_new->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) - d->mark = *mark; + params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) - d->ptype = *ptype; + params_new->ptype = *ptype; /* default behaviour is to use all the bits */ - d->mask = 0xffffffff; + params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) - d->mask = *mask; + params_new->mask = *mask; d->tcf_action = parm->action; - - spin_unlock_bh(&d->tcf_lock); + params_old = rtnl_dereference(d->params); + rcu_assign_pointer(d->params, params_new); + if (params_old) + kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -215,33 +230,36 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; - struct tcf_t t; u64 pure_flags = 0; + struct tcf_t t; + + params = rtnl_dereference(d->params); if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) + if ((params->flags & SKBEDIT_F_PRIORITY) && + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) + if ((params->flags & 
SKBEDIT_F_QUEUE_MAPPING) && + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MARK) && - nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) + if ((params->flags & SKBEDIT_F_MARK) && + nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PTYPE) && - nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) + if ((params->flags & SKBEDIT_F_PTYPE) && + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MASK) && - nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) + if ((params->flags & SKBEDIT_F_MASK) && + nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; - if (d->flags & SKBEDIT_F_INHERITDSFIELD) + if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) @@ -257,6 +275,16 @@ nla_put_failure: return -1; } +static void tcf_skbedit_cleanup(struct tc_action *a) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + + params = rcu_dereference_protected(d->params, 1); + if (params) + kfree_rcu(params, rcu); +} + static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, @@ -289,6 +317,7 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, .delete = tcf_skbedit_delete, -- cgit v1.2.3 From 0761680d521559813648fb430ddeb479c97ab060 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 11 Jul 2018 17:01:20 +0200 Subject: net: ipv4: fix listify ip_rcv_finish in case of forwarding In commit 5fa12739a53d ("net: ipv4: listify ip_rcv_finish") calling dst_input(skb) was split-out. 
The ip_sublist_rcv_finish() just calls dst_input(skb) in a loop. The problem is that ip_sublist_rcv_finish() forgot to remove the SKB from the list before invoking dst_input(). Furthermore, we need to clear skb->next as other parts of the network stack use another kind of SKB list for xmit_more (see dev_hard_start_xmit). A crash occurs if e.g. dst_input() invokes ip_forward(), which calls dst_output()/ip_output() that eventually calls __dev_queue_xmit() + sch_direct_xmit(), and a crash occurs in validate_xmit_skb_list(). This patch only fixes the crash, but there is a huge potential for a performance boost if we can pass an SKB-list through to ip_forward. Fixes: 5fa12739a53d ("net: ipv4: listify ip_rcv_finish") Signed-off-by: Jesper Dangaard Brouer Acked-by: Edward Cree Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1a3b6f32b1c9..3196cf58f418 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -530,8 +530,14 @@ static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; - list_for_each_entry_safe(skb, next, head, list) + list_for_each_entry_safe(skb, next, head, list) { + list_del(&skb->list); + /* Handle ip{6}_forward case, as sch_direct_xmit have + * another kind of SKB-list usage (see validate_xmit_skb_list) + */ + skb->next = NULL; dst_input(skb); + } } static void ip_list_rcv_finish(struct net *net, struct sock *sk, -- cgit v1.2.3 From 68d2f84a1368cc5d4ccbbbfc6821f159d27681c9 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Thu, 12 Jul 2018 16:24:59 +0900 Subject: net: gro: properly remove skb from list The following crash occurs in validate_xmit_skb_list() when the same skb is iterated multiple times in the loop and consume_skb() is called. The root cause is calling list_del_init(&skb->list) and not clearing skb->next in d4546c2509b1.
list_del_init(&skb->list) sets skb->next to point to skb itself. skb->next needs to be cleared because other parts of network stack uses another kind of SKB lists. validate_xmit_skb_list() uses such list. A similar type of bugfix was reported by Jesper Dangaard Brouer. https://patchwork.ozlabs.org/patch/942541/ This patch clears skb->next and changes list_del_init() to list_del() so that list->prev will maintain the list poison. [ 148.185511] ================================================================== [ 148.187865] BUG: KASAN: use-after-free in validate_xmit_skb_list+0x4b/0xa0 [ 148.190158] Read of size 8 at addr ffff8801e52eefc0 by task swapper/1/0 [ 148.192940] [ 148.193642] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.18.0-rc3+ #25 [ 148.195423] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014 [ 148.199129] Call Trace: [ 148.200565] [ 148.201911] dump_stack+0xc6/0x14c [ 148.203572] ? dump_stack_print_info.cold.1+0x2f/0x2f [ 148.205083] ? kmsg_dump_rewind_nolock+0x59/0x59 [ 148.206307] ? validate_xmit_skb+0x2c6/0x560 [ 148.207432] ? debug_show_held_locks+0x30/0x30 [ 148.208571] ? validate_xmit_skb_list+0x4b/0xa0 [ 148.211144] print_address_description+0x6c/0x23c [ 148.212601] ? validate_xmit_skb_list+0x4b/0xa0 [ 148.213782] kasan_report.cold.6+0x241/0x2fd [ 148.214958] validate_xmit_skb_list+0x4b/0xa0 [ 148.216494] sch_direct_xmit+0x1b0/0x680 [ 148.217601] ? dev_watchdog+0x4e0/0x4e0 [ 148.218675] ? do_raw_spin_trylock+0x10/0x120 [ 148.219818] ? do_raw_spin_lock+0xe0/0xe0 [ 148.221032] __dev_queue_xmit+0x1167/0x1810 [ 148.222155] ? sched_clock+0x5/0x10 [...] 
[ 148.474257] Allocated by task 0: [ 148.475363] kasan_kmalloc+0xbf/0xe0 [ 148.476503] kmem_cache_alloc+0xb4/0x1b0 [ 148.477654] __build_skb+0x91/0x250 [ 148.478677] build_skb+0x67/0x180 [ 148.479657] e1000_clean_rx_irq+0x542/0x8a0 [ 148.480757] e1000_clean+0x652/0xd10 [ 148.481772] net_rx_action+0x4ea/0xc20 [ 148.482808] __do_softirq+0x1f9/0x574 [ 148.483831] [ 148.484575] Freed by task 0: [ 148.485504] __kasan_slab_free+0x12e/0x180 [ 148.486589] kmem_cache_free+0xb4/0x240 [ 148.487634] kfree_skbmem+0xed/0x150 [ 148.488648] consume_skb+0x146/0x250 [ 148.489665] validate_xmit_skb+0x2b7/0x560 [ 148.490754] validate_xmit_skb_list+0x70/0xa0 [ 148.491897] sch_direct_xmit+0x1b0/0x680 [ 148.493949] __dev_queue_xmit+0x1167/0x1810 [ 148.495103] br_dev_queue_push_xmit+0xce/0x250 [ 148.496196] br_forward_finish+0x276/0x280 [ 148.497234] __br_forward+0x44f/0x520 [ 148.498260] br_forward+0x19f/0x1b0 [ 148.499264] br_handle_frame_finish+0x65e/0x980 [ 148.500398] NF_HOOK.constprop.10+0x290/0x2a0 [ 148.501522] br_handle_frame+0x417/0x640 [ 148.502582] __netif_receive_skb_core+0xaac/0x18f0 [ 148.503753] __netif_receive_skb_one_core+0x98/0x120 [ 148.504958] netif_receive_skb_internal+0xe3/0x330 [ 148.506154] napi_gro_complete+0x190/0x2a0 [ 148.507243] dev_gro_receive+0x9f7/0x1100 [ 148.508316] napi_gro_receive+0xcb/0x260 [ 148.509387] e1000_clean_rx_irq+0x2fc/0x8a0 [ 148.510501] e1000_clean+0x652/0xd10 [ 148.511523] net_rx_action+0x4ea/0xc20 [ 148.512566] __do_softirq+0x1f9/0x574 [ 148.513598] [ 148.514346] The buggy address belongs to the object at ffff8801e52eefc0 [ 148.514346] which belongs to the cache skbuff_head_cache of size 232 [ 148.517047] The buggy address is located 0 bytes inside of [ 148.517047] 232-byte region [ffff8801e52eefc0, ffff8801e52ef0a8) [ 148.519549] The buggy address belongs to the page: [ 148.520726] page:ffffea000794bb00 count:1 mapcount:0 mapping:ffff880106f4dfc0 index:0xffff8801e52ee840 compound_mapcount: 0 [ 148.524325] flags: 
0x17ffffc0008100(slab|head) [ 148.525481] raw: 0017ffffc0008100 ffff880106b938d0 ffff880106b938d0 ffff880106f4dfc0 [ 148.527503] raw: ffff8801e52ee840 0000000000190011 00000001ffffffff 0000000000000000 [ 148.529547] page dumped because: kasan: bad access detected Fixes: d4546c2509b1 ("net: Convert GRO SKB handling to list_head.") Signed-off-by: Prashant Bhole Reported-by: Tyler Hicks Tested-by: Tyler Hicks Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1c3f0997e857..14a748ee8cc9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5280,7 +5280,8 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - list_del_init(&skb->list); + list_del(&skb->list); + skb->next = NULL; napi_gro_complete(skb); napi->gro_count--; napi->gro_hash[index].count--; @@ -5461,7 +5462,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - list_del_init(&pp->list); + list_del(&pp->list); + pp->next = NULL; napi_gro_complete(pp); napi->gro_count--; napi->gro_hash[hash].count--; -- cgit v1.2.3 From b16ebe925a4400a2ec3dc663c81dce2fd9bf0998 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:08 +0300 Subject: devlink: Add support for creating and destroying regions This allows a device to register its supported address regions. Each address region can be accessed directly for example reading the snapshots taken of this address space. Drivers are not limited in the name selection for different regions. An example of a region-name can be: pci cr-space, register-space. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 22 ++++++++++++++ net/core/devlink.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index f67c29cede15..e5397652f2fb 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -28,6 +28,7 @@ struct devlink { struct list_head dpipe_table_list; struct list_head resource_list; struct list_head param_list; + struct list_head region_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -397,6 +398,8 @@ enum devlink_param_generic_id { .validate = _validate, \ } +struct devlink_region; + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -543,6 +546,11 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val); void devlink_param_value_changed(struct devlink *devlink, u32 param_id); +struct devlink_region *devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size); +void devlink_region_destroy(struct devlink_region *region); #else @@ -770,6 +778,20 @@ devlink_param_value_changed(struct devlink *devlink, u32 param_id) { } +static inline struct devlink_region * +devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size) +{ + return NULL; +} + +static inline void +devlink_region_destroy(struct devlink_region *region) +{ +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 470f3dbfecfe..cac856136ac6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -326,6 +326,28 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, pool_type, p_tc_index); 
} +struct devlink_region { + struct devlink *devlink; + struct list_head list; + const char *name; + struct list_head snapshot_list; + u32 max_snapshots; + u32 cur_snapshots; + u64 size; +}; + +static struct devlink_region * +devlink_region_get_by_name(struct devlink *devlink, const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + if (!strcmp(region->name, region_name)) + return region; + + return NULL; +} + #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -3358,6 +3380,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->param_list); + INIT_LIST_HEAD(&devlink->region_list); mutex_init(&devlink->lock); return devlink; } @@ -4109,6 +4132,67 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) } EXPORT_SYMBOL_GPL(devlink_param_value_changed); +/** + * devlink_region_create - create a new address region + * + * @devlink: devlink + * @region_name: region name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + */ +struct devlink_region *devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size) +{ + struct devlink_region *region; + int err = 0; + + mutex_lock(&devlink->lock); + + if (devlink_region_get_by_name(devlink, region_name)) { + err = -EEXIST; + goto unlock; + } + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + err = -ENOMEM; + goto unlock; + } + + region->devlink = devlink; + region->max_snapshots = region_max_snapshots; + region->name = region_name; + region->size = region_size; + INIT_LIST_HEAD(&region->snapshot_list); + list_add_tail(&region->list, &devlink->region_list); + + mutex_unlock(&devlink->lock); + return
region; + +unlock: + mutex_unlock(&devlink->lock); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(devlink_region_create); + +/** + * devlink_region_destroy - destroy address region + * + * @region: devlink region to destroy + */ +void devlink_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + + mutex_lock(&devlink->lock); + list_del(&region->list); + mutex_unlock(&devlink->lock); + kfree(region); +} +EXPORT_SYMBOL_GPL(devlink_region_destroy); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From ccadfa444b34c6ec7bb458eee17fdd8c9a456c63 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:09 +0300 Subject: devlink: Add callback to query for snapshot id before snapshot create To restrict the driver with the snapshot ID selection a new callback is introduced for the driver to get the snapshot ID before creating a new snapshot. This will also allow giving the same ID for multiple snapshots taken of different regions at the same time. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/devlink.h | 8 ++++++++ net/core/devlink.c | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index e5397652f2fb..f27d8593687a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -29,6 +29,7 @@ struct devlink { struct list_head resource_list; struct list_head param_list; struct list_head region_list; + u32 snapshot_id; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -551,6 +552,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, u32 region_max_snapshots, u64 region_size); void devlink_region_destroy(struct devlink_region *region); +u32 devlink_region_shapshot_id_get(struct devlink *devlink); #else @@ -792,6 +794,12 @@ devlink_region_destroy(struct devlink_region *region) { } +static inline u32 +devlink_region_shapshot_id_get(struct devlink *devlink) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index cac856136ac6..6c92ddd2465d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4193,6 +4193,27 @@ void devlink_region_destroy(struct devlink_region *region) } EXPORT_SYMBOL_GPL(devlink_region_destroy); +/** + * devlink_region_shapshot_id_get - get snapshot ID + * + * This callback should be called when adding a new snapshot, + * Driver should use the same id for multiple snapshots taken + * on multiple regions at the same time/by the same trigger. 
+ * + * @devlink: devlink + */ +u32 devlink_region_shapshot_id_get(struct devlink *devlink) +{ + u32 id; + + mutex_lock(&devlink->lock); + id = ++devlink->snapshot_id; + mutex_unlock(&devlink->lock); + + return id; +} +EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From d7e5272282d93bedbbeb6174b8af8425d7dcfd6f Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:10 +0300 Subject: devlink: Add support for creating region snapshots Each device address region can store multiple snapshots; each snapshot is identified using a different numerical ID. This ID is used when deleting a snapshot or showing an address region specific snapshot. This patch exposes a callback to add a new snapshot to an address region. The snapshot will be deleted using the destructor function when destroying a region or when a snapshot delete command is issued from the devlink user tool. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/devlink.h | 13 +++++++ net/core/devlink.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index f27d8593687a..905f0bb7b4ba 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -401,6 +401,8 @@ enum devlink_param_generic_id { struct devlink_region; +typedef void devlink_snapshot_data_dest_t(const void *data); + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -553,6 +555,9 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, u64 region_size); void devlink_region_destroy(struct devlink_region *region); u32 devlink_region_shapshot_id_get(struct devlink *devlink); +int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor); #else @@ -800,6 +805,14 @@ devlink_region_shapshot_id_get(struct devlink *devlink) return 0; } +static inline int +devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 6c92ddd2465d..7d09fe60fa4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -336,6 +336,15 @@ struct devlink_region { u64 size; }; +struct devlink_snapshot { + struct list_head list; + struct devlink_region *region; + devlink_snapshot_data_dest_t *data_destructor; + u64 data_len; + u8 *data; + u32 id; +}; + static struct devlink_region * devlink_region_get_by_name(struct devlink *devlink, const char *region_name) { @@ -348,6 +357,26 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name) return NULL; } +static struct devlink_snapshot * 
+devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) +{ + struct devlink_snapshot *snapshot; + + list_for_each_entry(snapshot, &region->snapshot_list, list) + if (snapshot->id == id) + return snapshot; + + return NULL; +} + +static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot) +{ + snapshot->region->cur_snapshots--; + list_del(&snapshot->list); + (*snapshot->data_destructor)(snapshot->data); + kfree(snapshot); +} + #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -4185,8 +4214,14 @@ EXPORT_SYMBOL_GPL(devlink_region_create); void devlink_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot, *ts; mutex_lock(&devlink->lock); + + /* Free all snapshots of region */ + list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list) + devlink_region_snapshot_del(snapshot); + list_del(&region->list); mutex_unlock(&devlink->lock); kfree(region); @@ -4214,6 +4249,66 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); +/** + * devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function.
+ * + * @devlink_region: devlink region of the snapshot + * @data_len: size of snapshot data + * @data: snapshot data + * @snapshot_id: snapshot id to be created + * @data_destructor: pointer to destructor function to free data + */ +int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + mutex_lock(&devlink->lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) { + err = -ENOMEM; + goto unlock; + } + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + err = -EEXIST; + goto unlock; + } + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) { + err = -ENOMEM; + goto unlock; + } + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + snapshot->data_len = data_len; + snapshot->data_destructor = data_destructor; + + list_add_tail(&snapshot->list, &region->snapshot_list); + + region->cur_snapshots++; + + mutex_unlock(&devlink->lock); + return 0; + +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From d8db7ea55f2ff5890ad31137233a3808d80c7f62 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:11 +0300 Subject: devlink: Add support for region get command Add support for DEVLINK_CMD_REGION_GET command which is used for querying for the supported DEV/REGION values of devlink devices. The support is both for doit and dumpit. Reply includes: BUS_NAME, DEVICE_NAME, REGION_NAME, REGION_SIZE Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/uapi/linux/devlink.h | 6 +++ net/core/devlink.c | 114 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 68641fb56654..28bfa8aa3d91 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -83,6 +83,9 @@ enum devlink_command { DEVLINK_CMD_PARAM_NEW, DEVLINK_CMD_PARAM_DEL, + DEVLINK_CMD_REGION_GET, + DEVLINK_CMD_REGION_SET, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -262,6 +265,9 @@ enum devlink_attr { DEVLINK_ATTR_PARAM_VALUE_DATA, /* dynamic */ DEVLINK_ATTR_PARAM_VALUE_CMODE, /* u8 */ + DEVLINK_ATTR_REGION_NAME, /* string */ + DEVLINK_ATTR_REGION_SIZE, /* u64 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 7d09fe60fa4b..221ddb6bae48 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3149,6 +3149,111 @@ static void devlink_param_unregister_one(struct devlink *devlink, kfree(param_item); } +static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct devlink_region *region) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto nla_put_failure; + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = 
info->user_ptr[0]; + struct devlink_region *region; + const char *region_name; + struct sk_buff *msg; + int err; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, + info->snd_portid, info->snd_seq, 0, + region); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_region *region; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + + mutex_lock(&devlink->lock); + list_for_each_entry(region, &devlink->region_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_region_fill(msg, devlink, + DEVLINK_CMD_REGION_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, region); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + cb->args[0] = idx; + return msg->len; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -3172,6 +3277,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, }; static const 
struct genl_ops devlink_nl_ops[] = { @@ -3370,6 +3476,14 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .doit = devlink_nl_cmd_region_get_doit, + .dumpit = devlink_nl_cmd_region_get_dumpit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From a006d467fbf1d405e73cd167829d7a9e3df600e3 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:12 +0300 Subject: devlink: Extend the support querying for region snapshot IDs Extend the support for DEVLINK_CMD_REGION_GET command to also return the IDs of the snapshot currently present on the region. Each reply will include a nested snapshots attribute that can contain multiple snapshot attributes each with an ID. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 28bfa8aa3d91..abde4e306375 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -267,6 +267,9 @@ enum devlink_attr { DEVLINK_ATTR_REGION_NAME, /* string */ DEVLINK_ATTR_REGION_SIZE, /* u64 */ + DEVLINK_ATTR_REGION_SNAPSHOTS, /* nested */ + DEVLINK_ATTR_REGION_SNAPSHOT, /* nested */ + DEVLINK_ATTR_REGION_SNAPSHOT_ID, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 221ddb6bae48..cb75e26d70ff 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3149,6 +3149,55 @@ static void devlink_param_unregister_one(struct devlink *devlink, kfree(param_item); } +static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_snapshot *snapshot) +{ + struct nlattr *snap_attr; + int err; + + snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT); + if (!snap_attr) + return -EINVAL; + + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, snap_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snap_attr); + return err; +} + +static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_region *region) +{ + struct devlink_snapshot *snapshot; + struct nlattr *snapshots_attr; + int err; + + snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); + if (!snapshots_attr) + return -EINVAL; + + list_for_each_entry(snapshot, &region->snapshot_list, list) { + err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); + if (err) + goto nla_put_failure; + } + + nla_nest_end(msg, snapshots_attr); + return 0; + +nla_put_failure: +
nla_nest_cancel(msg, snapshots_attr); + return err; +} + static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, @@ -3175,6 +3224,10 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; + err = devlink_nl_region_snapshots_id_put(msg, devlink, region); + if (err) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From 866319bb9437614407ca36f8b16f89ab77a6a831 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:13 +0300 Subject: devlink: Add support for region snapshot delete command Add support for DEVLINK_CMD_REGION_DEL used for deleting a snapshot from a region. The snapshot ID is required. Also added notification support for NEW and DEL of snapshots. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 93 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index abde4e306375..d212e02f843f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -85,6 +85,8 @@ enum devlink_command { DEVLINK_CMD_REGION_GET, DEVLINK_CMD_REGION_SET, + DEVLINK_CMD_REGION_NEW, + DEVLINK_CMD_REGION_DEL, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index cb75e26d70ff..fc0836371a71 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3236,6 +3236,58 @@ nla_put_failure: return err; } +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + void *hdr; + int err; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = 
nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out_cancel_msg; + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, + region->name); + if (err) + goto out_cancel_msg; + + if (snapshot) { + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, + snapshot->id); + if (err) + goto out_cancel_msg; + } else { + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, DEVLINK_ATTR_PAD); + if (err) + goto out_cancel_msg; + } + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + + return; + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); +} + static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -3307,6 +3359,35 @@ out: return msg->len; } +static int devlink_nl_cmd_region_del(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct devlink_region *region; + const char *region_name; + u32 snapshot_id; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME] || + !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + return -EINVAL; + + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) + return -EINVAL; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); + devlink_region_snapshot_del(snapshot); + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = 
NLA_NUL_STRING }, @@ -3331,6 +3412,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -3537,6 +3619,13 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_REGION_DEL, + .doit = devlink_nl_cmd_region_del, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -4363,6 +4452,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); list_add_tail(&region->list, &devlink->region_list); + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); mutex_unlock(&devlink->lock); return region; @@ -4390,6 +4480,8 @@ void devlink_region_destroy(struct devlink_region *region) devlink_region_snapshot_del(snapshot); list_del(&region->list); + + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); mutex_unlock(&devlink->lock); kfree(region); } @@ -4467,6 +4559,7 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, region->cur_snapshots++; + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); mutex_unlock(&devlink->lock); return 0; -- cgit v1.2.3 From 4e54795a27f56102649f121a34b8445e42f79ccd Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:14 +0300 Subject: devlink: Add support for region snapshot read command Add support for DEVLINK_CMD_REGION_READ_GET used for both reading and dumping region data. Read allows reading from a region specific address for given length.
Dump allows reading the full region. If only snapshot ID is provided a snapshot dump will be done. If snapshot ID, Address and Length are provided a snapshot read will be done. This is used for both snapshot access and will be used in the same way to access current data on the region. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 7 ++ net/core/devlink.c | 182 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index d212e02f843f..79407bbd296d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -87,6 +87,7 @@ enum devlink_command { DEVLINK_CMD_REGION_SET, DEVLINK_CMD_REGION_NEW, DEVLINK_CMD_REGION_DEL, + DEVLINK_CMD_REGION_READ, /* add new commands above here */ __DEVLINK_CMD_MAX, @@ -273,6 +274,12 @@ enum devlink_attr { DEVLINK_ATTR_REGION_SNAPSHOT, /* nested */ DEVLINK_ATTR_REGION_SNAPSHOT_ID, /* u32 */ + DEVLINK_ATTR_REGION_CHUNKS, /* nested */ + DEVLINK_ATTR_REGION_CHUNK, /* nested */ + DEVLINK_ATTR_REGION_CHUNK_DATA, /* binary */ + DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */ + DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index fc0836371a71..e5118dba6bb4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3388,6 +3388,181 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, return 0; } +static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, + struct devlink *devlink, + u8 *chunk, u32 chunk_size, + u64 addr) +{ + struct nlattr *chunk_attr; + int err; + + chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK); + if (!chunk_attr) + return -EINVAL; + + err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); + if (err) + goto nla_put_failure; + + err =
nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, chunk_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, chunk_attr); + return err; +} + +#define DEVLINK_REGION_READ_CHUNK_SIZE 256 + +static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, + struct devlink *devlink, + struct devlink_region *region, + struct nlattr **attrs, + u64 start_offset, + u64 end_offset, + bool dump, + u64 *new_offset) +{ + struct devlink_snapshot *snapshot; + u64 curr_offset = start_offset; + u32 snapshot_id; + int err = 0; + + *new_offset = start_offset; + + snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) + return -EINVAL; + + if (end_offset > snapshot->data_len || dump) + end_offset = snapshot->data_len; + + while (curr_offset < end_offset) { + u32 data_size; + u8 *data; + + if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE) + data_size = end_offset - curr_offset; + else + data_size = DEVLINK_REGION_READ_CHUNK_SIZE; + + data = &snapshot->data[curr_offset]; + err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink, + data, data_size, + curr_offset); + if (err) + break; + + curr_offset += data_size; + } + *new_offset = curr_offset; + + return err; +} + +static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + u64 ret_offset, start_offset, end_offset = 0; + struct nlattr *attrs[DEVLINK_ATTR_MAX + 1]; + const struct genl_ops *ops = cb->data; + struct devlink_region *region; + struct nlattr *chunks_attr; + const char *region_name; + struct devlink *devlink; + bool dump = true; + void *hdr; + int err; + + start_offset = *((u64 *)&cb->args[0]); + + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); + if (err) + goto out; + + devlink = 
devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + goto out; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); + + if (!attrs[DEVLINK_ATTR_REGION_NAME] || + !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + goto out_unlock; + + region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + goto out_unlock; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, + DEVLINK_CMD_REGION_READ); + if (!hdr) + goto out_unlock; + + err = devlink_nl_put_handle(skb, devlink); + if (err) + goto nla_put_failure; + + err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); + if (err) + goto nla_put_failure; + + chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS); + if (!chunks_attr) + goto nla_put_failure; + + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + dump = false; + } + + err = devlink_nl_region_read_snapshot_fill(skb, devlink, + region, attrs, + start_offset, + end_offset, dump, + &ret_offset); + + if (err && err != -EMSGSIZE) + goto nla_put_failure; + + /* Check if there was any progress done to prevent infinite loop */ + if (ret_offset == start_offset) + goto nla_put_failure; + + *((u64 *)&cb->args[0]) = ret_offset; + + nla_nest_end(skb, chunks_attr); + genlmsg_end(skb, hdr); + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); +out_unlock: + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); +out: + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = 
NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -3626,6 +3801,13 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_REGION_READ, + .dumpit = devlink_nl_cmd_region_read_dumpit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From f6a69885f2e38be0229ab9f6a2d9d4a1b4ba2be5 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:17 +0300 Subject: devlink: Add generic parameters region_snapshot region_snapshot - When set enables capturing region snapshots Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 905f0bb7b4ba..b9b89d6604d4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -361,6 +361,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -376,6 +377,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME "enable_sriov" #define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME "region_snapshot_enable" +#define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index e5118dba6bb4..65fc366a78a4 100644 --- a/net/core/devlink.c +++ 
b/net/core/devlink.c @@ -2671,6 +2671,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, + .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 811e299f4645588cc7a1b78d97b6847c155324b9 Mon Sep 17 00:00:00 2001 From: Romuald CARI Date: Thu, 7 Jun 2018 16:08:02 +0200 Subject: ieee802154: add rx LQI from userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Link Quality Indication data exposed by drivers could not be accessed from userspace. Since this data is per-datagram received, it makes sense to make it available to userspace application through the ancillary data mechanism in recvmsg rather than through ioctls. This can be activated using the socket option WPAN_WANTLQI under SOL_IEEE802154 protocol. This LQI data is available in the ancillary data buffer under the SOL_IEEE802154 level as the type WPAN_LQI. The value is an unsigned byte indicating the link quality with values ranging 0-255. 
Signed-off-by: Romuald Cari Signed-off-by: Clément Peron Signed-off-by: Stefan Schmidt --- include/net/af_ieee802154.h | 1 + net/ieee802154/socket.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/net/af_ieee802154.h b/include/net/af_ieee802154.h index a5563d27a3eb..8003a9f6eb43 100644 --- a/include/net/af_ieee802154.h +++ b/include/net/af_ieee802154.h @@ -56,6 +56,7 @@ struct sockaddr_ieee802154 { #define WPAN_WANTACK 0 #define WPAN_SECURITY 1 #define WPAN_SECURITY_LEVEL 2 +#define WPAN_WANTLQI 3 #define WPAN_SECURITY_DEFAULT 0 #define WPAN_SECURITY_OFF 1 diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a60658c85a9a..bc6b912603f1 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -25,6 +25,7 @@ #include /* For TIOCOUTQ/INQ */ #include #include +#include #include #include #include @@ -452,6 +453,7 @@ struct dgram_sock { unsigned int bound:1; unsigned int connected:1; unsigned int want_ack:1; + unsigned int want_lqi:1; unsigned int secen:1; unsigned int secen_override:1; unsigned int seclevel:3; @@ -486,6 +488,7 @@ static int dgram_init(struct sock *sk) struct dgram_sock *ro = dgram_sk(sk); ro->want_ack = 1; + ro->want_lqi = 0; return 0; } @@ -713,6 +716,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; + struct dgram_sock *ro = dgram_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, noblock, &err); @@ -744,6 +748,13 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, *addr_len = sizeof(*saddr); } + if (ro->want_lqi) { + err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI, + sizeof(uint8_t), &(mac_cb(skb)->lqi)); + if (err) + goto done; + } + if (flags & MSG_TRUNC) copied = skb->len; done: @@ -847,6 +858,9 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, case WPAN_WANTACK: val = 
ro->want_ack; break; + case WPAN_WANTLQI: + val = ro->want_lqi; + break; case WPAN_SECURITY: if (!ro->secen_override) val = WPAN_SECURITY_DEFAULT; @@ -892,6 +906,9 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, case WPAN_WANTACK: ro->want_ack = !!val; break; + case WPAN_WANTLQI: + ro->want_lqi = !!val; + break; case WPAN_SECURITY: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { -- cgit v1.2.3 From 4f91da26c81145f255cb153152ffed70014b1c41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:38 -0700 Subject: xdp: add per mode attributes for attached programs In preparation for support of simultaneous driver and hardware XDP support add per-mode attributes. The catch-all IFLA_XDP_PROG_ID will still be reported, but user space can now also access the program ID in a new IFLA_XDP_<mode>_PROG_ID attribute. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_link.h | 3 +++ net/core/rtnetlink.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cf01b6824244..bc86c2b105ec 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -928,6 +928,9 @@ enum { IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, __IFLA_XDP_MAX, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ef61222fdef..b40242459907 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4); /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID */ + nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } @@ -1378,16 +1379,17 @@ static u8
rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { + u32 prog_attr, prog_id; struct nlattr *xdp; - u32 prog_id; int err; + u8 mode; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, - rtnl_xdp_attached_mode(dev, &prog_id)); + mode = rtnl_xdp_attached_mode(dev, &prog_id); + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; @@ -1395,6 +1397,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; + + switch (mode) { + case XDP_ATTACHED_DRV: + prog_attr = IFLA_XDP_DRV_PROG_ID; + break; + case XDP_ATTACHED_SKB: + prog_attr = IFLA_XDP_SKB_PROG_ID; + break; + case XDP_ATTACHED_HW: + prog_attr = IFLA_XDP_HW_PROG_ID; + break; + case XDP_ATTACHED_NONE: + default: + err = -EINVAL; + goto err_cancel; + } + + err = nla_put_u32(skb, prog_attr, prog_id); + if (err) + goto err_cancel; } nla_nest_end(skb, xdp); -- cgit v1.2.3 From 6b8675897338f874c41612655a85d8e10cdb23d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:39 -0700 Subject: xdp: don't make drivers report attachment mode prog_attached of struct netdev_bpf should have been superseded by simply setting prog_id long time ago, but we kept it around to allow offloading drivers to communicate attachment mode (drv vs hw). Subsequently drivers were also allowed to report back attachment flags (prog_flags), and since nowadays only programs attached with XDP_FLAGS_HW_MODE can get offloaded, we can tell the attachment mode from the flags driver reports. Remove prog_attached member.
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 - drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 - drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 1 - drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 1 - drivers/net/netdevsim/bpf.c | 1 - drivers/net/tun.c | 1 - drivers/net/virtio_net.c | 1 - include/linux/netdevice.h | 5 ----- net/core/dev.c | 7 +++---- net/core/rtnetlink.c | 8 ++++++-- 15 files changed, 9 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 1f0e872d0667..0584d07c8c33 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -219,7 +219,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) rc = bnxt_xdp_set(bp, xdp->prog); break; case XDP_QUERY_PROG: - xdp->prog_attached = !!bp->xdp_prog; xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0; rc = 0; break; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 135766c4296b..768f584f8392 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1848,7 +1848,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return nicvf_xdp_setup(nic, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!nic->xdp_prog; xdp->prog_id = nic->xdp_prog ? 
nic->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 426b0ccb1fc6..51762428b40e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11841,7 +11841,6 @@ static int i40e_xdp(struct net_device *dev, case XDP_SETUP_PROG: return i40e_xdp_setup(vsi, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = i40e_enabled_xdp_vsi(vsi); xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a8e21becb619..3862fea1c923 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9966,7 +9966,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return ixgbe_xdp_setup(dev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!(adapter->xdp_prog); xdp->prog_id = adapter->xdp_prog ? adapter->xdp_prog->aux->id : 0; return 0; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 59416eddd840..d86446d202d5 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4462,7 +4462,6 @@ static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return ixgbevf_xdp_setup(dev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!(adapter->xdp_prog); xdp->prog_id = adapter->xdp_prog ? 
adapter->xdp_prog->aux->id : 0; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 65eb06e017e4..6785661d1a72 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2926,7 +2926,6 @@ static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) return mlx4_xdp_set(dev, xdp->prog); case XDP_QUERY_PROG: xdp->prog_id = mlx4_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bbd2fd0b2e06..e4a9a0768a81 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4192,7 +4192,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) return mlx5e_xdp_set(dev, xdp->prog); case XDP_QUERY_PROG: xdp->prog_id = mlx5e_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 7df5ca37bfb8..d20714598613 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3459,9 +3459,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, xdp->extack); case XDP_QUERY_PROG: - xdp->prog_attached = !!nn->xdp_prog; - if (nn->dp.bpf_offload_xdp) - xdp->prog_attached = XDP_ATTACHED_HW; xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0; xdp->prog_flags = nn->xdp_prog ? 
nn->xdp_flags : 0; return 0; diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index b823bfe2ea4d..f9a327c821eb 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1116,7 +1116,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return qede_xdp_set(edev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!edev->xdp_prog; xdp->prog_id = edev->xdp_prog ? edev->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 75c25306d234..712e6f918065 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -567,7 +567,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) nsim_bpf_destroy_prog(bpf->offload.prog); return 0; case XDP_QUERY_PROG: - bpf->prog_attached = ns->xdp_prog_mode; bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0; bpf->prog_flags = ns->xdp_prog ? 
ns->xdp_flags : 0; return 0; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a192a017cc68..49a50219d0da 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1268,7 +1268,6 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) return tun_xdp_set(dev, xdp->prog, xdp->extack); case XDP_QUERY_PROG: xdp->prog_id = tun_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 53085c63277b..2ff08bc103a9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2343,7 +2343,6 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) return virtnet_xdp_set(dev, xdp->prog, xdp->extack); case XDP_QUERY_PROG: xdp->prog_id = virtnet_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..69a664789b33 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -819,10 +819,6 @@ enum bpf_netdev_command { */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, - /* Check if a bpf program is set on the device. The callee should - * set @prog_attached to one of XDP_ATTACHED_* values, note that "true" - * is equivalent to XDP_ATTACHED_DRV. - */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_VERIFIER_PREP, @@ -849,7 +845,6 @@ struct netdev_bpf { }; /* XDP_QUERY_PROG */ struct { - u8 prog_attached; u32 prog_id; /* flags with which program was installed */ u32 prog_flags; diff --git a/net/core/dev.c b/net/core/dev.c index 89825c1eccdc..9fa3b3705a8e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4926,7 +4926,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) break; case XDP_QUERY_PROG: - xdp->prog_attached = !!old; xdp->prog_id = old ? 
old->aux->id : 0; break; @@ -7593,13 +7592,13 @@ void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, WARN_ON(bpf_op(dev, xdp) < 0); } -static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) { struct netdev_bpf xdp; __dev_xdp_query(dev, bpf_op, &xdp); - return xdp.prog_attached; + return xdp.prog_id; } static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, @@ -7634,7 +7633,7 @@ static void dev_xdp_uninstall(struct net_device *dev) return; __dev_xdp_query(dev, ndo_bpf, &xdp); - if (xdp.prog_attached == XDP_ATTACHED_NONE) + if (!xdp.prog_id) return; /* Program removal should always succeed */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b40242459907..02ebc056a688 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1372,9 +1372,13 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) return XDP_ATTACHED_NONE; __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - *prog_id = xdp.prog_id; + if (!xdp.prog_id) + return XDP_ATTACHED_NONE; - return xdp.prog_attached; + *prog_id = xdp.prog_id; + if (xdp.prog_flags & XDP_FLAGS_HW_MODE) + return XDP_ATTACHED_HW; + return XDP_ATTACHED_DRV; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From 05296620f6d14dce0030b87e1e57891a770fb65c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:40 -0700 Subject: xdp: factor out common program/flags handling from drivers Basic operations drivers perform during xdp setup and query can be moved to helpers in the core. Encapsulate program and flags into a structure and add helpers. Note that the structure is intended as the "main" program information source in the driver. Most drivers will additionally place the program pointer in their fast path or ring structures. 
The helpers don't have a huge impact now, but they will decrease the code duplication when programs can be installed in HW and driver at the same time. Encapsulating the basic operations in helpers will hopefully also reduce the number of changes to drivers which adopt them. Helpers could really be static inline, but they depend on definition of struct netdev_bpf which means they'd have to be placed in netdevice.h, an already 4500 line header. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_net.h | 6 ++-- .../net/ethernet/netronome/nfp/nfp_net_common.c | 28 +++++++----------- drivers/net/netdevsim/bpf.c | 16 +++------- drivers/net/netdevsim/netdevsim.h | 4 +-- include/net/xdp.h | 13 +++++++++ net/core/xdp.c | 34 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_offload.py | 4 +-- 7 files changed, 67 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 2a71a9ffd095..2021dda595b7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -553,8 +553,7 @@ struct nfp_net_dp { * @rss_cfg: RSS configuration * @rss_key: RSS secret key * @rss_itbl: RSS indirection table - * @xdp_flags: Flags with which XDP prog was loaded - * @xdp_prog: XDP prog (for ctrl path, both DRV and HW modes) + * @xdp: Information about the attached XDP program * @max_r_vecs: Number of allocated interrupt vectors for RX/TX * @max_tx_rings: Maximum number of TX rings supported by the Firmware * @max_rx_rings: Maximum number of RX rings supported by the Firmware @@ -610,8 +609,7 @@ struct nfp_net { u8 rss_key[NFP_NET_CFG_RSS_KEY_SZ]; u8 rss_itbl[NFP_NET_CFG_RSS_ITBL_SZ]; - u32 xdp_flags; - struct bpf_prog *xdp_prog; + struct xdp_attachment_info xdp; unsigned int max_tx_rings; unsigned int max_rx_rings; diff --git 
a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index d20714598613..4bb589dbffbc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3417,34 +3417,29 @@ nfp_net_xdp_setup_drv(struct nfp_net *nn, struct bpf_prog *prog, return nfp_net_ring_reconfig(nn, dp, extack); } -static int -nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, - struct netlink_ext_ack *extack) +static int nfp_net_xdp_setup(struct nfp_net *nn, struct netdev_bpf *bpf) { struct bpf_prog *drv_prog, *offload_prog; int err; - if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES) + if (!xdp_attachment_flags_ok(&nn->xdp, bpf)) return -EBUSY; /* Load both when no flags set to allow easy activation of driver path * when program is replaced by one which can't be offloaded. */ - drv_prog = flags & XDP_FLAGS_HW_MODE ? NULL : prog; - offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog; + drv_prog = bpf->flags & XDP_FLAGS_HW_MODE ? NULL : bpf->prog; + offload_prog = bpf->flags & XDP_FLAGS_DRV_MODE ? NULL : bpf->prog; - err = nfp_net_xdp_setup_drv(nn, drv_prog, extack); + err = nfp_net_xdp_setup_drv(nn, drv_prog, bpf->extack); if (err) return err; - err = nfp_app_xdp_offload(nn->app, nn, offload_prog, extack); - if (err && flags & XDP_FLAGS_HW_MODE) + err = nfp_app_xdp_offload(nn->app, nn, offload_prog, bpf->extack); + if (err && bpf->flags & XDP_FLAGS_HW_MODE) return err; - if (nn->xdp_prog) - bpf_prog_put(nn->xdp_prog); - nn->xdp_prog = prog; - nn->xdp_flags = flags; + xdp_attachment_setup(&nn->xdp, bpf); return 0; } @@ -3456,12 +3451,9 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: case XDP_SETUP_PROG_HW: - return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, - xdp->extack); + return nfp_net_xdp_setup(nn, xdp); case XDP_QUERY_PROG: - xdp->prog_id = nn->xdp_prog ? 
nn->xdp_prog->aux->id : 0; - xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0; - return 0; + return xdp_attachment_query(&nn->xdp, xdp); default: return nfp_app_bpf(nn->app, nn, xdp); } diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 712e6f918065..c485d97b5df4 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -199,10 +199,8 @@ static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { int err; - if (ns->xdp_prog && (bpf->flags ^ ns->xdp_flags) & XDP_FLAGS_MODES) { - NSIM_EA(bpf->extack, "program loaded with different flags"); + if (!xdp_attachment_flags_ok(&ns->xdp, bpf)) return -EBUSY; - } if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); @@ -219,11 +217,7 @@ static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf) return err; } - if (ns->xdp_prog) - bpf_prog_put(ns->xdp_prog); - - ns->xdp_prog = bpf->prog; - ns->xdp_flags = bpf->flags; + xdp_attachment_setup(&ns->xdp, bpf); if (!bpf->prog) ns->xdp_prog_mode = XDP_ATTACHED_NONE; @@ -567,9 +561,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) nsim_bpf_destroy_prog(bpf->offload.prog); return 0; case XDP_QUERY_PROG: - bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0; - bpf->prog_flags = ns->xdp_prog ? 
ns->xdp_flags : 0; - return 0; + return xdp_attachment_query(&ns->xdp, bpf); case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) @@ -636,6 +628,6 @@ void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(!list_empty(&ns->bpf_bound_progs)); WARN_ON(!list_empty(&ns->bpf_bound_maps)); - WARN_ON(ns->xdp_prog); + WARN_ON(ns->xdp.prog); WARN_ON(ns->bpf_offloaded); } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index d8a7cc995e88..69ffb4a2d14b 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -18,6 +18,7 @@ #include #include #include +#include #define DRV_NAME "netdevsim" @@ -67,9 +68,8 @@ struct netdevsim { struct bpf_prog *bpf_offloaded; u32 bpf_offloaded_id; - u32 xdp_flags; + struct xdp_attachment_info xdp; int xdp_prog_mode; - struct bpf_prog *xdp_prog; u32 prog_id_gen; diff --git a/include/net/xdp.h b/include/net/xdp.h index 2deea7166a34..fcb033f51d8c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -144,4 +144,17 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) return unlikely(xdp->data_meta > xdp->data); } +struct xdp_attachment_info { + struct bpf_prog *prog; + u32 flags; +}; + +struct netdev_bpf; +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); + #endif /* __LINUX_NET_XDP_H__ */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 31c58719b5a9..57285383ed00 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -3,8 +3,11 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. * Released under terms in GPL version 2. See COPYING. 
*/ +#include +#include #include #include +#include #include #include #include @@ -370,3 +373,34 @@ void xdp_return_buff(struct xdp_buff *xdp) __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); } EXPORT_SYMBOL_GPL(xdp_return_buff); + +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + bpf->prog_id = info->prog ? info->prog->aux->id : 0; + bpf->prog_flags = info->prog ? info->flags : 0; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_attachment_query); + +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { + NL_SET_ERR_MSG(bpf->extack, + "program loaded with different flags"); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index f8d9bd81d9a4..40401e9e9351 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -821,7 +821,7 @@ try: ret, _, err = sim.set_xdp(obj, "", force=True, fail=False, include_stderr=True) fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack_nsim(err, "program loaded with different flags.", args) + check_extack(err, "program loaded with different flags.", args) start_test("Test XDP prog remove with bad flags...") ret, _, err = sim.unset_xdp("offload", force=True, @@ -831,7 +831,7 @@ try: ret, _, err = sim.unset_xdp("", force=True, fail=False, include_stderr=True) fail(ret == 0, "Removed program with a bad mode") - check_extack_nsim(err, "program loaded with different flags.", args) + check_extack(err, "program loaded with different flags.", args) 
start_test("Test MTU restrictions...") ret, _ = sim.set_mtu(9000, fail=False) -- cgit v1.2.3 From a25717d2b604347d9af8da81deea7b08e8c94220 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:41 -0700 Subject: xdp: support simultaneous driver and hw XDP attachment Split the query of HW-attached program from the software one. Introduce new .ndo_bpf command to query HW-attached program. This will allow drivers to install different programs in HW and SW at the same time. Netlink can now also carry multiple programs on dump (in which case mode will be set to XDP_ATTACHED_MULTI and user has to check per-attachment point attributes, IFLA_XDP_PROG_ID will not be present). We reuse IFLA_XDP_PROG_ID skb space for second mode, so rtnl_xdp_size() doesn't need to be updated. Note that the installation side is still not there, since all drivers currently reject installing more than one program at the time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- .../net/ethernet/netronome/nfp/nfp_net_common.c | 6 ++ drivers/net/netdevsim/bpf.c | 6 ++ include/linux/netdevice.h | 7 +- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 45 ++++++----- net/core/rtnetlink.c | 93 ++++++++++++---------- 6 files changed, 96 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 4bb589dbffbc..bb1e72e8dbc2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3453,6 +3453,12 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) case XDP_SETUP_PROG_HW: return nfp_net_xdp_setup(nn, xdp); case XDP_QUERY_PROG: + if (nn->dp.bpf_offload_xdp) + return 0; + return xdp_attachment_query(&nn->xdp, xdp); + case XDP_QUERY_PROG_HW: + if (!nn->dp.bpf_offload_xdp) + return 0; return xdp_attachment_query(&nn->xdp, xdp); default: return 
nfp_app_bpf(nn->app, nn, xdp); diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index c485d97b5df4..5544c9b51173 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -561,6 +561,12 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) nsim_bpf_destroy_prog(bpf->offload.prog); return 0; case XDP_QUERY_PROG: + if (ns->xdp_prog_mode != XDP_ATTACHED_DRV) + return 0; + return xdp_attachment_query(&ns->xdp, bpf); + case XDP_QUERY_PROG_HW: + if (ns->xdp_prog_mode != XDP_ATTACHED_HW) + return 0; return xdp_attachment_query(&ns->xdp, bpf); case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69a664789b33..2422c0e88f5c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -820,6 +820,7 @@ enum bpf_netdev_command { XDP_SETUP_PROG, XDP_SETUP_PROG_HW, XDP_QUERY_PROG, + XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. 
*/ BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, @@ -843,7 +844,7 @@ struct netdev_bpf { struct bpf_prog *prog; struct netlink_ext_ack *extack; }; - /* XDP_QUERY_PROG */ + /* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */ struct { u32 prog_id; /* flags with which program was installed */ @@ -3533,8 +3534,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, - struct netdev_bpf *xdp); +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, + enum bpf_netdev_command cmd); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bc86c2b105ec..8759cfb8aa2e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -920,6 +920,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index 9fa3b3705a8e..993cdc3cd086 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7582,21 +7582,19 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - struct netdev_bpf *xdp) +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + enum bpf_netdev_command cmd) { - memset(xdp, 0, sizeof(*xdp)); - xdp->command = XDP_QUERY_PROG; + struct netdev_bpf xdp; - /* Query must always succeed. 
*/ - WARN_ON(bpf_op(dev, xdp) < 0); -} + if (!bpf_op) + return 0; -static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) -{ - struct netdev_bpf xdp; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = cmd; - __dev_xdp_query(dev, bpf_op, &xdp); + /* Query must always succeed. */ + WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); return xdp.prog_id; } @@ -7632,12 +7630,19 @@ static void dev_xdp_uninstall(struct net_device *dev) if (!ndo_bpf) return; - __dev_xdp_query(dev, ndo_bpf, &xdp); - if (!xdp.prog_id) - return; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + WARN_ON(ndo_bpf(dev, &xdp)); + if (xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); - /* Program removal should always succeed */ - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + /* Remove HW offload */ + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG_HW; + if (!ndo_bpf(dev, &xdp) && xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); } /** @@ -7653,12 +7658,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; + enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); + query = flags & XDP_FLAGS_HW_MODE ? 
XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + bpf_op = bpf_chk = ops->ndo_bpf; if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; @@ -7668,10 +7676,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) + if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || + __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op)) + __dev_xdp_query(dev, bpf_op, query)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 02ebc056a688..c9929ef17539 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -964,7 +964,7 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4) + /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP__PROG_ID */ return xdp_size; @@ -1354,37 +1354,57 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } -static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +static u32 rtnl_xdp_prog_skb(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; - struct netdev_bpf xdp; ASSERT_RTNL(); - *prog_id = 0; generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (generic_xdp_prog) { - *prog_id = generic_xdp_prog->aux->id; - return XDP_ATTACHED_SKB; - } - if (!ops->ndo_bpf) - return XDP_ATTACHED_NONE; + if (!generic_xdp_prog) + return 0; + return generic_xdp_prog->aux->id; +} + +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); +} + +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return 
__dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, + XDP_QUERY_PROG_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; + + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; - __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - if (!xdp.prog_id) - return XDP_ATTACHED_NONE; + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; - *prog_id = xdp.prog_id; - if (xdp.prog_flags & XDP_FLAGS_HW_MODE) - return XDP_ATTACHED_HW; - return XDP_ATTACHED_DRV; + return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - u32 prog_attr, prog_id; struct nlattr *xdp; + u32 prog_id; int err; u8 mode; @@ -1392,35 +1412,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) if (!xdp) return -EMSGSIZE; - mode = rtnl_xdp_attached_mode(dev, &prog_id); + prog_id = 0; + mode = XDP_ATTACHED_NONE; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb)) + goto err_cancel; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv)) + goto err_cancel; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw)) + goto err_cancel; + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; - if (prog_id) { + if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; - - switch (mode) { - case XDP_ATTACHED_DRV: - prog_attr = IFLA_XDP_DRV_PROG_ID; - break; - case XDP_ATTACHED_SKB: - prog_attr = IFLA_XDP_SKB_PROG_ID; - break; - case XDP_ATTACHED_HW: - prog_attr = IFLA_XDP_HW_PROG_ID; - break; - case XDP_ATTACHED_NONE: - default: - err = 
-EINVAL; - goto err_cancel; - } - - err = nla_put_u32(skb, prog_attr, prog_id); - if (err) - goto err_cancel; } nla_nest_end(skb, xdp); -- cgit v1.2.3 From c921c2077b32081617789a645120148bc8b60c98 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Jul 2018 12:16:43 +0300 Subject: net: ipmr: add support for passing full packet on wrong vif This patch adds support for IGMPMSG_WRVIFWHOLE which is used to pass full packet and real vif id when the incoming interface is wrong. While the RP and FHR are setting up state we need to be sending the registers encapsulated with all the data inside otherwise we lose it. The RP then decapsulates it and forwards it to the interested parties. Currently with WRONGVIF we can only be sending empty register packets and will lose that data. This behaviour can be enabled by using MRT_PIM with val == IGMPMSG_WRVIFWHOLE. This doesn't prevent IGMPMSG_WRONGVIF from happening, it happens in addition to it, also it is controlled by the same throttling parameters as WRONGVIF (i.e. 1 packet per 3 seconds currently). Both messages are generated to keep backwards compatibility and avoid breaking someone who was enabling MRT_PIM with val == 4, since any positive val is accepted and treated the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/mroute_base.h | 1 + include/uapi/linux/mroute.h | 2 ++ net/ipv4/ipmr.c | 21 ++++++++++++++++----- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index fd436cdd4725..6675b9f81979 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -254,6 +254,7 @@ struct mr_table { atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; + bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 10f9ff9426a2..5d37a9ccce63 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -120,6 +120,7 @@ enum { IPMRA_TABLE_MROUTE_DO_ASSERT, IPMRA_TABLE_MROUTE_DO_PIM, IPMRA_TABLE_VIFS, + IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, __IPMRA_TABLE_MAX }; #define IPMRA_TABLE_MAX (__IPMRA_TABLE_MAX - 1) @@ -173,5 +174,6 @@ enum { #define IGMPMSG_NOCACHE 1 /* Kern cache fill request to mrouted */ #define IGMPMSG_WRONGVIF 2 /* For PIM assert processing (unused) */ #define IGMPMSG_WHOLEPKT 3 /* For PIM Register processing */ +#define IGMPMSG_WRVIFWHOLE 4 /* For PIM Register and assert processing */ #endif /* _UAPI__LINUX_MROUTE_H */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 82f914122f1b..5660adcf7a04 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *skb; int ret; - if (assert == IGMPMSG_WHOLEPKT) + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); @@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt, if (!skb) return -ENOBUFS; - if (assert == IGMPMSG_WHOLEPKT) { + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. 
* And all this only to mangle msg->im_msgtype and @@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt, skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); - msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_msgtype = assert; msg->im_mbz = 0; - msg->im_vif = mrt->mroute_reg_vif_num; + if (assert == IGMPMSG_WRVIFWHOLE) + msg->im_vif = vifi; + else + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; + bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ @@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, break; } + do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; + mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: @@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + if (mrt->mroute_do_wrvifwhole) + ipmr_cache_report(mrt, skb, true_vifi, + IGMPMSG_WRVIFWHOLE); } goto dont_forward; } @@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, + mrt->mroute_do_wrvifwhole)) return false; return true; -- cgit v1.2.3 From 
01683a1469995cc7aaf833d6f8b3f1c1d2fc3b92 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 13:29:11 +0300 Subject: net: sched: refactor flower walk to iterate over idr Extend struct tcf_walker with additional 'cookie' field. It is intended to be used by classifier walk implementations to continue iteration directly from particular filter, instead of iterating 'skip' number of times. Change flower walk implementation to save filter handle in 'cookie'. Each time flower walk is called, it looks up filter with saved handle directly with idr, instead of iterating over filter linked list 'skip' number of times. This change improves complexity of dumping flower classifier from quadratic to linearithmic. (assuming idr lookup has logarithmic complexity) Reviewed-by: Jiri Pirko Signed-off-by: Vlad Buslov Reported-by: Simon Horman Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + net/sched/cls_api.c | 2 ++ net/sched/cls_flower.c | 20 +++++++++----------- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2081e4219f81..e4252a176eec 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -13,6 +13,7 @@ struct tcf_walker { int stop; int skip; int count; + unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 73d9967c3739..c51b1b12450d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1508,7 +1508,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; + arg.w.cookie = cb->args[2]; tp->ops->walk(tp, &arg.w); + cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) return false; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8b2474293db1..c53fdd411f90 100644 --- a/net/sched/cls_flower.c +++ 
b/net/sched/cls_flower.c @@ -1099,19 +1099,17 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; - struct fl_flow_mask *mask; - list_for_each_entry_rcu(mask, &head->masks, list) { - list_for_each_entry_rcu(f, &mask->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - break; - } -skip: - arg->count++; + arg->count = arg->skip; + + while ((f = idr_get_next_ul(&head->handle_idr, + &arg->cookie)) != NULL) { + if (arg->fn(tp, f, arg) < 0) { + arg->stop = 1; + break; } + arg->cookie = f->handle + 1; + arg->count++; } } -- cgit v1.2.3 From ff0432e5a8025df895813408325b2afdfa21f946 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 14 Jul 2018 16:36:29 +0800 Subject: tcp: remove redundant rcv_nxt update tcp_rcv_nxt_update() is already executed in tcp_data_queue(). This line is redundant. See below, tcp_queue_rcv tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); <<<< redundant Signed-off-by: Yafang Shao Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d3b6390ecf23..fac5d03d4528 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4695,7 +4695,6 @@ queue_and_out: } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) -- cgit v1.2.3 From f333ee0cdb27ba201e6cc0c99c76b1364aa29b86 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 11 Jul 2018 17:33:32 -0700 Subject: bpf: Add BPF_SOCK_OPS_TCP_LISTEN_CB Add new TCP-BPF callback that is called on listen(2) right after socket transition to TCP_LISTEN state. It fills the gap for listening sockets in TCP-BPF.
For example BPF program can set BPF_SOCK_OPS_STATE_CB_FLAG when socket becomes listening and track later transition from TCP_LISTEN to TCP_CLOSE with BPF_SOCK_OPS_STATE_CB callback. Before there was no way to do it with TCP-BPF and other options were much harder to work with. E.g. socket state tracking can be done with tracepoints (either raw or regular) but they can't be attached to cgroup and their lifetime has to be managed separately. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +++ net/ipv4/af_inet.c | 1 + 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6bcb287a888d..870113916cac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2555,6 +2555,9 @@ enum { * Arg1: old_state * Arg2: new_state */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c716be13d58c..f2a0a3bab6b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; -- cgit v1.2.3 From 784abe24c903b093af04cf1a043140faa556cad7 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:35 +0300 Subject: net: Add decrypted field to skb The decrypted bit is propagated to cloned/copied skbs. This will be used later by the inline crypto receive side offload of tls. Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 7 ++++++- net/core/skbuff.c | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7601838c2513..3ceb8dcc54da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @decrypted: Decrypted SKB * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -736,7 +737,11 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, - __unused:1; /* one bit hole */ +#ifdef CONFIG_TLS_DEVICE + decrypted:1; +#else + __unused:1; +#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c4e24ac27464..cfd6c6f35f9c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -805,6 +805,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; +#ifdef CONFIG_TLS_DEVICE + new->decrypted = old->decrypted; +#endif memcpy(&new->headers_start, &old->headers_start, offsetof(struct sk_buff, headers_end) - @@ -865,6 +868,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(head_frag); C(data); C(truesize); +#ifdef CONFIG_TLS_DEVICE + C(decrypted); +#endif refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); -- cgit v1.2.3 From 14136564c8ee94566945e85014019cbdb1716dca Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Fri, 13 Jul 2018 14:33:36 +0300 Subject: net: Add TLS RX offload feature This patch adds a netdev feature to configure TLS RX inline crypto 
offload. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ net/core/ethtool.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 623bb8ced060..2b2a6dce1630 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -79,6 +79,7 @@ enum { NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ NETIF_F_HW_TLS_TX_BIT, /* Hardware TLS TX offload */ + NETIF_F_HW_TLS_RX_BIT, /* Hardware TLS RX offload */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ @@ -151,6 +152,7 @@ enum { #define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) +#define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e677a20180cf..c9993c6c2fd4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", }; static const char -- cgit v1.2.3 From 41ed9c04aac2f8c6ee922e29ce5e69f185c5125b Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:38 +0300 Subject: tcp: Don't coalesce decrypted and encrypted SKBs Prevent coalescing of decrypted and encrypted SKBs in GRO and TCP layer. Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 12 ++++++++++++ net/ipv4/tcp_offload.c | 3 +++ 2 files changed, 15 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fac5d03d4528..91dbb9afb950 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4343,6 +4343,11 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; +#ifdef CONFIG_TLS_DEVICE + if (from->decrypted != to->decrypted) + return false; +#endif + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -4871,6 +4876,9 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); +#ifdef CONFIG_TLS_DEVICE + nskb->decrypted = skb->decrypted; +#endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -4898,6 +4906,10 @@ restart: skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; +#ifdef CONFIG_TLS_DEVICE + if (skb->decrypted != nskb->decrypted) + goto end; +#endif } } } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index f5aee641f825..870b0a335061 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -262,6 +262,9 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); +#ifdef CONFIG_TLS_DEVICE + flush |= p->decrypted ^ skb->decrypted; +#endif if (flush || skb_gro_receive(p, skb)) { mss = 1; -- cgit v1.2.3 From d80a1b9d186057ddb0d384ba601cf2b7d214539c Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:39 +0300 Subject: tls: Refactor tls_offload variable names For symmetry, we rename tls_offload_context to tls_offload_context_tx before we add tls_offload_context_rx. Signed-off-by: Boris Pismenny Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h | 6 +++--- include/net/tls.h | 16 +++++++------- net/tls/tls_device.c | 25 +++++++++++----------- net/tls/tls_device_fallback.c | 8 +++---- 4 files changed, 27 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h index b6162178f621..b82f4deaa398 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h @@ -50,7 +50,7 @@ struct mlx5e_tls { }; struct mlx5e_tls_offload_context { - struct tls_offload_context base; + struct tls_offload_context_tx base; u32 expected_seq; __be32 swid; }; @@ -59,8 +59,8 @@ static inline struct mlx5e_tls_offload_context * mlx5e_get_tls_tx_context(struct tls_context *tls_ctx) { BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) > - TLS_OFFLOAD_CONTEXT_SIZE); - return container_of(tls_offload_ctx(tls_ctx), + TLS_OFFLOAD_CONTEXT_SIZE_TX); + return container_of(tls_offload_ctx_tx(tls_ctx), struct mlx5e_tls_offload_context, base); } diff --git a/include/net/tls.h b/include/net/tls.h index 70c273777fe9..5dcd808236a7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -128,7 +128,7 @@ struct tls_record_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; -struct tls_offload_context { +struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; @@ -147,8 +147,8 @@ struct tls_offload_context { #define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) }; -#define TLS_OFFLOAD_CONTEXT_SIZE \ - (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ +#define TLS_OFFLOAD_CONTEXT_SIZE_TX \ + (ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) enum { @@ -239,7 +239,7 @@ void tls_device_sk_destruct(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); 
-struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) @@ -380,10 +380,10 @@ static inline struct tls_sw_context_tx *tls_sw_ctx_tx( return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } -static inline struct tls_offload_context *tls_offload_ctx( - const struct tls_context *tls_ctx) +static inline struct tls_offload_context_tx * +tls_offload_ctx_tx(const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; + return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, @@ -396,7 +396,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct sk_buff *skb); int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7a8f8e20ff3..332a5d1459b6 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,9 +52,8 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + kfree(tls_offload_ctx_tx(ctx)); - kfree(offload_ctx); kfree(ctx); } @@ -125,7 +124,7 @@ static void destroy_record(struct tls_record_info *record) kfree(record); } -static void delete_all_records(struct tls_offload_context *offload_ctx) +static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; @@ -141,14 +140,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; - struct tls_offload_context *ctx; + struct 
tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; - ctx = tls_offload_ctx(tls_ctx); + ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; @@ -179,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); if (ctx->open_record) destroy_record(ctx->open_record); @@ -219,7 +218,7 @@ static void tls_append_frag(struct tls_record_info *record, static int tls_push_record(struct sock *sk, struct tls_context *ctx, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, struct page_frag *pfrag, int flags, @@ -264,7 +263,7 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_create_new_record(struct tls_offload_context *offload_ctx, +static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -290,7 +289,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx, } static int tls_do_allocation(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -324,7 +323,7 @@ static int tls_push_data(struct sock *sk, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; @@ -477,7 +476,7 @@ out: return rc; } -struct 
tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; @@ -524,7 +523,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; struct tls_record_info *start_marker_record; - struct tls_offload_context *offload_ctx; + struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; struct net_device *netdev; char *iv, *rec_seq; @@ -546,7 +545,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto out; } - offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 748914abdb60..d1d7dce38e0b 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) static int fill_sg_in(struct scatterlist *sg_in, struct sk_buff *skb, - struct tls_offload_context *ctx, + struct tls_offload_context_tx *ctx, u64 *rcd_sn, s32 *sync_size, int *resync_sgs) @@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, s32 sync_size, u64 rcd_sn) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; void *buf, *iv, *aad, *dummy_buf; struct aead_request *aead_req; @@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); struct tls_context *tls_ctx = tls_get_ctx(sk); - struct 
tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; struct scatterlist *sg_in, sg_out[3]; struct sk_buff *nskb = NULL; @@ -415,7 +415,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, } int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { const u8 *key; -- cgit v1.2.3 From dafb67f3bb4a58a45fe92c1e362ea6429831688a Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:40 +0300 Subject: tls: Split decrypt_skb to two functions Previously, decrypt_skb also updated the TLS context. Now, decrypt_skb only decrypts the payload using the current context, while decrypt_skb_update also updates the state. Later, in the tls_device Rx flow, we will use decrypt_skb directly. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ net/tls/tls_sw.c | 44 ++++++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 5dcd808236a7..49b89221db43 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -390,6 +390,8 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout); struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7453f5ae0819..1d2271736717 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -53,7 +53,6 @@ static int tls_do_decryption(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct 
strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; int ret; @@ -71,18 +70,6 @@ static int tls_do_decryption(struct sock *sk, ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - if (ret < 0) - goto out; - - rxm->offset += tls_ctx->rx.prepend_size; - rxm->full_len -= tls_ctx->rx.overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx); - - ctx->decrypted = true; - - ctx->saved_data_ready(sk); - -out: aead_request_free(aead_req); return ret; } @@ -666,8 +653,29 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } -static int decrypt_skb(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) +static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int err = 0; + + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + ctx->decrypted = true; + ctx->saved_data_ready(sk); + + return err; +} + +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -812,7 +820,7 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -821,7 +829,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -892,7 +900,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = 
decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL); if (err < 0) { tls_err_abort(sk, EBADMSG); -- cgit v1.2.3 From 39f56e1a78d647316db330c3b6f4c5637a895e3b Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:41 +0300 Subject: tls: Split tls_sw_release_resources_rx This patch splits tls_sw_release_resources_rx into two functions: one which releases all inner software tls structures and another that also frees the containing structure. In TLS_DEVICE we will need to release the software structures without freeing the containing structure, which contains other information. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 1 + net/tls/tls_sw.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 49b89221db43..7a485de25646 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -223,6 +223,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, void tls_sw_close(struct sock *sk, long timeout); void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); +void tls_sw_release_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1d2271736717..694d26589dcc 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1030,7 +1030,7 @@ void tls_sw_free_resources_tx(struct sock *sk) kfree(ctx); } -void tls_sw_free_resources_rx(struct sock *sk) +void tls_sw_release_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1049,6 +1049,14 @@ void tls_sw_free_resources_rx(struct sock *sk) strp_done(&ctx->strp); lock_sock(sk); } +} + +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct
tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + + tls_sw_release_resources_rx(sk); kfree(ctx); } -- cgit v1.2.3 From b190a587c634a8559e4ceabeb0468e93db49789a Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:42 +0300 Subject: tls: Fill software context without allocation This patch allows tls_set_sw_offload to fill the context in case it was already allocated previously. We will use it in TLS_DEVICE to fill the RX software context. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 694d26589dcc..5f7d70b24be6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1081,28 +1081,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (tx) { - sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); - if (!sw_ctx_tx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_tx) { + sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + if (!sw_ctx_tx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_tx = sw_ctx_tx; + } else { + sw_ctx_tx = + (struct tls_sw_context_tx *)ctx->priv_ctx_tx; } - crypto_init_wait(&sw_ctx_tx->async_wait); - ctx->priv_ctx_tx = sw_ctx_tx; } else { - sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); - if (!sw_ctx_rx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_rx) { + sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + if (!sw_ctx_rx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_rx = sw_ctx_rx; + } else { + sw_ctx_rx = + (struct tls_sw_context_rx *)ctx->priv_ctx_rx; } - crypto_init_wait(&sw_ctx_rx->async_wait); - ctx->priv_ctx_rx = sw_ctx_rx; } if (tx) { + crypto_init_wait(&sw_ctx_tx->async_wait); crypto_info = &ctx->crypto_send; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { + crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = 
&ctx->crypto_recv; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; -- cgit v1.2.3 From 4799ac81e52a72a6404827bf2738337bb581a174 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:43 +0300 Subject: tls: Add rx inline crypto offload This patch completes the generic infrastructure to offload TLS crypto to a network device. It enables the kernel to skip decryption and authentication of some skbs marked as decrypted by the NIC. In the fast path, all packets received are decrypted by the NIC and the performance is comparable to plain TCP. This infrastructure doesn't require a TCP offload engine. Instead, the NIC only decrypts packets that contain the expected TCP sequence number. Out-Of-Order TCP packets are provided unmodified. As a result, at the worst case a received TLS record consists of both plaintext and ciphertext packets. These partially decrypted records must be reencrypted, only to be decrypted. The notable differences between SW KTLS Rx and this offload are as follows: 1. Partial decryption - Software must handle the case of a TLS record that was only partially decrypted by HW. This can happen due to packet reordering. 2. Resynchronization - tls_read_size calls the device driver to resynchronize HW after HW lost track of TLS record framing in the TCP stream. Signed-off-by: Boris Pismenny Signed-off-by: David S. 
Miller --- include/net/tls.h | 63 +++++++++- net/tls/tls_device.c | 278 ++++++++++++++++++++++++++++++++++++++---- net/tls/tls_device_fallback.c | 1 + net/tls/tls_main.c | 32 +++-- net/tls/tls_sw.c | 24 +++- 5 files changed, 355 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 7a485de25646..d8b3b6578c01 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,16 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; +enum { + TLS_BASE, + TLS_SW, +#ifdef CONFIG_TLS_DEVICE + TLS_HW, +#endif + TLS_HW_RECORD, + TLS_NUM_CONFIG, +}; + struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; @@ -197,6 +207,7 @@ struct tls_context { int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); + void (*sk_destruct)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); int (*setsockopt)(struct sock *sk, int level, @@ -209,13 +220,27 @@ struct tls_context { void (*unhash)(struct sock *sk); }; +struct tls_offload_context_rx { + /* sw must be the first member of tls_offload_context_rx */ + struct tls_sw_context_rx sw; + atomic64_t resync_req; + u8 driver_state[]; + /* The TLS layer reserves room for driver specific state + * Currently the belief is that there is not enough + * driver specific state to justify another layer of indirection + */ +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE_RX \ + (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); - int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, 
struct page *page, @@ -290,11 +315,19 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +struct sk_buff * +tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) { - return sk_fullsock(sk) && - /* matches smp_store_release in tls_set_device_offload */ - smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + return sk_fullsock(sk) & + (smp_load_acquire(&sk->sk_validate_xmit_skb) == + &tls_validate_xmit_skb); +#else + return false; +#endif } static inline void tls_err_abort(struct sock *sk, int err) @@ -387,10 +420,27 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline struct tls_offload_context_rx * +tls_offload_ctx_rx(const struct tls_context *tls_ctx) +{ + return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; +} + +/* The TLS context is valid until sk_destruct is called */ +static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); +} + + int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); @@ -402,4 +452,9 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); + +void tls_device_offload_cleanup_rx(struct sock *sk); +void 
handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn); + #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 332a5d1459b6..4995d84d228d 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,7 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - kfree(tls_offload_ctx_tx(ctx)); + if (ctx->tx_conf == TLS_HW) + kfree(tls_offload_ctx_tx(ctx)); + + if (ctx->rx_conf == TLS_HW) + kfree(tls_offload_ctx_rx(ctx)); kfree(ctx); } @@ -70,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work) list_for_each_entry_safe(ctx, tmp, &gc_list, list) { struct net_device *netdev = ctx->netdev; - if (netdev) { + if (netdev && ctx->tx_conf == TLS_HW) { netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); + ctx->netdev = NULL; } list_del(&ctx->list); @@ -81,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work) } } +static void tls_device_attach(struct tls_context *ctx, struct sock *sk, + struct net_device *netdev) +{ + if (sk->sk_destruct != tls_device_sk_destruct) { + refcount_set(&ctx->refcount, 1); + dev_hold(netdev); + ctx->netdev = netdev; + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_device_sk_destruct; + } +} + static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { unsigned long flags; @@ -180,13 +201,15 @@ void tls_device_sk_destruct(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - if (ctx->open_record) - destroy_record(ctx->open_record); + tls_ctx->sk_destruct(sk); - delete_all_records(ctx); - crypto_free_aead(ctx->aead_send); - ctx->sk_destruct(sk); - clean_acked_data_disable(inet_csk(sk)); + if (tls_ctx->tx_conf == TLS_HW) { + if (ctx->open_record) + 
destroy_record(ctx->open_record); + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + clean_acked_data_disable(inet_csk(sk)); + } if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); @@ -519,6 +542,118 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev = tls_ctx->netdev; + struct tls_offload_context_rx *rx_ctx; + u32 is_req_pending; + s64 resync_req; + u32 req_seq; + + if (tls_ctx->rx_conf != TLS_HW) + return; + + rx_ctx = tls_offload_ctx_rx(tls_ctx); + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); + is_req_pending = resync_req; + + if (unlikely(is_req_pending) && req_seq == seq && + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, + seq + TLS_HEADER_SIZE - 1, + rcd_sn); +} + +static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + int err = 0, offset = rxm->offset, copy, nsg; + struct sk_buff *skb_iter, *unused; + struct scatterlist sg[1]; + char *orig_buf, *buf; + + orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation); + if (!orig_buf) + return -ENOMEM; + buf = orig_buf; + + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + err = nsg; + goto free_buf; + } + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], buf, + rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE); + skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + + /* We are interested only in the decrypted data not the auth */ + err = decrypt_skb(sk, skb, sg); + if (err != -EBADMSG) + goto free_buf; + else + err = 0; + + copy 
= min_t(int, skb_pagelen(skb) - offset, + rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); + + offset += copy; + buf += copy; + + skb_walk_frags(skb, skb_iter) { + copy = min_t(int, skb_iter->len, + rxm->full_len - offset + rxm->offset - + TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb_iter->decrypted) + skb_store_bits(skb, offset, buf, copy); + + offset += copy; + buf += copy; + } + +free_buf: + kfree(orig_buf); + return err; +} + +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); + int is_decrypted = skb->decrypted; + int is_encrypted = !is_decrypted; + struct sk_buff *skb_iter; + + /* Skip if it is already decrypted */ + if (ctx->sw.decrypted) + return 0; + + /* Check if all the data is decrypted already */ + skb_walk_frags(skb, skb_iter) { + is_decrypted &= skb_iter->decrypted; + is_encrypted &= !skb_iter->decrypted; + } + + ctx->sw.decrypted |= is_decrypted; + + /* Return immedeatly if the record is either entirely plaintext or + * entirely ciphertext. Otherwise handle reencrypt partially decrypted + * record. + */ + return (is_encrypted || is_decrypted) ? 0 : + tls_device_reencrypt(sk, skb); +} + int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; @@ -608,7 +743,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; - offload_ctx->sk_destruct = sk->sk_destruct; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. 
@@ -618,8 +752,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (skb) TCP_SKB_CB(skb)->eor = 1; - refcount_set(&ctx->refcount, 1); - /* We support starting offload on multiple sockets * concurrently, so we only need a read lock here. * This lock must precede get_netdev_for_sock to prevent races between @@ -654,19 +786,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (rc) goto release_netdev; - ctx->netdev = netdev; + tls_device_attach(ctx, sk, netdev); - spin_lock_irq(&tls_device_lock); - list_add_tail(&ctx->list, &tls_device_list); - spin_unlock_irq(&tls_device_lock); - - sk->sk_validate_xmit_skb = tls_validate_xmit_skb; /* following this assignment tls_is_sk_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ - smp_store_release(&sk->sk_destruct, - &tls_device_sk_destruct); + smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); + dev_put(netdev); up_read(&device_offload_lock); goto out; @@ -689,6 +816,105 @@ out: return rc; } +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) +{ + struct tls_offload_context_rx *context; + struct net_device *netdev; + int rc = 0; + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. 
+ */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: netdev %s with no TLS offload\n", + __func__, netdev->name); + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); + if (!context) { + rc = -ENOMEM; + goto release_netdev; + } + + ctx->priv_ctx_rx = context; + rc = tls_set_sw_offload(sk, ctx, 0); + if (rc) + goto release_ctx; + + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, + &ctx->crypto_recv, + tcp_sk(sk)->copied_seq); + if (rc) { + pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", + __func__); + goto free_sw_resources; + } + + tls_device_attach(ctx, sk, netdev); + goto release_netdev; + +free_sw_resources: + tls_sw_free_resources_rx(sk); +release_ctx: + ctx->priv_ctx_rx = NULL; +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + return rc; +} + +void tls_device_offload_cleanup_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (!netdev) + goto out; + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", + __func__); + goto out; + } + + netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, + TLS_OFFLOAD_CTX_DIR_RX); + + if (tls_ctx->tx_conf != TLS_HW) { + dev_put(netdev); + tls_ctx->netdev = NULL; + } +out: + up_read(&device_offload_lock); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + tls_sw_release_resources_rx(sk); +} + 
static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; @@ -709,8 +935,12 @@ static int tls_device_down(struct net_device *netdev) spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { - netdev->tlsdev_ops->tls_dev_del(netdev, ctx, - TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->tx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->rx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_RX); ctx->netdev = NULL; dev_put(netdev); list_del_init(&ctx->list); @@ -731,12 +961,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & NETIF_F_HW_TLS_TX)) + if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: + if ((dev->features & NETIF_F_HW_TLS_RX) && + !dev->tlsdev_ops->tls_dev_resync_rx) + return NOTIFY_BAD; + if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index d1d7dce38e0b..e3313c45663f 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -413,6 +413,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, return tls_sw_fallback(sk, skb); } +EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 301f22430469..b09867c8b817 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,15 +51,6 @@ enum { TLSV6, TLS_NUM_PROTS, }; -enum { - TLS_BASE, - TLS_SW, -#ifdef CONFIG_TLS_DEVICE - TLS_HW, -#endif - TLS_HW_RECORD, - TLS_NUM_CONFIG, -}; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); @@ -290,7 +281,10 @@ static void 
tls_sk_proto_close(struct sock *sk, long timeout) } #ifdef CONFIG_TLS_DEVICE - if (ctx->tx_conf != TLS_HW) { + if (ctx->rx_conf == TLS_HW) + tls_device_offload_cleanup_rx(sk); + + if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) { #else { #endif @@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, conf = TLS_SW; } } else { - rc = tls_set_sw_offload(sk, ctx, 0); - conf = TLS_SW; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload_rx(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 0); + conf = TLS_SW; + } } if (rc) @@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; + + prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; + + prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; + + prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5f7d70b24be6..fe5735c57774 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -654,16 +654,25 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, } static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) + struct scatterlist *sgout, bool *zc) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); int err = 0; - err = decrypt_skb(sk, skb, sgout); +#ifdef CONFIG_TLS_DEVICE + err = tls_device_decrypted(sk, skb); if (err < 0) return err; +#endif + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + } else { + *zc = false; + } rxm->offset += tls_ctx->rx.prepend_size; rxm->full_len -= tls_ctx->rx.overhead_size; @@ -820,7 +829,7 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto 
fallback_to_reg_recv; - err = decrypt_skb_update(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin, &zc); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -829,7 +838,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb_update(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -884,6 +893,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; + bool zc; lock_sock(sk); @@ -900,7 +910,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); @@ -989,6 +999,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } +#ifdef CONFIG_TLS_DEVICE + handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, + *(u64*)tls_ctx->rx.rec_seq); +#endif return data_len + TLS_HEADER_SIZE; read_failure: -- cgit v1.2.3 From 4718799817c5a30ae723eda21f3a6c7d8701b1a4 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:44 +0300 Subject: tls: Fix zerocopy_from_iter iov handling zerocopy_from_iter iterates over the message, but it doesn't revert the updates made by the iov iteration. This patch fixes it. Now, the iov can be used after calling zerocopy_from_iter. Fixes: 3c4d75591 ("tls: kernel TLS support") Signed-off-by: Boris Pismenny Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fe5735c57774..7d194c0cd6cf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -263,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge) + bool charge, bool revert) { struct page *pages[MAX_SKB_FRAGS]; @@ -314,6 +314,8 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, out: *size_used = size; *pages_used = num_elem; + if (revert) + iov_iter_revert(from, size); return rc; } @@ -415,7 +417,7 @@ alloc_encrypted: &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true); + true, false); if (ret) goto fallback_to_reg_send; @@ -825,7 +827,7 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], - MAX_SKB_FRAGS, false); + MAX_SKB_FRAGS, false, true); if (err < 0) goto fallback_to_reg_recv; -- cgit v1.2.3 From 4e1a720d0312fd510699032c7694a362a010170f Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Sun, 15 Jul 2018 20:36:50 +0100 Subject: Bluetooth: avoid killing an already killed socket slub debug reported: [ 440.648642] ============================================================================= [ 440.648649] BUG kmalloc-1024 (Tainted: G BU O ): Poison overwritten [ 440.648651] ----------------------------------------------------------------------------- [ 440.648655] INFO: 0xe70f4bec-0xe70f4bec. 
First byte 0x6a instead of 0x6b [ 440.648665] INFO: Allocated in sk_prot_alloc+0x6b/0xc6 age=33155 cpu=1 pid=1047 [ 440.648671] ___slab_alloc.constprop.24+0x1fc/0x292 [ 440.648675] __slab_alloc.isra.18.constprop.23+0x1c/0x25 [ 440.648677] __kmalloc+0xb6/0x17f [ 440.648680] sk_prot_alloc+0x6b/0xc6 [ 440.648683] sk_alloc+0x1e/0xa1 [ 440.648700] sco_sock_alloc.constprop.6+0x26/0xaf [bluetooth] [ 440.648716] sco_connect_cfm+0x166/0x281 [bluetooth] [ 440.648731] hci_conn_request_evt.isra.53+0x258/0x281 [bluetooth] [ 440.648746] hci_event_packet+0x28b/0x2326 [bluetooth] [ 440.648759] hci_rx_work+0x161/0x291 [bluetooth] [ 440.648764] process_one_work+0x163/0x2b2 [ 440.648767] worker_thread+0x1a9/0x25c [ 440.648770] kthread+0xf8/0xfd [ 440.648774] ret_from_fork+0x2e/0x38 [ 440.648779] INFO: Freed in __sk_destruct+0xd3/0xdf age=3815 cpu=1 pid=1047 [ 440.648782] __slab_free+0x4b/0x27a [ 440.648784] kfree+0x12e/0x155 [ 440.648787] __sk_destruct+0xd3/0xdf [ 440.648790] sk_destruct+0x27/0x29 [ 440.648793] __sk_free+0x75/0x91 [ 440.648795] sk_free+0x1c/0x1e [ 440.648810] sco_sock_kill+0x5a/0x5f [bluetooth] [ 440.648825] sco_conn_del+0x8e/0xba [bluetooth] [ 440.648840] sco_disconn_cfm+0x3a/0x41 [bluetooth] [ 440.648855] hci_event_packet+0x45e/0x2326 [bluetooth] [ 440.648868] hci_rx_work+0x161/0x291 [bluetooth] [ 440.648872] process_one_work+0x163/0x2b2 [ 440.648875] worker_thread+0x1a9/0x25c [ 440.648877] kthread+0xf8/0xfd [ 440.648880] ret_from_fork+0x2e/0x38 [ 440.648884] INFO: Slab 0xf4718580 objects=27 used=27 fp=0x (null) flags=0x40008100 [ 440.648886] INFO: Object 0xe70f4b88 @offset=19336 fp=0xe70f54f8 When KASAN was enabled, it reported: [ 210.096613] ================================================================== [ 210.096634] BUG: KASAN: use-after-free in ex_handler_refcount+0x5b/0x127 [ 210.096641] Write of size 4 at addr ffff880107e17160 by task kworker/u9:1/2040 [ 210.096651] CPU: 1 PID: 2040 Comm: kworker/u9:1 Tainted: G U O 4.14.47-20180606+ #2 [ 210.096654] 
Hardware name: , BIOS 2017.01-00087-g43e04de 08/30/2017 [ 210.096693] Workqueue: hci0 hci_rx_work [bluetooth] [ 210.096698] Call Trace: [ 210.096711] dump_stack+0x46/0x59 [ 210.096722] print_address_description+0x6b/0x23b [ 210.096729] ? ex_handler_refcount+0x5b/0x127 [ 210.096736] kasan_report+0x220/0x246 [ 210.096744] ex_handler_refcount+0x5b/0x127 [ 210.096751] ? ex_handler_clear_fs+0x85/0x85 [ 210.096757] fixup_exception+0x8c/0x96 [ 210.096766] do_trap+0x66/0x2c1 [ 210.096773] do_error_trap+0x152/0x180 [ 210.096781] ? fixup_bug+0x78/0x78 [ 210.096817] ? hci_debugfs_create_conn+0x244/0x26a [bluetooth] [ 210.096824] ? __schedule+0x113b/0x1453 [ 210.096830] ? sysctl_net_exit+0xe/0xe [ 210.096837] ? __wake_up_common+0x343/0x343 [ 210.096843] ? insert_work+0x107/0x163 [ 210.096850] invalid_op+0x1b/0x40 [ 210.096888] RIP: 0010:hci_debugfs_create_conn+0x244/0x26a [bluetooth] [ 210.096892] RSP: 0018:ffff880094a0f970 EFLAGS: 00010296 [ 210.096898] RAX: 0000000000000000 RBX: ffff880107e170e8 RCX: ffff880107e17160 [ 210.096902] RDX: 000000000000002f RSI: ffff88013b80ed40 RDI: ffffffffa058b940 [ 210.096906] RBP: ffff88011b2b0578 R08: 00000000852f0ec9 R09: ffffffff81cfcf9b [ 210.096909] R10: 00000000d21bdad7 R11: 0000000000000001 R12: ffff8800967b0488 [ 210.096913] R13: ffff880107e17168 R14: 0000000000000068 R15: ffff8800949c0008 [ 210.096920] ? __sk_destruct+0x2c6/0x2d4 [ 210.096959] hci_event_packet+0xff5/0x7de2 [bluetooth] [ 210.096969] ? __local_bh_enable_ip+0x43/0x5b [ 210.097004] ? l2cap_sock_recv_cb+0x158/0x166 [bluetooth] [ 210.097039] ? hci_le_meta_evt+0x2bb3/0x2bb3 [bluetooth] [ 210.097075] ? l2cap_ertm_init+0x94e/0x94e [bluetooth] [ 210.097093] ? xhci_urb_enqueue+0xbd8/0xcf5 [xhci_hcd] [ 210.097102] ? __accumulate_pelt_segments+0x24/0x33 [ 210.097109] ? __accumulate_pelt_segments+0x24/0x33 [ 210.097115] ? __update_load_avg_se.isra.2+0x217/0x3a4 [ 210.097122] ? set_next_entity+0x7c3/0x12cd [ 210.097128] ? pick_next_entity+0x25e/0x26c [ 210.097135] ? 
pick_next_task_fair+0x2ca/0xc1a [ 210.097141] ? switch_mm_irqs_off+0x346/0xb4f [ 210.097147] ? __switch_to+0x769/0xbc4 [ 210.097153] ? compat_start_thread+0x66/0x66 [ 210.097188] ? hci_conn_check_link_mode+0x1cd/0x1cd [bluetooth] [ 210.097195] ? finish_task_switch+0x392/0x431 [ 210.097228] ? hci_rx_work+0x154/0x487 [bluetooth] [ 210.097260] hci_rx_work+0x154/0x487 [bluetooth] [ 210.097269] process_one_work+0x579/0x9e9 [ 210.097277] worker_thread+0x68f/0x804 [ 210.097285] kthread+0x31c/0x32b [ 210.097292] ? rescuer_thread+0x70c/0x70c [ 210.097299] ? kthread_create_on_node+0xa3/0xa3 [ 210.097306] ret_from_fork+0x35/0x40 [ 210.097314] Allocated by task 2040: [ 210.097323] kasan_kmalloc.part.1+0x51/0xc7 [ 210.097328] __kmalloc+0x17f/0x1b6 [ 210.097335] sk_prot_alloc+0xf2/0x1a3 [ 210.097340] sk_alloc+0x22/0x297 [ 210.097375] sco_sock_alloc.constprop.7+0x23/0x202 [bluetooth] [ 210.097410] sco_connect_cfm+0x2d0/0x566 [bluetooth] [ 210.097443] hci_conn_request_evt.isra.53+0x6d3/0x762 [bluetooth] [ 210.097476] hci_event_packet+0x85e/0x7de2 [bluetooth] [ 210.097507] hci_rx_work+0x154/0x487 [bluetooth] [ 210.097512] process_one_work+0x579/0x9e9 [ 210.097517] worker_thread+0x68f/0x804 [ 210.097523] kthread+0x31c/0x32b [ 210.097529] ret_from_fork+0x35/0x40 [ 210.097533] Freed by task 2040: [ 210.097539] kasan_slab_free+0xb3/0x15e [ 210.097544] kfree+0x103/0x1a9 [ 210.097549] __sk_destruct+0x2c6/0x2d4 [ 210.097584] sco_conn_del.isra.1+0xba/0x10e [bluetooth] [ 210.097617] hci_event_packet+0xff5/0x7de2 [bluetooth] [ 210.097648] hci_rx_work+0x154/0x487 [bluetooth] [ 210.097653] process_one_work+0x579/0x9e9 [ 210.097658] worker_thread+0x68f/0x804 [ 210.097663] kthread+0x31c/0x32b [ 210.097670] ret_from_fork+0x35/0x40 [ 210.097676] The buggy address belongs to the object at ffff880107e170e8 which belongs to the cache kmalloc-1024 of size 1024 [ 210.097681] The buggy address is located 120 bytes inside of 1024-byte region [ffff880107e170e8, ffff880107e174e8) [ 210.097683] The buggy 
address belongs to the page: [ 210.097689] page:ffffea00041f8400 count:1 mapcount:0 mapping: (null) index:0xffff880107e15b68 compound_mapcount: 0 [ 210.110194] flags: 0x8000000000008100(slab|head) [ 210.115441] raw: 8000000000008100 0000000000000000 ffff880107e15b68 0000000100170016 [ 210.115448] raw: ffffea0004a47620 ffffea0004b48e20 ffff88013b80ed40 0000000000000000 [ 210.115451] page dumped because: kasan: bad access detected [ 210.115454] Memory state around the buggy address: [ 210.115460] ffff880107e17000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 210.115465] ffff880107e17080: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb [ 210.115469] >ffff880107e17100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 210.115472] ^ [ 210.115477] ffff880107e17180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 210.115481] ffff880107e17200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 210.115483] ================================================================== And finally when BT_DBG() and ftrace was enabled it showed: <...>-14979 [001] .... 186.104191: sco_sock_kill <-sco_sock_close <...>-14979 [001] .... 186.104191: sco_sock_kill <-sco_sock_release <...>-14979 [001] .... 186.104192: sco_sock_kill: sk ef0497a0 state 9 <...>-14979 [001] .... 186.104193: bt_sock_unlink <-sco_sock_kill kworker/u9:2-792 [001] .... 186.104246: sco_sock_kill <-sco_conn_del kworker/u9:2-792 [001] .... 186.104248: sco_sock_kill: sk ef0497a0 state 9 kworker/u9:2-792 [001] .... 186.104249: bt_sock_unlink <-sco_sock_kill kworker/u9:2-792 [001] .... 186.104250: sco_sock_destruct <-__sk_destruct kworker/u9:2-792 [001] .... 186.104250: sco_sock_destruct: sk ef0497a0 kworker/u9:2-792 [001] .... 186.104860: hci_conn_del <-hci_event_packet kworker/u9:2-792 [001] .... 186.104864: hci_conn_del: hci0 hcon ef0484c0 handle 266 Only in the failed case, sco_sock_kill() gets called with the same sock pointer two times. 
Add a check for SOCK_DEAD to avoid continuing to kill a socket which has already been killed. Signed-off-by: Sudip Mukherjee Signed-off-by: Marcel Holtmann --- net/bluetooth/sco.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 413b8ee49fec..8f0f9279eac9 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -393,7 +393,8 @@ static void sco_sock_cleanup_listen(struct sock *parent) */ static void sco_sock_kill(struct sock *sk) { - if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || + sock_flag(sk, SOCK_DEAD)) return; BT_DBG("sk %p state %d", sk, sk->sk_state); -- cgit v1.2.3 From 6542df2f8412a8a065e987aac940130884028715 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 12 Jun 2018 01:54:47 +0900 Subject: netfilter: nft_reject_bridge: remove unnecessary ttl set In nft_reject_br_send_v4_tcp_reset(), the ttl is already set by nf_reject_iphdr_put(), so the code below is unnecessary.
Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 6de981270566..08cbed7d940e 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -89,8 +89,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, net->ipv4.sysctl_ip_default_ttl); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->ttl = net->ipv4.sysctl_ip_default_ttl; - niph->tot_len = htons(nskb->len); + niph->tot_len = htons(nskb->len); ip_send_check(niph); nft_reject_br_push_etherhdr(oldskb, nskb); -- cgit v1.2.3 From e97d9404d5e8aea1f91f4c00dbe7854008f3a1e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 15 Jun 2018 23:46:42 +0200 Subject: netfilter: flowtables: use fixed renew timeout on teardown This is one of the very few external callers of ->get_timeouts(). We can use a fixed timeout instead; the conntrack core will refresh it in case a new packet arrives within this period. Using the ESTABLISHED timeout seems way too large anyway.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index eb0d1658ac05..d8125616edc7 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -107,11 +107,12 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) tcp->seen[1].td_maxwin = 0; } +#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) +#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) + static void flow_offload_fixup_ct_state(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; - struct net *net = nf_ct_net(ct); - unsigned int *timeouts; unsigned int timeout; int l4num; @@ -123,14 +124,10 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct) if (!l4proto) return; - timeouts = l4proto->get_timeouts(net); - if (!timeouts) - return; - if (l4num == IPPROTO_TCP) - timeout = timeouts[TCP_CONNTRACK_ESTABLISHED]; + timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; else if (l4num == IPPROTO_UDP) - timeout = timeouts[UDP_CT_REPLIED]; + timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; else return; -- cgit v1.2.3 From f286586df68e7733a8e651098401f139dc2e17f4 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Mon, 18 Jun 2018 15:12:52 +0200 Subject: netfilter: nft_tproxy: Move nf_tproxy_assign_sock() to nf_tproxy.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is also necessary to implement nft tproxy support Fixes: 45ca4e0cf273 ("netfilter: Libify xt_TPROXY") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 8 ++++++++ net/netfilter/xt_TPROXY.c | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 9754a50ecde9..d5a80888cbe4 100644 --- 
a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -17,6 +17,14 @@ static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) return false; } +/* assign a socket to the skb -- consumes sk */ +static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); /** diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 58fce4e749a9..35df0827e2ca 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -36,15 +36,6 @@ #include #include -/* assign a socket to the skb -- consumes sk */ -static void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = sock_edemux; -} - static unsigned int tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) -- cgit v1.2.3 From d7e5a9a50245b91f016c814b0f076f7e55cbb980 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:49:43 +0200 Subject: netfilter: utils: move nf_ip_checksum* from ipv4 to utils This allows nf_ip_checksum_partial to be made static, since it no longer has an external caller.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4.h | 11 --------- net/ipv4/netfilter.c | 53 ---------------------------------------- net/netfilter/utils.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index b31dabfdb453..95ab5cc64422 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -23,9 +23,6 @@ struct nf_queue_entry; #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol); int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry); @@ -35,14 +32,6 @@ static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, { return 0; } -static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb, - unsigned int hook, - unsigned int dataoff, - unsigned int len, - u_int8_t protocol) -{ - return 0; -} static inline int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict) { diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e6774ccb7731..8d2e5dc9a827 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_ip_reroute); -__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if ((protocol == 0 && !csum_fold(skb->csum)) || - 
!csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - dataoff, protocol, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - if (protocol == 0) - skb->csum = 0; - else - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len - dataoff, - protocol, 0); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip_checksum); - -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, - skb->len - dataoff, 0); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -} -EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 0b660c568156..8980c8a0fe5c 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -1,9 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include #include #include +#ifdef CONFIG_INET +__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u8 protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if ((protocol == 0 && !csum_fold(skb->csum)) || + !csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - dataoff, protocol, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + if (protocol == 0) + 
skb->csum = 0; + else + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len - dataoff, + protocol, 0); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip_checksum); +#endif + +static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u8 protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, + skb->len - dataoff, 0); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +} + __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family) -- cgit v1.2.3 From ebee5a50d0b7cdc576aa8081f05b86971880054d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:49:59 +0200 Subject: netfilter: utils: move nf_ip6_checksum* from ipv6 to utils similar to previous change, this also allows to remove it from nf_ipv6_ops and avoid the indirection. It also removes the bogus dependency of nf_conntrack_ipv6 on ipv6 module: ipv6 checksum functions are built into kernel even if CONFIG_IPV6=m, but ipv6/netfilter.o isn't. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 5 --- net/ipv6/netfilter.c | 62 ---------------------------------- net/netfilter/utils.c | 76 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 288c597e75b3..c0dc4dd78887 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -30,11 +30,6 @@ struct nf_ipv6_ops { void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); - __sum16 (*checksum)(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol); - __sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol); int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 531d6957af36..5ae8e1c51079 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -15,7 +15,6 @@ #include #include #include -#include #include int ip6_route_me_harder(struct net *net, struct sk_buff *skb) @@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst, return err; } -__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - skb->len - dataoff, protocol, - csum_sub(skb->csum, - skb_checksum(skb, 0, - dataoff, 0)))) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } 
- /* fall through */ - case CHECKSUM_NONE: - skb->csum = ~csum_unfold( - csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - skb->len - dataoff, - protocol, - csum_sub(0, - skb_checksum(skb, 0, - dataoff, 0)))); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip6_checksum); - -static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __wsum hsum; - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip6_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - hsum = skb_checksum(skb, 0, dataoff, 0); - skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb->len - dataoff, - protocol, - csum_sub(0, hsum))); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -}; - static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, .route_input = ip6_route_input, .fragment = ip6_fragment, - .checksum = nf_ip6_checksum, - .checksum_partial = nf_ip6_checksum_partial, .route = nf_ip6_route, .reroute = nf_ip6_reroute, }; diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 8980c8a0fe5c..e8da9a9bba73 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, @@ -59,11 +60,69 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; } +__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u8 protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if 
(!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, protocol, + csum_sub(skb->csum, + skb_checksum(skb, 0, + dataoff, 0)))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold( + csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, + skb_checksum(skb, 0, + dataoff, 0)))); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip6_checksum); + +static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u8 protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum hsum; + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip6_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + hsum = skb_checksum(skb, 0, dataoff, 0); + skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, hsum))); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +}; + __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol, + unsigned int dataoff, u8 protocol, unsigned short family) { - const struct nf_ipv6_ops *v6ops; __sum16 csum = 0; switch (family) { @@ -71,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, csum = nf_ip_checksum(skb, hook, dataoff, protocol); break; case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - csum = v6ops->checksum(skb, hook, dataoff, protocol); + csum = nf_ip6_checksum(skb, hook, dataoff, protocol); break; } @@ -83,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, - u_int8_t protocol, unsigned short family) + u8 protocol, unsigned short 
family) { - const struct nf_ipv6_ops *v6ops; __sum16 csum = 0; switch (family) { @@ -94,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, protocol); break; case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - csum = v6ops->checksum_partial(skb, hook, dataoff, len, - protocol); + csum = nf_ip6_checksum_partial(skb, hook, dataoff, len, + protocol); break; } -- cgit v1.2.3 From 60e3be94e6a1c5162a0763c9aafb5190b2b1fdce Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:55:32 +0200 Subject: openvswitch: use nf_ct_get_tuplepr, invert_tuplepr These versions deal with the l3proto/l4proto details internally. It removes only caller of nf_ct_get_tuple, so make it static. After this, l3proto->get_l4proto() can be removed in a followup patch. Signed-off-by: Florian Westphal Acked-by: Pravin B Shelar Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 7 ------- net/netfilter/nf_conntrack_core.c | 3 +-- net/openvswitch/conntrack.c | 17 +++-------------- 3 files changed, 4 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 9b5e7634713e..90df45022c51 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -40,13 +40,6 @@ void nf_conntrack_cleanup_start(void); void nf_conntrack_init_end(void); void nf_conntrack_cleanup_end(void); -bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, - unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - struct net *net, - struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *l4proto); - bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_l3proto *l3proto, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 
85ab2fd6a665..be0ab81e6b2c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net, return scale_hash(hash_conntrack_raw(tuple, net)); } -bool +static bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, unsigned int dataoff, @@ -244,7 +244,6 @@ nf_ct_get_tuple(const struct sk_buff *skb, return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } -EXPORT_SYMBOL_GPL(nf_ct_get_tuple); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 284aca2a252d..e05bd3e53f0f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -607,23 +607,12 @@ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 l3num, struct sk_buff *skb, bool natted) { - const struct nf_conntrack_l3proto *l3proto; - const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - unsigned int dataoff; - u8 protonum; - l3proto = __nf_ct_l3proto_find(l3num); - if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, - &protonum) <= 0) { - pr_debug("ovs_ct_find_existing: Can't get protonum\n"); - return NULL; - } - l4proto = __nf_ct_l4proto_find(l3num, protonum); - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - protonum, net, &tuple, l3proto, l4proto)) { + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, + net, &tuple)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); return NULL; } @@ -632,7 +621,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, if (natted) { struct nf_conntrack_tuple inverse; - if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { + if (!nf_ct_invert_tuplepr(&inverse, &tuple)) { pr_debug("ovs_ct_find_existing: Inversion failed!\n"); return NULL; } 
-- cgit v1.2.3 From 7414d929bc35b9a7c3eab98ef7bd32d5ae4c2981 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 28 Jun 2018 20:01:02 +0200 Subject: netfilter: Kconfig: Make NETFILTER_XT_MATCH_SOCKET select NF_SOCKET_IPV4/6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of depending on it. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index dbd7d1fad277..3ce657fbca67 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1492,8 +1492,8 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n - depends on NF_SOCKET_IPV4 - depends on NF_SOCKET_IPV6 + select NF_SOCKET_IPV4 + select NF_SOCKET_IPV6 if IP6_NF_IPTABLES select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help -- cgit v1.2.3 From f957be9d349a3800940f823b16e12b0405cc305b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:44 +0200 Subject: netfilter: conntrack: remove ctnetlink callbacks from l3 protocol trackers handle everything from ctnetlink directly. After all these years we still only support ipv4 and ipv6, so it seems reasonable to remove l3 protocol tracker support and instead handle ipv4/ipv6 from a common, always builtin inet tracker. Step 1: Get rid of all the l3proto->func() calls. Start with ctnetlink, then move on to packet-path ones. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 6 +- include/net/netfilter/nf_conntrack_l3proto.h | 8 --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 47 ------------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 48 ------------- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 - net/netfilter/nf_conntrack_expect.c | 1 - net/netfilter/nf_conntrack_helper.c | 1 - net/netfilter/nf_conntrack_netlink.c | 96 +++++++++++++++++++------- net/netfilter/nf_conntrack_proto.c | 5 +- net/netfilter/nf_conntrack_standalone.c | 14 ++-- net/netfilter/nfnetlink_cttimeout.c | 1 - 11 files changed, 79 insertions(+), 149 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 90df45022c51..d454a53ba646 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -68,10 +68,8 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -void -print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *proto); +void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l4proto *proto); #define CONNTRACK_LOCKS 1024 diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index d5808f3e2715..d07b5216a925 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -46,14 +46,6 @@ struct nf_conntrack_l3proto { int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - int (*tuple_to_nlattr)(struct sk_buff *skb, - const struct nf_conntrack_tuple *t); - int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); - const struct nla_policy 
*nla_policy; -#endif - /* Called when netns wants to use connection tracking */ int (*net_ns_get)(struct net *); void (*net_ns_put)(struct net *); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 9db988f9a4d7..98ed12858c52 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -274,41 +274,6 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include - -static int ipv4_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V4_SRC] = { .type = NLA_U32 }, - [CTA_IP_V4_DST] = { .type = NLA_U32 }, -}; - -static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; - - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); - - return 0; -} -#endif - static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, @@ -360,13 +325,6 @@ const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_to_tuple = ipv4_nlattr_to_tuple, - .nla_policy = ipv4_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ - NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ -#endif .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, @@ -419,11 +377,6 @@ 
static int __init nf_conntrack_l3proto_ipv4_init(void) need_conntrack(); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv4.nla_size)) - return -EINVAL; -#endif ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 663827ee3cf8..13a660ae5799 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -269,41 +269,6 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include - -static int ipv6_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || - nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 }, - [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 }, -}; - -static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; - - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); - - return 0; -} -#endif - static int ipv6_hooks_register(struct net *net) { struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); @@ -345,13 +310,6 @@ const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, .get_l4proto = ipv6_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv6_tuple_to_nlattr, - 
.nlattr_to_tuple = ipv6_nlattr_to_tuple, - .nla_policy = ipv6_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) + - NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])), -#endif .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, .me = THIS_MODULE, @@ -409,12 +367,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) need_conntrack(); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv6.nla_size)) - return -EINVAL; -#endif - ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index c87b48359e8f..e631be25337e 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #endif diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 853b23206bb7..3f586ba23d92 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -610,7 +610,6 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - __nf_ct_l3proto_find(expect->tuple.src.l3num), __nf_ct_l4proto_find(expect->tuple.src.l3num, expect->tuple.dst.protonum)); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a75b11c39312..a55a58c706a9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 20a2e37c76d1..40152b9ad772 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -38,7 +38,6 @@ #include #include 
#include -#include #include #include #include @@ -81,9 +80,26 @@ nla_put_failure: return -1; } +static int ipv4_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || + nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) + return -EMSGSIZE; + return 0; +} + +static int ipv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || + nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) + return -EMSGSIZE; + return 0; +} + static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto) + const struct nf_conntrack_tuple *tuple) { int ret = 0; struct nlattr *nest_parms; @@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, if (!nest_parms) goto nla_put_failure; - if (likely(l3proto->tuple_to_nlattr)) - ret = l3proto->tuple_to_nlattr(skb, tuple); + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + ret = ipv4_tuple_to_nlattr(skb, tuple); + break; + case NFPROTO_IPV6: + ret = ipv6_tuple_to_nlattr(skb, tuple); + break; + } nla_nest_end(skb, nest_parms); @@ -106,13 +128,11 @@ nla_put_failure: static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; int ret; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); + ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { l4proto = __nf_ct_l4proto_find(tuple->src.l3num, @@ -556,15 +576,20 @@ nla_put_failure: return -1; } +static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = { + [CTA_IP_V4_SRC] = { .type = NLA_U32 }, + [CTA_IP_V4_DST] = { .type = NLA_U32 }, + [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 }, + [CTA_IP_V6_DST] = { 
.len = sizeof(__be32) * 4 }, +}; + #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; size_t len, len4 = 0; - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - len = l3proto->nla_size; + len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); @@ -936,29 +961,54 @@ out: return skb->len; } +static int ipv4_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + + return 0; +} + +static int ipv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + + return 0; +} + static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_IP_MAX+1]; - struct nf_conntrack_l3proto *l3proto; int ret = 0; ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL); if (ret < 0) return ret; - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = nla_validate_nested(attr, CTA_IP_MAX, + cta_ip_nla_policy, NULL); + if (ret) + return ret; - if (likely(l3proto->nlattr_to_tuple)) { - ret = nla_validate_nested(attr, CTA_IP_MAX, - l3proto->nla_policy, NULL); - if (ret == 0) - ret = l3proto->nlattr_to_tuple(tb, tuple); + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + ret = ipv4_nlattr_to_tuple(tb, tuple); + break; + case NFPROTO_IPV6: + ret = ipv6_nlattr_to_tuple(tb, tuple); + break; } - rcu_read_unlock(); - return ret; } @@ -2581,7 +2631,6 @@ 
static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; @@ -2597,8 +2646,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); + ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index d88841fbc560..859cb303bb91 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -294,10 +294,7 @@ int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) if (proto->l3proto >= NFPROTO_NUMPROTO) return -EBUSY; -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (proto->tuple_to_nlattr && proto->nla_size == 0) - return -EINVAL; -#endif + mutex_lock(&nf_ct_proto_mutex); old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], lockdep_is_held(&nf_ct_proto_mutex)); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b642c0b2495c..47b80fd0d2c3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -38,10 +37,9 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - switch (l3proto->l3proto) { + switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); @@ -282,7 +280,6 @@ static int ct_seq_show(struct 
seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; @@ -303,14 +300,12 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!net_eq(nf_ct_net(ct), net)) goto release; - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - WARN_ON(!l3proto); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); WARN_ON(!l4proto); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", - l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), + l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) @@ -320,7 +315,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto); + l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); @@ -333,8 +328,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); - print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto); + print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 9ee5fa551fa6..9da4b8462004 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 47a91b14de62e35d1466820cbb4c024b6c02dff1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:45 +0200 Subject: netfilter: conntrack: remove pkt_to_tuple indirection from l3 protocol trackers Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- 
include/net/netfilter/nf_conntrack_l3proto.h | 7 ----- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 17 ----------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 18 ------------ net/netfilter/nf_conntrack_core.c | 39 ++++++++++++++++++++++---- net/netfilter/nf_conntrack_l3proto_generic.c | 10 ------- 5 files changed, 33 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index d07b5216a925..ece231450f30 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,13 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Try to fill in the third arg: nhoff is offset of l3 proto - * hdr. Return true if possible. - */ - bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple); - /* * Invert the per-proto part of the tuple: ie. turn xmit into reply. * Some packets can't be inverted: return 0 in that case. 
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 98ed12858c52..7ed56f61798b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -38,22 +38,6 @@ struct conntrack4_net { unsigned int users; }; -static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const __be32 *ap; - __be32 _addrs[2]; - ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), - sizeof(u_int32_t) * 2, _addrs); - if (ap == NULL) - return false; - - tuple->src.u3.ip = ap[0]; - tuple->dst.u3.ip = ap[1]; - - return true; -} - static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -322,7 +306,6 @@ static void ipv4_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, - .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto, .net_ns_get = ipv4_hooks_register, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 13a660ae5799..bdb1709bb951 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -41,23 +41,6 @@ struct conntrack6_net { unsigned int users; }; -static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const u_int32_t *ap; - u_int32_t _addrs[8]; - - ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), - sizeof(_addrs), _addrs); - if (ap == NULL) - return false; - - memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); - memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); - - return true; -} - static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -307,7 +290,6 @@ static void 
ipv6_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, - .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, .get_l4proto = ipv6_get_l4proto, .net_ns_get = ipv6_hooks_register, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index be0ab81e6b2c..66b2ebae2747 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -230,15 +230,43 @@ nf_ct_get_tuple(const struct sk_buff *skb, u_int8_t protonum, struct net *net, struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { + unsigned int size; + const __be32 *ap; + __be32 _addrs[8]; + memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; - if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + switch (l3num) { + case NFPROTO_IPV4: + nhoff += offsetof(struct iphdr, saddr); + size = 2 * sizeof(__be32); + break; + case NFPROTO_IPV6: + nhoff += offsetof(struct ipv6hdr, saddr); + size = sizeof(_addrs); + break; + default: + return true; + } + + ap = skb_header_pointer(skb, nhoff, size, _addrs); + if (!ap) return false; + switch (l3num) { + case NFPROTO_IPV4: + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + break; + case NFPROTO_IPV6: + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + break; + } + tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; @@ -267,7 +295,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, l4proto = __nf_ct_l4proto_find(l3num, protonum); ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, - l3proto, l4proto); + l4proto); rcu_read_unlock(); return ret; @@ -1318,8 +1346,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, net, &tuple, l3proto, - 
l4proto)) { + dataoff, l3num, protonum, net, &tuple, l4proto)) { pr_debug("Can't get tuple\n"); return 0; } @@ -1633,7 +1660,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) l4proto = nf_ct_l4proto_find_get(l3num, l4num); if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple, l3proto, l4proto)) + l4num, net, &tuple, l4proto)) return -1; if (ct->status & IPS_SRC_NAT) { diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index 397e6911214f..0b01c9970e99 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -31,15 +31,6 @@ #include #include -static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); - memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - - return true; -} - static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -59,7 +50,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .get_l4proto = generic_get_l4proto, }; -- cgit v1.2.3 From d1b6fe94941f43e4743d5fea953d16b0a001c2c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:46 +0200 Subject: netfilter: conntrack: remove invert_tuple indirection from l3 protocol trackers It's simpler to just handle it directly in nf_ct_invert_tuple(). Also gets rid of the need to pass the l3proto pointer to resolve_normal_ct().
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 1 - include/net/netfilter/nf_conntrack_l3proto.h | 7 ------- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 10 ---------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 3 +-- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 10 ---------- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 3 +-- net/netfilter/nf_conntrack_core.c | 26 ++++++++++++++++---------- net/netfilter/nf_conntrack_l3proto_generic.c | 10 ---------- 8 files changed, 18 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index d454a53ba646..35461b2d3462 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -42,7 +42,6 @@ void nf_conntrack_cleanup_end(void); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto); /* Find a connection corresponding to a tuple. */ diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index ece231450f30..164641c743a5 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,13 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Invert the per-proto part of the tuple: ie. turn xmit into reply. - * Some packets can't be inverted: return 0 in that case. - */ - bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig); - /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) 
in skb diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 7ed56f61798b..e10e38c443ab 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -38,15 +38,6 @@ struct conntrack4_net { unsigned int users; }; -static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u3.ip = orig->dst.u3.ip; - tuple->dst.u3.ip = orig->src.u3.ip; - - return true; -} - static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -306,7 +297,6 @@ static void ipv4_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, - .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto, .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 5c15beafa711..34095949a003 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -142,8 +142,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
*/ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { + if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index bdb1709bb951..f8051fe20489 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -41,15 +41,6 @@ struct conntrack6_net { unsigned int users; }; -static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); - memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); - - return true; -} - static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -290,7 +281,6 @@ static void ipv6_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, - .invert_tuple = ipv6_invert_tuple, .get_l4proto = ipv6_get_l4proto, .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 2548e2c8aedd..8bcbc2f15bd5 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -152,8 +152,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
*/ - if (!nf_ct_invert_tuple(&intuple, &origtuple, - &nf_conntrack_l3proto_ipv6, inproto)) { + if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { pr_debug("icmpv6_error: Can't invert tuple\n"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 66b2ebae2747..14c040805b32 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -305,14 +305,24 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; - if (l3proto->invert_tuple(inverse, orig) == 0) - return false; + + switch (orig->src.l3num) { + case NFPROTO_IPV4: + inverse->src.u3.ip = orig->dst.u3.ip; + inverse->dst.u3.ip = orig->src.u3.ip; + break; + case NFPROTO_IPV6: + inverse->src.u3.in6 = orig->dst.u3.in6; + inverse->dst.u3.in6 = orig->src.u3.in6; + break; + default: + break; + } inverse->dst.dir = !orig->dst.dir; @@ -1222,7 +1232,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash) @@ -1237,7 +1246,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_zone tmp; unsigned int *timeouts; - if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { pr_debug("Can't invert tuple.\n"); return NULL; } @@ -1334,7 +1343,6 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto 
*l4proto) { const struct nf_conntrack_zone *zone; @@ -1356,7 +1364,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { - h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + h = init_conntrack(net, tmpl, &tuple, l4proto, skb, dataoff, hash); if (!h) return 0; @@ -1439,8 +1447,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } repeat: - ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, - l3proto, l4proto); + ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(net, drop); @@ -1497,7 +1504,6 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, rcu_read_lock(); ret = nf_ct_invert_tuple(inverse, orig, - __nf_ct_l3proto_find(orig->src.l3num), __nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum)); rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index 0b01c9970e99..d6a8fe591ccc 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -31,15 +31,6 @@ #include #include -static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); - memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - - return true; -} - static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -50,7 +41,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .invert_tuple = generic_invert_tuple, .get_l4proto = generic_get_l4proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); -- cgit v1.2.3 From 
6816d931cab009024b68c11c4cf752f8bf9a1e32 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:47 +0200 Subject: netfilter: conntrack: remove get_l4proto indirection from l3 protocol trackers Handle it in the core instead. ipv6_skip_exthdr() is built-in even if ipv6 is a module, i.e. this doesn't create an ipv6 dependency. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 8 -- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 30 ------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 29 ------- net/netfilter/Makefile | 2 +- net/netfilter/nf_conntrack_core.c | 108 ++++++++++++++++++++----- net/netfilter/nf_conntrack_l3proto_generic.c | 46 ----------- net/netfilter/nf_conntrack_proto.c | 5 ++ 7 files changed, 94 insertions(+), 134 deletions(-) delete mode 100644 net/netfilter/nf_conntrack_l3proto_generic.c (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 164641c743a5..5f160375c93a 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,14 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Called before tracking. - * *dataoff: offset of protocol header (TCP, UDP,...) 
in skb - * *protonum: protocol number - */ - int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum); - /* Called when netns wants to use connection tracking */ int (*net_ns_get)(struct net *); void (*net_ns_put)(struct net *); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index e10e38c443ab..9fbf6c7f8ece 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -38,35 +38,6 @@ struct conntrack4_net { unsigned int users; }; -static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - const struct iphdr *iph; - struct iphdr _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (iph == NULL) - return -NF_ACCEPT; - - /* Conntrack defragments packets, we might still see fragments - * inside ICMP packets though. */ - if (iph->frag_off & htons(IP_OFFSET)) - return -NF_ACCEPT; - - *dataoff = nhoff + (iph->ihl << 2); - *protonum = iph->protocol; - - /* Check bogus IP headers */ - if (*dataoff > skb->len) { - pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " - "nhoff %u, ihl %u, skblen %u\n", - nhoff, iph->ihl << 2, skb->len); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - static unsigned int ipv4_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -297,7 +268,6 @@ static void ipv4_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, - .get_l4proto = ipv4_get_l4proto, .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index f8051fe20489..37ab25645cf2 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -41,34 +41,6 
@@ struct conntrack6_net { unsigned int users; }; -static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - unsigned int extoff = nhoff + sizeof(struct ipv6hdr); - __be16 frag_off; - int protoff; - u8 nexthdr; - - if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), - &nexthdr, sizeof(nexthdr)) != 0) { - pr_debug("ip6_conntrack_core: can't get nexthdr\n"); - return -NF_ACCEPT; - } - protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); - /* - * (protoff == skb->len) means the packet has not data, just - * IPv6 and possibly extensions headers, but it is tracked anyway - */ - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); - return -NF_ACCEPT; - } - - *dataoff = protoff; - *protonum = nexthdr; - return NF_ACCEPT; -} - static unsigned int ipv6_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -281,7 +253,6 @@ static void ipv6_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, - .get_l4proto = ipv6_get_l4proto, .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, .me = THIS_MODULE, diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 44449389e527..f132ea850778 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o 
nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 14c040805b32..0674c6e5bfed 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -37,7 +37,6 @@ #include #include -#include #include #include #include @@ -55,6 +54,7 @@ #include #include #include +#include #include "nf_internals.h" @@ -273,21 +273,94 @@ nf_ct_get_tuple(const struct sk_buff *skb, return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } +static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + u_int8_t *protonum) +{ + int dataoff = -1; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4) + const struct iphdr *iph; + struct iphdr _iph; + + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + /* Conntrack defragments packets, we might still see fragments + * inside ICMP packets though. 
+ */ + if (iph->frag_off & htons(IP_OFFSET)) + return -1; + + dataoff = nhoff + (iph->ihl << 2); + *protonum = iph->protocol; + + /* Check bogus IP headers */ + if (dataoff > skb->len) { + pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", + nhoff, iph->ihl << 2, skb->len); + return -1; + } +#endif + return dataoff; +} + +static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + u8 *protonum) +{ + int protoff = -1; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) + unsigned int extoff = nhoff + sizeof(struct ipv6hdr); + __be16 frag_off; + u8 nexthdr; + + if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), + &nexthdr, sizeof(nexthdr)) != 0) { + pr_debug("can't get nexthdr\n"); + return -1; + } + protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); + /* + * (protoff == skb->len) means the packet has not data, just + * IPv6 and possibly extensions headers, but it is tracked anyway + */ + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("can't find proto in pkt\n"); + return -1; + } + + *protonum = nexthdr; +#endif + return protoff; +} + +static int get_l4proto(const struct sk_buff *skb, + unsigned int nhoff, u8 pf, u8 *l4num) +{ + switch (pf) { + case NFPROTO_IPV4: + return ipv4_get_l4proto(skb, nhoff, l4num); + case NFPROTO_IPV6: + return ipv6_get_l4proto(skb, nhoff, l4num); + default: + *l4num = 0; + break; + } + return -1; +} + bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - unsigned int protoff; - u_int8_t protonum; + u8 protonum; + int protoff; int ret; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(l3num); - ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); - if (ret != NF_ACCEPT) { + protoff = get_l4proto(skb, nhoff, l3num, &protonum); + if (protoff <= 0) { rcu_read_unlock(); return false; } @@ -1397,14 
+1470,12 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; unsigned int *timeouts; - unsigned int dataoff; u_int8_t protonum; - int ret; + int dataoff, ret; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl || ctinfo == IP_CT_UNTRACKED) { @@ -1418,14 +1489,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, } /* rcu_read_lock()ed by nf_hook_thresh */ - l3proto = __nf_ct_l3proto_find(pf); - ret = l3proto->get_l4proto(skb, skb_network_offset(skb), - &dataoff, &protonum); - if (ret <= 0) { + dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum); + if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - ret = -ret; + ret = NF_ACCEPT; goto out; } @@ -1641,14 +1710,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct nf_nat_hook *nat_hook; - unsigned int dataoff, status; + unsigned int status; struct nf_conn *ct; + int dataoff; u16 l3num; u8 l4num; @@ -1657,10 +1726,9 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; l3num = nf_ct_l3num(ct); - l3proto = nf_ct_l3proto_find_get(l3num); - if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, - &l4num) <= 0) + dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); + if (dataoff <= 0) return -1; l4proto = nf_ct_l4proto_find_get(l3num, l4num); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c deleted 
file mode 100644 index d6a8fe591ccc..000000000000 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * (C) 2003,2004 USAGI/WIDE Project - * - * Based largely upon the original ip_conntrack code which - * had the following copyright information: - * - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - /* Never track !!! */ - return -NF_ACCEPT; -} - - -struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { - .l3proto = PF_UNSPEC, - .get_l4proto = generic_get_l4proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 859cb303bb91..39df72bb9d56 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -35,6 +35,11 @@ EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); +struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { + .l3proto = PF_UNSPEC, +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); + #ifdef CONFIG_SYSCTL static int nf_ct_register_sysctl(struct net *net, -- cgit v1.2.3 From 8b3892ea8718920d29432328fe9544d89a429614 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:48 +0200 Subject: netfilter: conntrack: avoid calls to l4proto invert_tuple Handle the common cases (tcp, udp, etc.) in the core and only do the indirect call for the protocols that need it (GRE for instance).
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- net/netfilter/nf_conntrack_core.c | 8 +++++++- net/netfilter/nf_conntrack_proto_dccp.c | 10 ---------- net/netfilter/nf_conntrack_proto_generic.c | 10 ---------- net/netfilter/nf_conntrack_proto_gre.c | 10 ---------- net/netfilter/nf_conntrack_proto_sctp.c | 10 ---------- net/netfilter/nf_conntrack_proto_tcp.c | 10 ---------- net/netfilter/nf_conntrack_proto_udp.c | 12 ------------ 8 files changed, 8 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index a7220eef9aee..6a55e337a161 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -36,7 +36,7 @@ struct nf_conntrack_l4proto { struct net *net, struct nf_conntrack_tuple *tuple); /* Invert the per-proto part of the tuple: ie. turn xmit into reply. - * Some packets can't be inverted: return 0 in that case. + * Only used by icmp, most protocols use a generic version. 
*/ bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0674c6e5bfed..92efce69b690 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -400,7 +400,13 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, inverse->dst.dir = !orig->dst.dir; inverse->dst.protonum = orig->dst.protonum; - return l4proto->invert_tuple(inverse, orig); + + if (unlikely(l4proto->invert_tuple)) + return l4proto->invert_tuple(inverse, orig); + + inverse->src.u.all = orig->dst.u.all; + inverse->dst.u.all = orig->src.u.all; + return true; } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index abe647d5b8c6..05620c03f138 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -403,14 +403,6 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, - const struct nf_conntrack_tuple *tuple) -{ - inv->src.u.dccp.port = tuple->dst.u.dccp.port; - inv->dst.u.dccp.port = tuple->src.u.dccp.port; - return true; -} - static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { @@ -865,7 +857,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, .pkt_to_tuple = dccp_pkt_to_tuple, - .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, @@ -901,7 +892,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, .pkt_to_tuple = dccp_pkt_to_tuple, - .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, diff --git 
a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 6c6896d21cd7..4dfe40aa9446 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -41,15 +41,6 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, return true; } -static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return true; -} - static unsigned int *generic_get_timeouts(struct net *net) { return &(generic_pernet(net)->timeout); @@ -168,7 +159,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .l3proto = PF_UNSPEC, .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, - .invert_tuple = generic_invert_tuple, .packet = generic_packet, .get_timeouts = generic_get_timeouts, .new = generic_new, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index d049ea5a3770..0bd40eb06b55 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -179,15 +179,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ -/* invert gre part of tuple */ -static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - return true; -} - /* gre hdr info to tuple */ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -356,7 +347,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_sctp.c 
b/net/netfilter/nf_conntrack_proto_sctp.c index fb9a35d16069..148957a5cf3e 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -166,14 +166,6 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.sctp.port = orig->dst.u.sctp.port; - tuple->dst.u.sctp.port = orig->src.u.sctp.port; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -781,7 +773,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif @@ -818,7 +809,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8e67910185a0..03cff1e3066a 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -293,14 +293,6 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.tcp.port = orig->dst.u.tcp.port; - tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. 
*/ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -1560,7 +1552,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .l3proto = PF_INET, .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif @@ -1598,7 +1589,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index fe7243970aa4..6fe2233c323a 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -55,14 +55,6 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb, return true; } -static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.udp.port = orig->dst.u.udp.port; - tuple->dst.u.udp.port = orig->src.u.udp.port; - return true; -} - static unsigned int *udp_get_timeouts(struct net *net) { return udp_pernet(net)->timeouts; @@ -302,7 +294,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .l4proto = IPPROTO_UDP, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -334,7 +325,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -366,7 +356,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .l4proto = IPPROTO_UDP, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = 
udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -398,7 +387,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, -- cgit v1.2.3 From 97e08caec33a0923385b1215c3386c9ee1d07982 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:49 +0200 Subject: netfilter: conntrack: avoid l4proto pkt_to_tuple calls Handle common protocols (udp, tcp, ..), in the core and only do the call if needed by the l4proto tracker. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 16 +++++++++++++++- net/netfilter/nf_conntrack_proto_dccp.c | 17 ----------------- net/netfilter/nf_conntrack_proto_sctp.c | 18 ------------------ net/netfilter/nf_conntrack_proto_tcp.c | 19 ------------------- net/netfilter/nf_conntrack_proto_udp.c | 23 ----------------------- 5 files changed, 15 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 92efce69b690..994591fd9b96 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -235,6 +235,10 @@ nf_ct_get_tuple(const struct sk_buff *skb, unsigned int size; const __be32 *ap; __be32 _addrs[8]; + struct { + __be16 sport; + __be16 dport; + } _inet_hdr, *inet_hdr; memset(tuple, 0, sizeof(*tuple)); @@ -270,7 +274,17 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; - return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); + if (unlikely(l4proto->pkt_to_tuple)) + return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); + + /* Actually only need first 4 bytes to get ports. 
*/ + inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); + if (!inet_hdr) + return false; + + tuple->src.u.udp.port = inet_hdr->sport; + tuple->dst.u.udp.port = inet_hdr->dport; + return true; } static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 05620c03f138..abfdce7baed5 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -388,21 +388,6 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net) return &net->ct.nf_ct_proto.dccp; } -static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - struct dccp_hdr _hdr, *dh; - - /* Actually only need first 4 bytes to get ports. */ - dh = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (dh == NULL) - return false; - - tuple->src.u.dccp.port = dh->dccph_sport; - tuple->dst.u.dccp.port = dh->dccph_dport; - return true; -} - static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { @@ -856,7 +841,6 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, - .pkt_to_tuple = dccp_pkt_to_tuple, .new = dccp_new, .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, @@ -891,7 +875,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, - .pkt_to_tuple = dccp_pkt_to_tuple, .new = dccp_new, .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 148957a5cf3e..b4126a842bfd 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ 
-150,22 +150,6 @@ static inline struct nf_sctp_net *sctp_pernet(struct net *net) return &net->ct.nf_ct_proto.sctp; } -static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct sctphdr *hp; - struct sctphdr _hdr; - - /* Actually only need first 4 bytes to get ports. */ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.sctp.port = hp->source; - tuple->dst.u.sctp.port = hp->dest; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -772,7 +756,6 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, - .pkt_to_tuple = sctp_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif @@ -808,7 +791,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, - .pkt_to_tuple = sctp_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 03cff1e3066a..13c89fd107b2 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -276,23 +276,6 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net) return &net->ct.nf_ct_proto.tcp; } -static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct tcphdr *hp; - struct tcphdr _hdr; - - /* Actually only need first 4 bytes to get ports. 
*/ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.tcp.port = hp->source; - tuple->dst.u.tcp.port = hp->dest; - - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -1551,7 +1534,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, - .pkt_to_tuple = tcp_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif @@ -1588,7 +1570,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, - .pkt_to_tuple = tcp_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 6fe2233c323a..8b435d70ffe3 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -36,25 +36,6 @@ static inline struct nf_udp_net *udp_pernet(struct net *net) return &net->ct.nf_ct_proto.udp; } -static bool udp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, - struct nf_conntrack_tuple *tuple) -{ - const struct udphdr *hp; - struct udphdr _hdr; - - /* Actually only need first 4 bytes to get ports. 
*/ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.udp.port = hp->source; - tuple->dst.u.udp.port = hp->dest; - - return true; -} - static unsigned int *udp_get_timeouts(struct net *net) { return udp_pernet(net)->timeouts; @@ -293,7 +274,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -324,7 +304,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -355,7 +334,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -386,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .pkt_to_tuple = udp_pkt_to_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, -- cgit v1.2.3 From c779e849608a875448f6ffc2a5c2a15523bdcd00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:50 +0200 Subject: netfilter: conntrack: remove get_timeout() indirection Not needed, we can have the l4trackers fetch it themselves.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 8 ++------ include/net/netfilter/nf_conntrack_timeout.h | 18 ++++-------------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 16 +++++++++++----- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 14 ++++++++++---- net/netfilter/nf_conntrack_core.c | 16 ++-------------- net/netfilter/nf_conntrack_proto_dccp.c | 17 +++++++---------- net/netfilter/nf_conntrack_proto_generic.c | 22 ++++++++++++---------- net/netfilter/nf_conntrack_proto_gre.c | 14 ++++++++++---- net/netfilter/nf_conntrack_proto_sctp.c | 18 ++++++++---------- net/netfilter/nf_conntrack_proto_tcp.c | 23 +++++++++++------------ net/netfilter/nf_conntrack_proto_udp.c | 20 +++++++++++++------- net/netfilter/nfnetlink_cttimeout.c | 12 ++++-------- 12 files changed, 94 insertions(+), 104 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6a55e337a161..c7a0075d96df 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -45,13 +45,12 @@ struct nf_conntrack_l4proto { int (*packet)(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts); + enum ip_conntrack_info ctinfo); /* Called when a new connection for this protocol found; * returns TRUE if it's OK. If so, packet() called next. */ bool (*new)(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts); + unsigned int dataoff); /* Called when a conntrack entry is destroyed */ void (*destroy)(struct nf_conn *ct); @@ -63,9 +62,6 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* Return the array of timeouts for this protocol. 
*/ - unsigned int *(*get_timeouts)(struct net *net); - /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct); diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 9468ab4ad12d..80ceb3d0291d 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -67,27 +67,17 @@ struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, #endif }; -static inline unsigned int * -nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, - const struct nf_conntrack_l4proto *l4proto) +static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { + unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; - unsigned int *timeouts; timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) { + if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); - if (unlikely(!timeouts)) - timeouts = l4proto->get_timeouts(net); - } else { - timeouts = l4proto->get_timeouts(net); - } - - return timeouts; -#else - return l4proto->get_timeouts(net); #endif + return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 34095949a003..036670b38282 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -80,12 +81,16 @@ static unsigned int *icmp_get_timeouts(struct net *net) static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. 
*/ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmp_get_timeouts(nf_ct_net(ct)); + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -93,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, @@ -280,9 +285,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_icmp_net *in = icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { + if (!timeout) + timeout = &in->timeout; *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else { + } else if (timeout) { /* Set default ICMP timeout. */ *timeout = in->timeout; } @@ -357,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, - .get_timeouts = icmp_get_timeouts, .new = icmp_new, .error = icmp_error, .destroy = NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 8bcbc2f15bd5..bed07b998a10 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -93,9 +94,13 @@ static unsigned int *icmpv6_get_timeouts(struct net *net) static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmpv6_get_timeouts(nf_ct_net(ct)); + /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly 
ICMP echo reply duplicates. */ @@ -106,7 +111,7 @@ static int icmpv6_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, @@ -280,6 +285,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeout = data; struct nf_icmp_net *in = icmpv6_pernet(net); + if (!timeout) + timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; @@ -358,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .packet = icmpv6_packet, - .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, .error = icmpv6_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 994591fd9b96..c069f2faff4c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1337,7 +1337,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; - unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { pr_debug("Can't invert tuple.\n"); @@ -1356,15 +1355,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } timeout_ext = tmpl ? 
nf_ct_timeout_find(tmpl) : NULL; - if (timeout_ext) { - timeouts = nf_ct_timeout_data(timeout_ext); - if (unlikely(!timeouts)) - timeouts = l4proto->get_timeouts(net); - } else { - timeouts = l4proto->get_timeouts(net); - } - if (!l4proto->new(ct, skb, dataoff, timeouts)) { + if (!l4proto->new(ct, skb, dataoff)) { nf_conntrack_free(ct); pr_debug("can't track with proto module\n"); return NULL; @@ -1493,7 +1485,6 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; - unsigned int *timeouts; u_int8_t protonum; int dataoff, ret; @@ -1552,10 +1543,7 @@ repeat: goto out; } - /* Decide what timeout policy we want to apply to this flow. */ - timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - - ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index abfdce7baed5..f476d116c816 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -23,6 +23,7 @@ #include #include #include +#include #include /* Timeouts are based on values from RFC4340: @@ -389,7 +390,7 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net) } static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; @@ -437,19 +438,14 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) ntohl(dhack->dccph_ack_nr_low); } -static unsigned int *dccp_get_timeouts(struct net *net) -{ - return dccp_pernet(net)->dccp_timeout; -} - static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info ctinfo, - unsigned int 
*timeouts) + unsigned int dataoff, enum ip_conntrack_info ctinfo) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; + unsigned int *timeouts; dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); @@ -523,6 +519,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; @@ -843,7 +842,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l4proto = IPPROTO_DCCP, .new = dccp_new, .packet = dccp_packet, - .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS @@ -877,7 +875,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l4proto = IPPROTO_DCCP, .new = dccp_new, .packet = dccp_packet, - .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 4dfe40aa9446..ac4a0b296dcd 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -11,6 +11,7 @@ #include #include #include +#include static const unsigned int nf_ct_generic_timeout = 600*HZ; @@ -41,25 +42,24 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, return true; } -static unsigned int *generic_get_timeouts(struct net *net) -{ - return &(generic_pernet(net)->timeout); -} - /* Returns verdict for packet, or -1 for invalid. 
*/ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { + const unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = &generic_pernet(nf_ct_net(ct))->timeout; + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { bool ret; @@ -78,8 +78,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - unsigned int *timeout = data; struct nf_generic_net *gn = generic_pernet(net); + unsigned int *timeout = data; + + if (!timeout) + timeout = &gn->timeout; if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) *timeout = @@ -160,7 +163,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, .packet = generic_packet, - .get_timeouts = generic_get_timeouts, .new = generic_new, #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) .ctnl_timeout = { diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 0bd40eb06b55..d1632252bf5b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -234,8 +235,7 @@ static unsigned int *gre_get_timeouts(struct net *net) static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. 
*/ @@ -254,8 +254,13 @@ static int gre_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { + unsigned int *timeouts = nf_ct_timeout_lookup(ct); + + if (!timeouts) + timeouts = gre_get_timeouts(nf_ct_net(ct)); + pr_debug(": "); nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); @@ -291,6 +296,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; struct netns_proto_gre *net_gre = gre_pernet(net); + if (!timeouts) + timeouts = gre_get_timeouts(net); /* set default timeouts for GRE. */ timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; @@ -350,7 +357,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif - .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, .destroy = gre_destroy, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b4126a842bfd..8d1e085fc14a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -28,6 +28,7 @@ #include #include #include +#include /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR @@ -272,17 +273,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } -static unsigned int *sctp_get_timeouts(struct net *net) -{ - return sctp_pernet(net)->timeouts; -} - /* Returns verdict for packet, or -NF_ACCEPT for invalid. 
*/ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -291,6 +286,7 @@ static int sctp_packet(struct nf_conn *ct, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; + unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); @@ -379,6 +375,10 @@ static int sctp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && @@ -399,7 +399,7 @@ out: /* Called when a new connection for this protocol found. */ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctphdr *sh; @@ -760,7 +760,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, .can_early_drop = sctp_can_early_drop, @@ -795,7 +794,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, .can_early_drop = sctp_can_early_drop, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 13c89fd107b2..d80d322b9d8b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -29,6 +29,7 @@ #include #include #include +#include #include 
#include #include @@ -768,27 +769,21 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } -static unsigned int *tcp_get_timeouts(struct net *net) -{ - return tcp_pernet(net)->timeouts; -} - /* Returns verdict for packet, or -1 for invalid. */ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; + unsigned int index, *timeouts; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; - unsigned int index; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); @@ -1021,6 +1016,10 @@ static int tcp_packet(struct nf_conn *ct, && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = tn->timeouts; + if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; @@ -1070,7 +1069,7 @@ static int tcp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { enum tcp_conntrack new_state; const struct tcphdr *th; @@ -1288,10 +1287,12 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - unsigned int *timeouts = data; struct nf_tcp_net *tn = tcp_pernet(net); + unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = tn->timeouts; /* set default TCP timeouts. 
*/ for (i=0; itimeouts[i]; @@ -1538,7 +1539,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, @@ -1574,7 +1574,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8b435d70ffe3..7a1b8988a931 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -45,9 +46,14 @@ static unsigned int *udp_get_timeouts(struct net *net) static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { + unsigned int *timeouts; + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = udp_get_timeouts(nf_ct_net(ct)); + /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { @@ -65,7 +71,7 @@ static int udp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { return true; } @@ -176,6 +182,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; struct nf_udp_net *un = udp_pernet(net); + if (!timeouts) + timeouts = un->timeouts; + /* set default timeouts for UDP. 
*/ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; @@ -275,7 +284,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .l4proto = IPPROTO_UDP, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -305,7 +313,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -335,7 +342,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .l4proto = IPPROTO_UDP, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -365,7 +371,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -388,3 +393,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); #endif +#include diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 9da4b8462004..d9d952fad3e0 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -46,7 +46,7 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { }; static int -ctnl_timeout_parse_policy(void *timeouts, +ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { @@ -67,7 +67,7 @@ ctnl_timeout_parse_policy(void *timeouts, if (ret < 0) goto err; - ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + ret = 
l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); @@ -372,7 +372,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; - unsigned int *timeouts; __u16 l3num; __u8 l4num; int ret; @@ -392,9 +391,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, goto err; } - timeouts = l4proto->get_timeouts(net); - - ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, + ret = ctnl_timeout_parse_policy(NULL, l4proto, net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -431,7 +428,6 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { struct nlattr *nest_parms; - unsigned int *timeouts = l4proto->get_timeouts(net); int ret; nest_parms = nla_nest_start(skb, @@ -439,7 +435,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); if (ret < 0) goto nla_put_failure; -- cgit v1.2.3 From d9f37d01e294e5338aa3e9d3b2eda61b59b619df Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 13 Jul 2018 14:41:36 +0800 Subject: net: convert gro_count to bitmask gro_hash size is 192 bytes, and uses 3 cache lines, if there is few flows, gro_hash may be not fully used, so it is unnecessary to iterate all gro_hash in napi_gro_flush(), to occupy unnecessary cacheline. convert gro_count to a bitmask, and rename it as gro_bitmask, each bit represents a element of gro_hash, only flush a gro_hash element if the related bit is set, to speed up napi_gro_flush(). and update gro_bitmask only if it will be changed, to reduce cache update Suggested-by: Eric Dumazet Signed-off-by: Li RongQing Cc: Stefano Brivio Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 9 +++++++-- net/core/dev.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3514d67112b3..c1295c7a452e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,14 @@ struct gro_list { }; /* - * Structure for NAPI scheduling similar to tasklet but with weighting + * size of gro hash buckets, must less than bit number of + * napi_struct::gro_bitmask */ #define GRO_HASH_BUCKETS 8 + +/* + * Structure for NAPI scheduling similar to tasklet but with weighting + */ struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means @@ -322,7 +327,7 @@ struct napi_struct { unsigned long state; int weight; - unsigned int gro_count; + unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL int poll_owner; diff --git a/net/core/dev.c b/net/core/dev.c index 0df1771a12f9..c883b17ee0fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5282,9 +5282,11 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, list_del(&skb->list); skb->next = NULL; napi_gro_complete(skb); - napi->gro_count--; napi->gro_hash[index].count--; } + + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); } /* napi->gro_hash[].list contains packets ordered by age. 
@@ -5295,8 +5297,10 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { u32 i; - for (i = 0; i < GRO_HASH_BUCKETS; i++) - __napi_gro_flush_chain(napi, i, flush_old); + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + if (test_bit(i, &napi->gro_bitmask)) + __napi_gro_flush_chain(napi, i, flush_old); + } } EXPORT_SYMBOL(napi_gro_flush); @@ -5388,8 +5392,8 @@ static void gro_flush_oldest(struct list_head *head) if (WARN_ON_ONCE(!oldest)) return; - /* Do not adjust napi->gro_count, caller is adding a new SKB to - * the chain. + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. */ list_del(&oldest->list); napi_gro_complete(oldest); @@ -5464,7 +5468,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff list_del(&pp->list); pp->next = NULL; napi_gro_complete(pp); - napi->gro_count--; napi->gro_hash[hash].count--; } @@ -5477,7 +5480,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { gro_flush_oldest(gro_head); } else { - napi->gro_count++; napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; @@ -5492,6 +5494,13 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: + if (napi->gro_hash[hash].count) { + if (!test_bit(hash, &napi->gro_bitmask)) + __set_bit(hash, &napi->gro_bitmask); + } else if (test_bit(hash, &napi->gro_bitmask)) { + __clear_bit(hash, &napi->gro_bitmask); + } + return ret; normal: @@ -5890,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_count) { + if (n->gro_bitmask) { unsigned long timeout = 0; if (work_done) @@ -6099,7 +6108,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
*/ - if (napi->gro_count && !napi_disable_pending(napi) && + if (napi->gro_bitmask && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -6114,7 +6123,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - napi->gro_count = 0; + napi->gro_bitmask = 0; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; @@ -6174,7 +6183,7 @@ void netif_napi_del(struct napi_struct *napi) napi_free_frags(napi); flush_gro_hash(napi); - napi->gro_count = 0; + napi->gro_bitmask = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -6216,7 +6225,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_count) { + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ @@ -9272,6 +9281,9 @@ static struct hlist_head * __net_init netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { + BUILD_BUG_ON(GRO_HASH_BUCKETS > + FIELD_SIZEOF(struct napi_struct, gro_bitmask)); + if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); -- cgit v1.2.3 From 301f935be9e09a1bf188bd8262a4db0aeeac2b50 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 16 Jul 2018 16:45:09 +0200 Subject: sch_cake: Fix tin order when set through skb->priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In diffserv mode, CAKE stores tins in a different order internally than the logical order exposed to userspace. The order remapping was missing in the handling of 'tc filter' priority mappings through skb->priority, resulting in bulk and best effort mappings being reversed relative to how they are displayed. 
Fix this by adding the missing mapping when reading skb->priority. Fixes: 83f8fd69af4f ("sch_cake: Add DiffServ handling") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 30695691e9ff..539c9490c308 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1546,7 +1546,7 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->tin_cnt) { - tin = TC_H_MIN(skb->priority) - 1; + tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; if (q->rate_flags & CAKE_FLAG_WASH) cake_wash_diffserv(skb); -- cgit v1.2.3 From ccdb51717ba3bdc9585998e4ffd41d70c04dedea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 16 Jul 2018 17:02:04 -0700 Subject: net: Fix GRO_HASH_BUCKETS assertion. FIELD_SIZEOF() is in bytes, but we want bits. Fixes: d9f37d01e294 ("net: convert gro_count to bitmask") Suggested-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c883b17ee0fe..4f8b92d81d10 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9282,7 +9282,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - FIELD_SIZEOF(struct napi_struct, gro_bitmask)); + 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask)); if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); -- cgit v1.2.3 From a0ae2562c6c4b2721d9fddba63b7286c13517d9f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:51 +0200 Subject: netfilter: conntrack: remove l3proto abstraction This unifies ipv4 and ipv6 protocol trackers and removes the l3proto abstraction. 
This gets rid of all l3proto indirect calls and the need to do a lookup on the function to call for l3 demux. It increases module size by only a small amount (12kbyte), so this reduces size because nf_conntrack.ko is useless without either nf_conntrack_ipv4 or nf_conntrack_ipv6 module. before: text data bss dec hex filename 7357 1088 0 8445 20fd nf_conntrack_ipv4.ko 7405 1084 4 8493 212d nf_conntrack_ipv6.ko 72614 13689 236 86539 1520b nf_conntrack.ko 19K nf_conntrack_ipv4.ko 19K nf_conntrack_ipv6.ko 179K nf_conntrack.ko after: text data bss dec hex filename 79277 13937 236 93450 16d0a nf_conntrack.ko 191K nf_conntrack.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 - include/net/netfilter/nf_conntrack.h | 5 + include/net/netfilter/nf_conntrack_core.h | 1 - include/net/netfilter/nf_conntrack_l3proto.h | 54 -- include/net/netfilter/nf_conntrack_l4proto.h | 4 - net/ipv4/netfilter/Kconfig | 22 +- net/ipv4/netfilter/Makefile | 6 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 368 ----------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 388 ----------- net/ipv6/netfilter/Kconfig | 27 +- net/ipv6/netfilter/Makefile | 6 - net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 355 ----------- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 387 ----------- net/netfilter/Kconfig | 2 + net/netfilter/Makefile | 7 +- net/netfilter/nf_conntrack_core.c | 11 +- net/netfilter/nf_conntrack_proto.c | 847 ++++++++++++++++++------- net/netfilter/nf_conntrack_proto_icmp.c | 388 +++++++++++ net/netfilter/nf_conntrack_proto_icmpv6.c | 387 +++++++++++ net/netfilter/nf_conntrack_standalone.c | 14 +- net/netfilter/nf_nat_core.c | 8 - 21 files changed, 1420 insertions(+), 1870 deletions(-) delete mode 100644 include/net/netfilter/nf_conntrack_l3proto.h delete mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c delete mode 100644 net/ipv4/netfilter/nf_conntrack_proto_icmp.c delete mode 100644 
net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c delete mode 100644 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c create mode 100644 net/netfilter/nf_conntrack_proto_icmp.c create mode 100644 net/netfilter/nf_conntrack_proto_icmpv6.c (limited to 'net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 73f825732326..c84b51682f08 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,9 +10,6 @@ #ifndef _NF_CONNTRACK_IPV4_H #define _NF_CONNTRACK_IPV4_H - -const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; - extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 062dc19b5840..a2b0ed025908 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -41,6 +41,11 @@ union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; +struct nf_conntrack_net { + unsigned int users4; + unsigned int users6; +}; + #include #include diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 35461b2d3462..2a3e0974a6af 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -14,7 +14,6 @@ #define _NF_CONNTRACK_CORE_H #include -#include #include #include diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h deleted file mode 100644 index 5f160375c93a..000000000000 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C)2003,2004 USAGI/WIDE Project - * - * Header for use in defining a given L3 protocol for connection 
tracking. - * - * Author: - * Yasuyuki Kozakai @USAGI - * - * Derived from include/netfilter_ipv4/ip_conntrack_protocol.h - */ - -#ifndef _NF_CONNTRACK_L3PROTO_H -#define _NF_CONNTRACK_L3PROTO_H -#include -#include -#include -#include - -struct nf_conntrack_l3proto { - /* L3 Protocol Family number. ex) PF_INET */ - u_int16_t l3proto; - - /* size of tuple nlattr, fills a hole */ - u16 nla_size; - - /* Called when netns wants to use connection tracking */ - int (*net_ns_get)(struct net *); - void (*net_ns_put)(struct net *); - - /* Module (if any) which this is connected to. */ - struct module *me; -}; - -extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; - -/* Protocol global registration. */ -int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto); -void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto); - -const struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); - -/* Existing built-in protocols */ -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; - -static inline struct nf_conntrack_l3proto * -__nf_ct_l3proto_find(u_int16_t l3proto) -{ - if (unlikely(l3proto >= NFPROTO_NUMPROTO)) - return &nf_conntrack_l3proto_generic; - return rcu_dereference(nf_ct_l3protos[l3proto]); -} - -#endif /*_NF_CONNTRACK_L3PROTO_H*/ diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index c7a0075d96df..6068c6da3eac 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -130,10 +130,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, /* Protocol global registration. 
*/ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto); -int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const proto[], - unsigned int num_proto); -void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const proto[], - unsigned int num_proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index bbfc356cb1b5..d9504adc47b3 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4 tristate default n -config NF_CONNTRACK_IPV4 - tristate "IPv4 connection tracking support (required for NAT)" - depends on NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV4 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv4 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. 
- config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -112,7 +96,7 @@ config NF_REJECT_IPV4 config NF_NAT_IPV4 tristate "IPv4 NAT" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT help @@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY # NAT + specific targets: nf_conntrack config IP_NF_NAT tristate "iptables NAT support" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT select NF_NAT_IPV4 @@ -340,7 +324,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support" depends on IP_NF_MANGLE - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK select NETFILTER_FAMILY_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 8394c17c269f..367993adf4d3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,12 +3,6 @@ # Makefile for the netfilter modules on top of IPv4. # -# objects for l3 independent conntrack -nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o - -# connection tracking -obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o - nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c deleted file mode 100644 index 9fbf6c7f8ece..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ /dev/null @@ -1,368 +0,0 @@ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2006-2012 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int conntrack4_net_id __read_mostly; -static DEFINE_MUTEX(register_ipv4_hooks); - -struct conntrack4_net { - unsigned int users; -}; - -static unsigned int ipv4_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv4_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -static unsigned int ipv4_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if 
(ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ - enum ip_conntrack_info ctinfo; - struct nf_conn *tmpl; - - tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl && nf_ct_is_template(tmpl)) { - /* when skipping ct, clear templates to avoid fooling - * later targets/matches - */ - skb->_nfct = 0; - nf_ct_put(tmpl); - } - return NF_ACCEPT; - } - - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static const struct nf_hook_ops ipv4_conntrack_ops[] = { - { - .hook = ipv4_conntrack_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_conntrack_local, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. 
*/ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - - memset(&tuple, 0, sizeof(tuple)); - - lock_sock(sk); - tuple.src.u3.ip = inet->inet_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.ip = inet->inet_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.src.l3num = PF_INET; - tuple.dst.protonum = sk->sk_protocol; - release_sock(sk); - - /* We only do TCP and SCTP at the moment: is there a better way? */ - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (h) { - struct sockaddr_in sin; - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u3.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - nf_ct_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = getorigdst, - .owner = THIS_MODULE, -}; - -static int ipv4_hooks_register(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, 
conntrack4_net_id); - int err = 0; - - mutex_lock(&register_ipv4_hooks); - - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv4_enable(net); - if (err) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(&register_ipv4_hooks); - return err; -} - -static void ipv4_hooks_unregister(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - - mutex_lock(&register_ipv4_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - mutex_unlock(&register_ipv4_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { - .l3proto = PF_INET, - .net_ns_get = ipv4_hooks_register, - .net_ns_put = ipv4_hooks_unregister, - .me = THIS_MODULE, -}; - -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); -MODULE_ALIAS("ip_conntrack"); -MODULE_LICENSE("GPL"); - -static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, - &nf_conntrack_l4proto_icmp, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, -#endif -}; - -static int ipv4_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static void ipv4_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static struct pernet_operations ipv4_net_ops = { - .init = ipv4_net_init, - .exit = ipv4_net_exit, - .id = &conntrack4_net_id, - .size = sizeof(struct conntrack4_net),
-}; - -static int __init nf_conntrack_l3proto_ipv4_init(void) -{ - int ret = 0; - - need_conntrack(); - - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv4_net_ops); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); - goto cleanup_sockopt; - } - - ret = nf_ct_l4proto_register(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_l4proto; - } - - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - cleanup_pernet: - unregister_pernet_subsys(&ipv4_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv4_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - unregister_pernet_subsys(&ipv4_net_ops); - nf_unregister_sockopt(&so_getorigdst); -} - -module_init(nf_conntrack_l3proto_ipv4_init); -module_exit(nf_conntrack_l3proto_ipv4_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c deleted file mode 100644 index 036670b38282..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ /dev/null @@ -1,388 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2006-2010 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const unsigned int nf_ct_icmp_timeout = 30*HZ; - -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - -static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct icmphdr *hp; - struct icmphdr _hdr; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return false; - - tuple->dst.u.icmp.type = hp->type; - tuple->src.u.icmp.id = hp->un.echo.id; - tuple->dst.u.icmp.code = hp->code; - - return true; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 -}; - -static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - if (orig->dst.u.icmp.type >= sizeof(invmap) || - !invmap[orig->dst.u.icmp.type]) - return false; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return true; -} - -static unsigned int *icmp_get_timeouts(struct net *net) -{ - return &icmp_pernet(net)->timeout; -} - -/* Returns verdict for packet, or -1 for invalid. 
*/ -static int icmp_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo) -{ - /* Do not immediately delete the connection after the first - successful reply to avoid excessive conntrackd traffic - and also to handle correctly ICMP echo reply duplicates. */ - unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!timeout) - timeout = icmp_get_timeouts(nf_ct_net(ct)); - - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 - }; - - if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || - !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { - /* Can't create a new ICMP `conn' with this. */ - pr_debug("icmp: can't create new conn with type %u\n", - ct->tuplehash[0].tuple.dst.u.icmp.type); - nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int hooknum) -{ - struct nf_conntrack_tuple innertuple, origtuple; - const struct nf_conntrack_l4proto *innerproto; - const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_zone *zone; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - - /* Are they talking about one of our connections? 
*/ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); - return -NF_ACCEPT; - } - - /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} - -static void icmp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); -} - -/* Small and modified version of icmp_rcv */ -static int -icmp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct icmphdr *icmph; - struct icmphdr _ih; - - /* Not enough header? */ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); - if (icmph == NULL) { - icmp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } - - /* See ip_conntrack_proto_tcp.c */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, dataoff, 0)) { - icmp_error_log(skb, net, pf, "bad hw icmp checksum"); - return -NF_ACCEPT; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. 
- */ - if (icmph->type > NR_ICMP_TYPES) { - icmp_error_log(skb, net, pf, "invalid icmp type"); - return -NF_ACCEPT; - } - - /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) - return NF_ACCEPT; - - return icmp_error_message(net, tmpl, skb, hooknum); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include - -static int icmp_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *t) -{ - if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || - nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || - nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, -}; - -static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; - - return 0; -} - -static unsigned int icmp_nlattr_tuple_size(void) -{ - static unsigned int size __read_mostly; - - if (!size) - size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); - - return size; -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include -#include - -static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeout = data; - struct nf_icmp_net 
*in = icmp_pernet(net); - - if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { - if (!timeout) - timeout = &in->timeout; - *timeout = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else if (timeout) { - /* Set default ICMP timeout. */ - *timeout = in->timeout; - } - return 0; -} - -static int -icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeout = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { - [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table icmp_sysctl_table[] = { - { - .procname = "nf_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmp_sysctl_table, - sizeof(icmp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmp_init_net(struct net *net, u_int16_t proto) -{ - struct nf_icmp_net *in = icmp_pernet(net); - struct nf_proto_net *pn = &in->pn; - - in->timeout = nf_ct_icmp_timeout; - - return icmp_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp.pn; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_ICMP, - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .packet = icmp_packet, - .new = icmp_new, - .error = icmp_error, - .destroy = NULL, - .me = NULL, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = 
icmp_tuple_to_nlattr, - .nlattr_tuple_size = icmp_nlattr_tuple_size, - .nlattr_to_tuple = icmp_nlattr_to_tuple, - .nla_policy = icmp_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = icmp_timeout_nlattr_to_obj, - .obj_to_nlattr = icmp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_ICMP_MAX, - .obj_size = sizeof(unsigned int), - .nla_policy = icmp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = icmp_init_net, - .get_net_proto = icmp_get_net_proto, -}; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 37b14dc9d863..339d0762b027 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,26 +5,6 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER -config NF_DEFRAG_IPV6 - tristate - default n - -config NF_CONNTRACK_IPV6 - tristate "IPv6 connection tracking support" - depends on INET && IPV6 && NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV6 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv6 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. 
- config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help @@ -128,7 +108,7 @@ config NF_LOG_IPV6 config NF_NAT_IPV6 tristate "IPv6 NAT" - depends on NF_CONNTRACK_IPV6 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT help @@ -328,7 +308,7 @@ config IP6_NF_SECURITY config IP6_NF_NAT tristate "ip6tables NAT support" - depends on NF_CONNTRACK_IPV6 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT select NF_NAT_IPV6 @@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT endif # IP6_NF_NAT endif # IP6_NF_IPTABLES - endmenu +config NF_DEFRAG_IPV6 + tristate diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 10a5a1c87320..200c0c235565 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o -# objects for l3 independent conntrack -nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o - -# l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o - nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c deleted file mode 100644 index 37ab25645cf2..000000000000 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (C)2004 USAGI/WIDE Project - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Author: - * Yasuyuki Kozakai @USAGI - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int conntrack6_net_id; -static DEFINE_MUTEX(register_ipv6_hooks); - -struct conntrack6_net { - unsigned int users; -}; - -static unsigned int ipv6_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - enum ip_conntrack_info ctinfo; - __be16 frag_off; - int protoff; - u8 nexthdr; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - - return helper->help(skb, protoff, ct, ctinfo); -} - -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - int protoff; - __be16 frag_off; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - goto out; - } - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - 
!nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv6_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); -} - -static unsigned int ipv6_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); -} - -static const struct nf_hook_ops ipv6_conntrack_ops[] = { - { - .hook = ipv6_conntrack_in, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK, - }, - { - .hook = ipv6_conntrack_local, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK, - }, - { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv6_confirm, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_LAST, - }, - { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv6_confirm, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_LAST-1, - }, -}; - -static int -ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; - const struct ipv6_pinfo *inet6 = inet6_sk(sk); - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct sockaddr_in6 sin6; - struct nf_conn *ct; - __be32 flow_label; - int bound_dev_if; - - lock_sock(sk); - tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.in6 = sk->sk_v6_daddr; - 
tuple.dst.u.tcp.port = inet->inet_dport; - tuple.dst.protonum = sk->sk_protocol; - bound_dev_if = sk->sk_bound_dev_if; - flow_label = inet6->flow_label; - release_sock(sk); - - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) - return -ENOPROTOOPT; - - if (*len < 0 || (unsigned int) *len < sizeof(sin6)) - return -EINVAL; - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; - } - - ct = nf_ct_tuplehash_to_ctrack(h); - - sin6.sin6_family = AF_INET6; - sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; - memcpy(&sin6.sin6_addr, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, - sizeof(sin6.sin6_addr)); - - nf_ct_put(ct); - sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); - return copy_to_user(user, &sin6, sizeof(sin6)) ? 
-EFAULT : 0; -} - -static int ipv6_hooks_register(struct net *net) -{ - struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); - int err = 0; - - mutex_lock(&register_ipv6_hooks); - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv6_enable(net); - if (err < 0) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(&register_ipv6_hooks); - return err; -} - -static void ipv6_hooks_unregister(struct net *net) -{ - struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); - - mutex_lock(&register_ipv6_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - mutex_unlock(&register_ipv6_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { - .l3proto = PF_INET6, - .net_ns_get = ipv6_hooks_register, - .net_ns_put = ipv6_hooks_unregister, - .me = THIS_MODULE, -}; - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); - -static struct nf_sockopt_ops so_getorigdst6 = { - .pf = NFPROTO_IPV6, - .get_optmin = IP6T_SO_ORIGINAL_DST, - .get_optmax = IP6T_SO_ORIGINAL_DST + 1, - .get = ipv6_getorigdst, - .owner = THIS_MODULE, -}; - -static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = { - &nf_conntrack_l4proto_tcp6, - &nf_conntrack_l4proto_udp6, - &nf_conntrack_l4proto_icmpv6, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite6, -#endif -}; - -static int ipv6_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); -} - -static void ipv6_net_exit(struct net *net) -{ - 
nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); -} - -static struct pernet_operations ipv6_net_ops = { - .init = ipv6_net_init, - .exit = ipv6_net_exit, - .id = &conntrack6_net_id, - .size = sizeof(struct conntrack6_net), -}; - -static int __init nf_conntrack_l3proto_ipv6_init(void) -{ - int ret = 0; - - need_conntrack(); - - ret = nf_register_sockopt(&so_getorigdst6); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv6_net_ops); - if (ret < 0) - goto cleanup_sockopt; - - ret = nf_ct_l4proto_register(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); - goto cleanup_l4proto; - } - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - cleanup_pernet: - unregister_pernet_subsys(&ipv6_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst6); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv6_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_ct_l4proto_unregister(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - unregister_pernet_subsys(&ipv6_net_ops); - nf_unregister_sockopt(&so_getorigdst6); -} - -module_init(nf_conntrack_l3proto_ipv6_init); -module_exit(nf_conntrack_l3proto_ipv6_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c deleted file mode 100644 index bed07b998a10..000000000000 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Copyright (C)2003,2004 USAGI/WIDE Project - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by 
the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; - -static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6; -} - -static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, - struct nf_conntrack_tuple *tuple) -{ - const struct icmp6hdr *hp; - struct icmp6hdr _hdr; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return false; - tuple->dst.u.icmp.type = hp->icmp6_type; - tuple->src.u.icmp.id = hp->icmp6_identifier; - tuple->dst.u.icmp.code = hp->icmp6_code; - - return true; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, - [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, - [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 -}; - -static const u_int8_t noct_valid_new[] = { - [ICMPV6_MGM_QUERY - 130] = 1, - [ICMPV6_MGM_REPORT - 130] = 1, - [ICMPV6_MGM_REDUCTION - 130] = 1, - [NDISC_ROUTER_SOLICITATION - 130] = 1, - [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, - [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, - [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 -}; - -static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - int type = orig->dst.u.icmp.type - 128; - if (type < 0 || type >= sizeof(invmap) || !invmap[type]) - return false; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return true; -} - -static unsigned int *icmpv6_get_timeouts(struct net *net) -{ - return 
&icmpv6_pernet(net)->timeout; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmpv6_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo) -{ - unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!timeout) - timeout = icmpv6_get_timeouts(nf_ct_net(ct)); - - /* Do not immediately delete the connection after the first - successful reply to avoid excessive conntrackd traffic - and also to handle correctly ICMP echo reply duplicates. */ - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - static const u_int8_t valid_new[] = { - [ICMPV6_ECHO_REQUEST - 128] = 1, - [ICMPV6_NI_QUERY - 128] = 1 - }; - int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; - - if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { - /* Can't create a new ICMPv6 `conn' with this. */ - pr_debug("icmpv6: can't create new conn with type %u\n", - type + 128); - nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - -static int -icmpv6_error_message(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int icmp6off) -{ - struct nf_conntrack_tuple intuple, origtuple; - const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_l4proto *inproto; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - - /* Are they talking about one of our connections? 
*/ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) - + sizeof(struct ipv6hdr) - + sizeof(struct icmp6hdr), - PF_INET6, net, &origtuple)) { - pr_debug("icmpv6_error: Can't get tuple\n"); - return -NF_ACCEPT; - } - - /* rcu_read_lock()ed by nf_hook_thresh */ - inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { - pr_debug("icmpv6_error: Can't invert tuple\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), - &intuple); - if (!h) { - pr_debug("icmpv6_error: no match\n"); - return -NF_ACCEPT; - } else { - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} - -static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); -} - -static int -icmpv6_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct icmp6hdr *icmp6h; - struct icmp6hdr _ih; - int type; - - icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); - if (icmp6h == NULL) { - icmpv6_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } - - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); - return -NF_ACCEPT; - } - - type = icmp6h->icmp6_type - 130; - if (type >= 0 && type < sizeof(noct_valid_new) && - noct_valid_new[type]) { - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - return NF_ACCEPT; - } - - /* is not error message ? 
*/ - if (icmp6h->icmp6_type >= 128) - return NF_ACCEPT; - - return icmpv6_error_message(net, tmpl, skb, dataoff); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include -static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *t) -{ - if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || - nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || - nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, -}; - -static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; - - return 0; -} - -static unsigned int icmpv6_nlattr_tuple_size(void) -{ - static unsigned int size __read_mostly; - - if (!size) - size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); - - return size; -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include -#include - -static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeout = data; - struct nf_icmp_net *in = icmpv6_pernet(net); - - if (!timeout) - timeout = icmpv6_get_timeouts(net); - if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { - *timeout = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; - } else { - /* Set default ICMPv6 
timeout. */ - *timeout = in->timeout; - } - return 0; -} - -static int -icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeout = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { - [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table icmpv6_sysctl_table[] = { - { - .procname = "nf_conntrack_icmpv6_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmpv6_sysctl_table, - sizeof(icmpv6_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmpv6_init_net(struct net *net, u_int16_t proto) -{ - struct nf_icmp_net *in = icmpv6_pernet(net); - struct nf_proto_net *pn = &in->pn; - - in->timeout = nf_ct_icmpv6_timeout; - - return icmpv6_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6.pn; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_ICMPV6, - .pkt_to_tuple = icmpv6_pkt_to_tuple, - .invert_tuple = icmpv6_invert_tuple, - .packet = icmpv6_packet, - .new = icmpv6_new, - .error = icmpv6_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = icmpv6_tuple_to_nlattr, - .nlattr_tuple_size = icmpv6_nlattr_tuple_size, - .nlattr_to_tuple = icmpv6_nlattr_to_tuple, - .nla_policy = icmpv6_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - 
.ctnl_timeout = { - .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, - .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_ICMP_MAX, - .obj_size = sizeof(unsigned int), - .nla_policy = icmpv6_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = icmpv6_init_net, - .get_net_proto = icmpv6_get_net_proto, -}; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3ce657fbca67..9eab519b403a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -49,6 +49,8 @@ config NETFILTER_NETLINK_LOG config NF_CONNTRACK tristate "Netfilter connection tracking support" default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IPV6 != n help Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f132ea850778..53bd1ed1228a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,7 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \ + nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \ + nf_conntrack_proto_icmp.o \ + nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o + +nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += 
nf_conntrack_ecache.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c069f2faff4c..5123e91b1982 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -291,7 +291,6 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u_int8_t *protonum) { int dataoff = -1; -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4) const struct iphdr *iph; struct iphdr _iph; @@ -314,15 +313,14 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, nhoff, iph->ihl << 2, skb->len); return -1; } -#endif return dataoff; } +#if IS_ENABLED(CONFIG_IPV6) static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 *protonum) { int protoff = -1; -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) unsigned int extoff = nhoff + sizeof(struct ipv6hdr); __be16 frag_off; u8 nexthdr; @@ -343,9 +341,9 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, } *protonum = nexthdr; -#endif return protoff; } +#endif static int get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 pf, u8 *l4num) @@ -353,8 +351,10 @@ static int get_l4proto(const struct sk_buff *skb, switch (pf) { case NFPROTO_IPV4: return ipv4_get_l4proto(skb, nhoff, l4num); +#if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: return ipv6_get_l4proto(skb, nhoff, l4num); +#endif default: *l4num = 0; break; @@ -2197,9 +2197,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) } EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 39df72bb9d56..803607a90102 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -1,14 +1,4 @@ -/* L3/L4 
protocol support for nf_conntrack. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * (C) 2003,2004 USAGI/WIDE Project - * (C) 2006-2012 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ +// SPDX-License-Identifier: GPL-2.0 #include #include @@ -24,22 +14,39 @@ #include #include -#include #include #include #include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +extern unsigned int nf_conntrack_net_id; + static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); -struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { - .l3proto = PF_UNSPEC, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); - #ifdef CONFIG_SYSCTL static int nf_ct_register_sysctl(struct net *net, @@ -127,137 +134,6 @@ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); -/* this is guaranteed to always return a valid protocol helper, since - * it falls back to generic_protocol */ -const struct nf_conntrack_l3proto * -nf_ct_l3proto_find_get(u_int16_t l3proto) -{ - struct nf_conntrack_l3proto *p; - - rcu_read_lock(); - p = __nf_ct_l3proto_find(l3proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l3proto_generic; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); - -int -nf_ct_l3proto_try_module_get(unsigned short l3proto) -{ - const struct nf_conntrack_l3proto *p; - int ret; - -retry: p = nf_ct_l3proto_find_get(l3proto); - if (p == 
&nf_conntrack_l3proto_generic) { - ret = request_module("nf_conntrack-%d", l3proto); - if (!ret) - goto retry; - - return -EPROTOTYPE; - } - - return 0; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get); - -void nf_ct_l3proto_module_put(unsigned short l3proto) -{ - struct nf_conntrack_l3proto *p; - - /* rcu_read_lock not necessary since the caller holds a reference, but - * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find() - */ - rcu_read_lock(); - p = __nf_ct_l3proto_find(l3proto); - module_put(p->me); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); - -static int nf_ct_netns_do_get(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - int ret; - - might_sleep(); - - ret = nf_ct_l3proto_try_module_get(nfproto); - if (ret < 0) - return ret; - - /* we already have a reference, can't fail */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (!l3proto->net_ns_get) - return 0; - - ret = l3proto->net_ns_get(net); - if (ret < 0) - nf_ct_l3proto_module_put(nfproto); - - return ret; -} - -int nf_ct_netns_get(struct net *net, u8 nfproto) -{ - int err; - - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; - -err2: - nf_ct_netns_put(net, NFPROTO_IPV4); -err1: - return err; -} -EXPORT_SYMBOL_GPL(nf_ct_netns_get); - -static void nf_ct_netns_do_put(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - - might_sleep(); - - /* same as nf_conntrack_netns_get(), reference assumed */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (WARN_ON(!l3proto)) - return; - - if (l3proto->net_ns_put) - l3proto->net_ns_put(net); - - nf_ct_l3proto_module_put(nfproto); -} - -void nf_ct_netns_put(struct 
net *net, uint8_t nfproto) -{ - if (nfproto == NFPROTO_INET) { - nf_ct_netns_do_put(net, NFPROTO_IPV4); - nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else - nf_ct_netns_do_put(net, nfproto); -} -EXPORT_SYMBOL_GPL(nf_ct_netns_put); - const struct nf_conntrack_l4proto * nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) { @@ -279,11 +155,6 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p) } EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); -static int kill_l3proto(struct nf_conn *i, void *data) -{ - return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto; -} - static int kill_l4proto(struct nf_conn *i, void *data) { const struct nf_conntrack_l4proto *l4proto; @@ -292,49 +163,6 @@ static int kill_l4proto(struct nf_conn *i, void *data) nf_ct_l3num(i) == l4proto->l3proto; } -int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) -{ - int ret = 0; - struct nf_conntrack_l3proto *old; - - if (proto->l3proto >= NFPROTO_NUMPROTO) - return -EBUSY; - - mutex_lock(&nf_ct_proto_mutex); - old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], - lockdep_is_held(&nf_ct_proto_mutex)); - if (old != &nf_conntrack_l3proto_generic) { - ret = -EBUSY; - goto out_unlock; - } - - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); - -out_unlock: - mutex_unlock(&nf_ct_proto_mutex); - return ret; - -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); - -void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto) -{ - BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); - - mutex_lock(&nf_ct_proto_mutex); - BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], - lockdep_is_held(&nf_ct_proto_mutex) - ) != proto); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], - &nf_conntrack_l3proto_generic); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_rcu(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l3proto, (void*)proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); - static struct 
nf_proto_net *nf_ct_l4proto_net(struct net *net, const struct nf_conntrack_l4proto *l4proto) { @@ -501,8 +329,23 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); -int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) +static void +nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], + unsigned int num_proto) +{ + mutex_lock(&nf_ct_proto_mutex); + while (num_proto-- != 0) + __nf_ct_l4proto_unregister_one(l4proto[num_proto]); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_net(); + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); +} + +static int +nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], + unsigned int num_proto) { int ret = -EINVAL, ver; unsigned int i; @@ -520,7 +363,6 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], } return ret; } -EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); int nf_ct_l4proto_pernet_register(struct net *net, const struct nf_conntrack_l4proto *const l4proto[], @@ -544,20 +386,6 @@ int nf_ct_l4proto_pernet_register(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); -void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) -{ - mutex_lock(&nf_ct_proto_mutex); - while (num_proto-- != 0) - __nf_ct_l4proto_unregister_one(l4proto[num_proto]); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_net(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); - void nf_ct_l4proto_pernet_unregister(struct net *net, const struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) @@ -567,6 +395,563 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, } 
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); +static unsigned int ipv4_helper(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); +} + +static unsigned int ipv4_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int ipv4_conntrack_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); +} + +static unsigned int ipv4_conntrack_local(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ + enum ip_conntrack_info ctinfo; + struct nf_conn *tmpl; + + tmpl = nf_ct_get(skb, &ctinfo); + if (tmpl && nf_ct_is_template(tmpl)) { + /* when skipping ct, clear templates to avoid fooling + * later targets/matches + */ + skb->_nfct = 0; + 
nf_ct_put(tmpl); + } + return NF_ACCEPT; + } + + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); +} + +/* Connection tracking may drop packets, but never alters them, so + * make it the first hook. + */ +static const struct nf_hook_ops ipv4_conntrack_ops[] = { + { + .hook = ipv4_conntrack_in, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_conntrack_local, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_helper, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv4_confirm, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, + { + .hook = ipv4_helper, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv4_confirm, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +/* Fast function for those who don't want to parse /proc (and I don't + * blame them). + * Reversing the socket's dst/src point of view gives us the reply + * mapping. + */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + memset(&tuple, 0, sizeof(tuple)); + + lock_sock(sk); + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = sk->sk_protocol; + release_sock(sk); + + /* We only do TCP and SCTP at the moment: is there a better way? 
*/ + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int)*len < sizeof(struct sockaddr_in)) { + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", + &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST + 1, + .get = getorigdst, + .owner = THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int +ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; + const struct ipv6_pinfo *inet6 = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct sockaddr_in6 sin6; + struct nf_conn *ct; + __be32 flow_label; + int bound_dev_if; + + lock_sock(sk); + tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.in6 = sk->sk_v6_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.dst.protonum = sk->sk_protocol; + bound_dev_if = sk->sk_bound_dev_if; + flow_label = 
inet6->flow_label; + release_sock(sk); + + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) + return -ENOPROTOOPT; + + if (*len < 0 || (unsigned int)*len < sizeof(sin6)) + return -EINVAL; + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (!h) { + pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", + &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + sin6.sin6_family = AF_INET6; + sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; + memcpy(&sin6.sin6_addr, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, + sizeof(sin6.sin6_addr)); + + nf_ct_put(ct); + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); + return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; +} + +static struct nf_sockopt_ops so_getorigdst6 = { + .pf = NFPROTO_IPV6, + .get_optmin = IP6T_SO_ORIGINAL_DST, + .get_optmax = IP6T_SO_ORIGINAL_DST + 1, + .get = ipv6_getorigdst, + .owner = THIS_MODULE, +}; + +static unsigned int ipv6_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + int protoff; + __be16 frag_off; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + goto out; + } + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen 
it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int ipv6_conntrack_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); +} + +static unsigned int ipv6_conntrack_local(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); +} + +static unsigned int ipv6_helper(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + __be16 frag_off; + int protoff; + u8 nexthdr; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +static const struct nf_hook_ops ipv6_conntrack_ops[] = { + { + .hook = ipv6_conntrack_in, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_conntrack_local, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_helper, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv6_confirm, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, + }, + { + .hook = ipv6_helper, + .pf = 
NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv6_confirm, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_LAST - 1, + }, +}; +#endif + +static int nf_ct_netns_do_get(struct net *net, u8 nfproto) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + int err = 0; + + mutex_lock(&nf_ct_proto_mutex); + + switch (nfproto) { + case NFPROTO_IPV4: + cnet->users4++; + if (cnet->users4 > 1) + goto out_unlock; + err = nf_defrag_ipv4_enable(net); + if (err) { + cnet->users4 = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + if (err) + cnet->users4 = 0; + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + cnet->users6++; + if (cnet->users6 > 1) + goto out_unlock; + err = nf_defrag_ipv6_enable(net); + if (err < 0) { + cnet->users6 = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + if (err) + cnet->users6 = 0; + break; +#endif + default: + err = -EPROTO; + break; + } + out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return err; +} + +static void nf_ct_netns_do_put(struct net *net, u8 nfproto) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + mutex_lock(&nf_ct_proto_mutex); + switch (nfproto) { + case NFPROTO_IPV4: + if (cnet->users4 && (--cnet->users4 == 0)) + nf_unregister_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + if (cnet->users6 && (--cnet->users6 == 0)) + nf_unregister_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + break; +#endif + } + + mutex_unlock(&nf_ct_proto_mutex); +} + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + if (nfproto == NFPROTO_INET) { + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = 
nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_netns_do_get(net, nfproto); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_netns_put(net, NFPROTO_IPV4); +err1: + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_netns_get); + +void nf_ct_netns_put(struct net *net, uint8_t nfproto) +{ + if (nfproto == NFPROTO_INET) { + nf_ct_netns_do_put(net, NFPROTO_IPV4); + nf_ct_netns_do_put(net, NFPROTO_IPV6); + } else { + nf_ct_netns_do_put(net, nfproto); + } +} +EXPORT_SYMBOL_GPL(nf_ct_netns_put); + +static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { + &nf_conntrack_l4proto_tcp4, + &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_icmp, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite4, +#endif +#if IS_ENABLED(CONFIG_IPV6) + &nf_conntrack_l4proto_tcp6, + &nf_conntrack_l4proto_udp6, + &nf_conntrack_l4proto_icmpv6, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite6, +#endif +#endif /* CONFIG_IPV6 */ +}; + +int nf_conntrack_proto_init(void) +{ + int ret = 0; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) + return ret; + +#if IS_ENABLED(CONFIG_IPV6) + ret = nf_register_sockopt(&so_getorigdst6); + if (ret < 0) + goto cleanup_sockopt; +#endif + ret = nf_ct_l4proto_register(builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + if (ret < 0) + goto cleanup_sockopt2; + + return ret; +cleanup_sockopt2: + nf_unregister_sockopt(&so_getorigdst); +#if IS_ENABLED(CONFIG_IPV6) +cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst6); +#endif + return ret; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + + nf_ct_l4proto_unregister(builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + 
nf_unregister_sockopt(&so_getorigdst); +#if IS_ENABLED(CONFIG_IPV6) + nf_unregister_sockopt(&so_getorigdst6); +#endif + + /* free l3proto protocol tables */ + for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) + kfree(nf_ct_protos[i]); +} + int nf_conntrack_proto_pernet_init(struct net *net) { int err; @@ -583,6 +968,14 @@ int nf_conntrack_proto_pernet_init(struct net *net) if (err < 0) return err; + err = nf_ct_l4proto_pernet_register(net, builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + if (err < 0) { + nf_ct_l4proto_unregister_sysctl(net, pn, + &nf_conntrack_l4proto_generic); + return err; + } + pn->users++; return 0; } @@ -592,25 +985,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net) struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); + nf_ct_l4proto_pernet_unregister(net, builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); pn->users--; nf_ct_l4proto_unregister_sysctl(net, pn, &nf_conntrack_l4proto_generic); } -int nf_conntrack_proto_init(void) -{ - unsigned int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) - rcu_assign_pointer(nf_ct_l3protos[i], - &nf_conntrack_l3proto_generic); - return 0; -} -void nf_conntrack_proto_fini(void) -{ - unsigned int i; - /* free l3proto protocol tables */ - for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) - kfree(nf_ct_protos[i]); -} +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +MODULE_ALIAS("ip_conntrack"); +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 000000000000..036670b38282 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,388 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * (C) 2006-2010 Patrick McHardy + * + * This program is free software; you can redistribute it and/or 
modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned int nf_ct_icmp_timeout = 30*HZ; + +static inline struct nf_icmp_net *icmp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp; +} + +static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) +{ + const struct icmphdr *hp; + struct icmphdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return true; +} + +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 +}; + +static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + if (orig->dst.u.icmp.type >= sizeof(invmap) || + !invmap[orig->dst.u.icmp.type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +static unsigned int *icmp_get_timeouts(struct net *net) +{ + return &icmp_pernet(net)->timeout; +} + +/* Returns verdict for packet, or -1 for invalid. 
*/ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo) +{ + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmp_get_timeouts(nf_ct_net(ct)); + + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + static const u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 + }; + + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || + !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + pr_debug("icmp: can't create new conn with type %u\n", + ct->tuplehash[0].tuple.dst.u.icmp.type); + nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); + return false; + } + return true; +} + +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + const struct nf_conntrack_l4proto *innerproto; + const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_zone *zone; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_zone tmp; + + WARN_ON(skb_nfct(skb)); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); + + /* Are they talking about one of our connections? 
*/ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + ip_hdrlen(skb) + + sizeof(struct icmphdr), + PF_INET, net, &origtuple)) { + pr_debug("icmp_error_message: failed to get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_thresh */ + innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, zone, &innertuple); + if (!h) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo += IP_CT_IS_REPLY; + + /* Update skb to refer to this connection */ + nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + return NF_ACCEPT; +} + +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + u8 pf, unsigned int hooknum) +{ + const struct icmphdr *icmph; + struct icmphdr _ih; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); + if (icmph == NULL) { + icmp_error_log(skb, net, pf, "short packet"); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip_checksum(skb, hooknum, dataoff, 0)) { + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); + return -NF_ACCEPT; + } + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. 
+ */ + if (icmph->type > NR_ICMP_TYPES) { + icmp_error_log(skb, net, pf, "invalid icmp type"); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(net, tmpl, skb, hooknum); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include +#include + +static int icmp_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, +}; + +static int icmp_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE] || + !tb[CTA_PROTO_ICMP_CODE] || + !tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + + return 0; +} + +static unsigned int icmp_nlattr_tuple_size(void) +{ + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include +#include + +static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_icmp_net 
*in = icmp_pernet(net); + + if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { + if (!timeout) + timeout = &in->timeout; + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; + } else if (timeout) { + /* Set default ICMP timeout. */ + *timeout = in->timeout; + } + return 0; +} + +static int +icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { + [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table icmp_sysctl_table[] = { + { + .procname = "nf_conntrack_icmp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmp_sysctl_table, + sizeof(icmp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmp_init_net(struct net *net, u_int16_t proto) +{ + struct nf_icmp_net *in = icmp_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmp_timeout; + + return icmp_kmemdup_sysctl_table(pn, in); +} + +static struct nf_proto_net *icmp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp.pn; +} + +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_ICMP, + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = 
icmp_tuple_to_nlattr, + .nlattr_tuple_size = icmp_nlattr_tuple_size, + .nlattr_to_tuple = icmp_nlattr_to_tuple, + .nla_policy = icmp_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = icmp_timeout_nlattr_to_obj, + .obj_to_nlattr = icmp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = icmp_init_net, + .get_net_proto = icmp_get_net_proto, +}; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 000000000000..bed07b998a10 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,387 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; + +static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6; +} + +static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct net *net, + struct nf_conntrack_tuple *tuple) +{ + const struct icmp6hdr *hp; + struct icmp6hdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return true; +} + +/* Add 1; spaces filled with 0. 
*/ +static const u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 +}; + +static const u_int8_t noct_valid_new[] = { + [ICMPV6_MGM_QUERY - 130] = 1, + [ICMPV6_MGM_REPORT - 130] = 1, + [ICMPV6_MGM_REDUCTION - 130] = 1, + [NDISC_ROUTER_SOLICITATION - 130] = 1, + [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, + [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, + [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, + [ICMPV6_MLD2_REPORT - 130] = 1 +}; + +static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +static unsigned int *icmpv6_get_timeouts(struct net *net) +{ + return &icmpv6_pernet(net)->timeout; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo) +{ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmpv6_get_timeouts(nf_ct_net(ct)); + + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
*/ +static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + static const u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; + + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { + /* Can't create a new ICMPv6 `conn' with this. */ + pr_debug("icmpv6: can't create new conn with type %u\n", + type + 128); + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + return false; + } + return true; +} + +static int +icmpv6_error_message(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int icmp6off) +{ + struct nf_conntrack_tuple intuple, origtuple; + const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_l4proto *inproto; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_zone tmp; + + WARN_ON(skb_nfct(skb)); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct icmp6hdr), + PF_INET6, net, &origtuple)) { + pr_debug("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_thresh */ + inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
*/ + if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { + pr_debug("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), + &intuple); + if (!h) { + pr_debug("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + return NF_ACCEPT; +} + +static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); +} + +static int +icmpv6_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + u8 pf, unsigned int hooknum) +{ + const struct icmp6hdr *icmp6h; + struct icmp6hdr _ih; + int type; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + icmpv6_error_log(skb, net, pf, "short packet"); + return -NF_ACCEPT; + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { + icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); + return -NF_ACCEPT; + } + + type = icmp6h->icmp6_type - 130; + if (type >= 0 && type < sizeof(noct_valid_new) && + noct_valid_new[type]) { + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + return NF_ACCEPT; + } + + /* is not error message ? 
*/ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(net, tmpl, skb, dataoff); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include +#include +static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, +}; + +static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMPV6_TYPE] || + !tb[CTA_PROTO_ICMPV6_CODE] || + !tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + + return 0; +} + +static unsigned int icmpv6_nlattr_tuple_size(void) +{ + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + + return size; +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include +#include + +static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_icmp_net *in = icmpv6_pernet(net); + + if (!timeout) + timeout = icmpv6_get_timeouts(net); + if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; + } else { + /* Set default ICMPv6 
timeout. */ + *timeout = in->timeout; + } + return 0; +} + +static int +icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { + [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table icmpv6_sysctl_table[] = { + { + .procname = "nf_conntrack_icmpv6_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmpv6_sysctl_table, + sizeof(icmpv6_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmpv6_init_net(struct net *net, u_int16_t proto) +{ + struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmpv6_timeout; + + return icmpv6_kmemdup_sysctl_table(pn, in); +} + +static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6.pn; +} + +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_ICMPV6, + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .packet = icmpv6_packet, + .new = icmpv6_new, + .error = icmpv6_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = icmpv6_tuple_to_nlattr, + .nlattr_tuple_size = icmpv6_nlattr_tuple_size, + .nlattr_to_tuple = icmpv6_nlattr_to_tuple, + .nla_policy = icmpv6_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + 
.ctnl_timeout = { + .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, + .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmpv6_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = icmpv6_init_net, + .get_net_proto = icmpv6_get_net_proto, +}; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 47b80fd0d2c3..13279f683da9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,12 +1,4 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2005-2012 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - +// SPDX-License-Identifier: GPL-2.0 #include #include #include @@ -32,7 +24,7 @@ #include #include -MODULE_LICENSE("GPL"); +unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void @@ -674,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, + .id = &nf_conntrack_net_id, + .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 86df2a1666fd..6366f0c0b8c1 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -743,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister); int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) { - int err; - - err = nf_ct_l3proto_try_module_get(l3proto->l3proto); - if (err < 0) - return err; - mutex_lock(&nf_nat_proto_mutex); 
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], &nf_nat_l4proto_tcp); @@ -781,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto) synchronize_rcu(); nf_nat_l3proto_clean(l3proto->l3proto); - nf_ct_l3proto_module_put(l3proto->l3proto); } EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); -- cgit v1.2.3 From 5d400a4933e867dbc3706023c8ed55d364c233ed Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Tue, 10 Jul 2018 16:01:28 +0200 Subject: netfilter: Kconfig: Change select IPv6 dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... from IPV6 to NF_TABLES_IPV6 and IP6_NF_IPTABLES. In some cases module selects depend on IPV6, but this means that they select another module even if eg. NF_TABLES_IPV6 is not set in which case the selected module is useless due to the lack of IPv6 nf_tables functionality. The same applies for IP6_NF_IPTABLES and iptables. Joint work with: Arnd Bermann Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 6 +++--- net/netfilter/nft_socket.c | 4 ++-- net/netfilter/xt_TEE.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9eab519b403a..e0ab50c58dc4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -628,7 +628,7 @@ config NFT_SOCKET tristate "Netfilter nf_tables socket match support" depends on IPV6 || IPV6=n select NF_SOCKET_IPV4 - select NF_SOCKET_IPV6 if IPV6 + select NF_SOCKET_IPV6 if NF_TABLES_IPV6 help This option allows matching for the presence or absence of a corresponding socket and its attributes. 
@@ -894,7 +894,7 @@ config NETFILTER_XT_TARGET_LOG tristate "LOG target support" select NF_LOG_COMMON select NF_LOG_IPV4 - select NF_LOG_IPV6 if IPV6 + select NF_LOG_IPV6 if IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -986,7 +986,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IPV6 + select NF_DUP_IPV6 if IP6_NF_IPTABLES ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 998c2b546f6d..e43c1939d25f 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -31,7 +31,7 @@ static void nft_socket_eval(const struct nft_expr *expr, case NFPROTO_IPV4: sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); break; -#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); break; @@ -77,7 +77,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, switch(ctx->family) { case NFPROTO_IPV4: -#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: #endif case NFPROTO_INET: diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 475957cfcf50..0d0d68c989df 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -141,7 +141,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision 
= 1, -- cgit v1.2.3 From bfd4271169176766343026bf324337e529d81fa4 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Mon, 16 Jul 2018 15:00:09 +0200 Subject: net/rds: void function cannot return -1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b6fb0df12db6 ("RDS/IB: Make ib_recv_refill return void") did not change the comment accordingly. Fixes: b6fb0df12db6 ("RDS/IB: Make ib_recv_refill return void") Signed-off-by: Håkon Bugge Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index b4e421aa9727..e5ce93419263 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -376,8 +376,6 @@ static void release_refill(struct rds_connection *conn) * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into * sockets. - * - * -1 is returned if posting fails due to temporary resource exhaustion. */ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { -- cgit v1.2.3 From fa52531eb4409a1fc0cc11ac37e249088d3561c7 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Mon, 16 Jul 2018 15:06:39 +0200 Subject: net/rds: Remove unnecessary variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Håkon Bugge Signed-off-by: David S.
Miller --- net/rds/ib_recv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e5ce93419263..1eaf2550a9f8 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1023,7 +1023,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; - int ret = 0; rdsdebug("conn %p\n", conn); if (rds_conn_up(conn)) { @@ -1032,7 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp) rds_ib_stats_inc(s_ib_rx_refill_from_thread); } - return ret; + return 0; } int rds_ib_recv_init(void) -- cgit v1.2.3 From 2a406e8ac7c3e7e96b94d6c0765d5a4641970446 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:39 -0700 Subject: netfilter: nf_conncount: Early exit for garbage collection This patch is originally from Florian Westphal. We use an extra function with early exit for garbage collection. It is not necessary to traverse the full list for every node since it is enough to zap a couple of entries for garbage collection. 
Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 510039862aa9..81c02185b2e8 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -189,6 +189,42 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, } EXPORT_SYMBOL_GPL(nf_conncount_lookup); +static void nf_conncount_gc_list(struct net *net, + struct nf_conncount_rb *rbconn) +{ + const struct nf_conntrack_tuple_hash *found; + struct nf_conncount_tuple *conn; + struct hlist_node *n; + struct nf_conn *found_ct; + unsigned int collected = 0; + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) { + found = find_or_evict(net, conn); + if (IS_ERR(found)) { + if (PTR_ERR(found) == -ENOENT) + collected++; + continue; + } + + found_ct = nf_ct_tuplehash_to_ctrack(found); + if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + hlist_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); + collected++; + continue; + } + + nf_ct_put(found_ct); + if (collected > CONNCOUNT_GC_MAX_NODES) + return; + } +} + static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -251,8 +287,7 @@ count_tree(struct net *net, struct rb_root *root, if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) continue; - /* only used for GC on hhead, retval and 'addit' ignored */ - nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit); + nf_conncount_gc_list(net, rbconn); if (hlist_empty(&rbconn->hhead)) gc_nodes[gc_count++] = rbconn; } -- cgit v1.2.3 From cb2b36f5a97df76f547fcc4ab444a02522fb6c96 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:40 -0700 Subject: 
netfilter: nf_conncount: Switch to plain list Original patch is from Florian Westphal. This patch switches from hlist to plain list to store the list of connections with the same filtering key in nf_conncount. With the plain list, we can insert new connections at the tail, so over time the beginning of list holds long-running connections and those are expired, while the newly creates ones are at the end. Later on, we could probably move checked ones to the end of the list, so the next run has higher chance to reclaim stale entries in the front. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 15 ++++-- net/netfilter/nf_conncount.c | 83 ++++++++++++++++++------------ net/netfilter/nft_connlimit.c | 24 ++++----- 3 files changed, 75 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 3a188a0923a3..e4884e0e4f69 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -1,8 +1,15 @@ #ifndef _NF_CONNTRACK_COUNT_H #define _NF_CONNTRACK_COUNT_H +#include + struct nf_conncount_data; +struct nf_conncount_list { + struct list_head head; /* connections with the same filtering key */ + unsigned int count; /* length of list */ +}; + struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, unsigned int keylen); void nf_conncount_destroy(struct net *net, unsigned int family, @@ -14,15 +21,17 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, +unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, bool *addit); -bool nf_conncount_add(struct hlist_head *head, 
+void nf_conncount_list_init(struct nf_conncount_list *list); + +bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_cache_free(struct hlist_head *hhead); +void nf_conncount_cache_free(struct nf_conncount_list *list); #endif diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 81c02185b2e8..81b060adefef 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -44,7 +44,7 @@ /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { - struct hlist_node node; + struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int cpu; @@ -53,7 +53,7 @@ struct nf_conncount_tuple { struct nf_conncount_rb { struct rb_node node; - struct hlist_head hhead; /* connections/hosts in same subnet */ + struct nf_conncount_list list; u32 key[MAX_KEYLEN]; }; @@ -82,12 +82,15 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -bool nf_conncount_add(struct hlist_head *head, +bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; + if (WARN_ON_ONCE(list->count > INT_MAX)) + return false; + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) return false; @@ -95,13 +98,26 @@ bool nf_conncount_add(struct hlist_head *head, conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; - hlist_add_head(&conn->node, head); + list_add_tail(&conn->node, &list->head); + list->count++; return true; } EXPORT_SYMBOL_GPL(nf_conncount_add); +static void conn_free(struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) +{ + if (WARN_ON_ONCE(list->count == 0)) + return; + + list->count--; + list_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); +} + 
static const struct nf_conntrack_tuple_hash * -find_or_evict(struct net *net, struct nf_conncount_tuple *conn) +find_or_evict(struct net *net, struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -121,30 +137,29 @@ find_or_evict(struct net *net, struct nf_conncount_tuple *conn) */ age = a - b; if (conn->cpu == cpu || age >= 2) { - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); return ERR_PTR(-ENOENT); } return ERR_PTR(-EAGAIN); } -unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, +unsigned int nf_conncount_lookup(struct net *net, + struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, bool *addit) { const struct nf_conntrack_tuple_hash *found; - struct nf_conncount_tuple *conn; + struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; - struct hlist_node *n; unsigned int length = 0; *addit = tuple ? 
true : false; /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, head, node) { - found = find_or_evict(net, conn); + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -157,6 +172,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, nf_ct_zone_id(zone, zone->dir)) *addit = false; } + continue; } @@ -176,8 +192,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, * closed already -> ditch it */ nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); continue; } @@ -189,17 +204,23 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, } EXPORT_SYMBOL_GPL(nf_conncount_lookup); +void nf_conncount_list_init(struct nf_conncount_list *list) +{ + INIT_LIST_HEAD(&list->head); + list->count = 1; +} +EXPORT_SYMBOL_GPL(nf_conncount_list_init); + static void nf_conncount_gc_list(struct net *net, - struct nf_conncount_rb *rbconn) + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; - struct nf_conncount_tuple *conn; - struct hlist_node *n; + struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; - hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) { - found = find_or_evict(net, conn); + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) collected++; @@ -213,8 +234,7 @@ static void nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); collected++; continue; } @@ -271,14 +291,14 @@ count_tree(struct net *net, struct rb_root *root, /* same source network -> 
be counted! */ unsigned int count; - count = nf_conncount_lookup(net, &rbconn->hhead, tuple, + count = nf_conncount_lookup(net, &rbconn->list, tuple, zone, &addit); tree_nodes_free(root, gc_nodes, gc_count); if (!addit) return count; - if (!nf_conncount_add(&rbconn->hhead, tuple, zone)) + if (!nf_conncount_add(&rbconn->list, tuple, zone)) return 0; /* hotdrop */ return count + 1; @@ -287,8 +307,8 @@ count_tree(struct net *net, struct rb_root *root, if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) continue; - nf_conncount_gc_list(net, rbconn); - if (hlist_empty(&rbconn->hhead)) + nf_conncount_gc_list(net, &rbconn->list); + if (list_empty(&rbconn->list.head)) gc_nodes[gc_count++] = rbconn; } @@ -322,8 +342,8 @@ count_tree(struct net *net, struct rb_root *root, conn->zone = *zone; memcpy(rbconn->key, key, sizeof(u32) * keylen); - INIT_HLIST_HEAD(&rbconn->hhead); - hlist_add_head(&conn->node, &rbconn->hhead); + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); rb_link_node(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); @@ -388,12 +408,11 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family } EXPORT_SYMBOL_GPL(nf_conncount_init); -void nf_conncount_cache_free(struct hlist_head *hhead) +void nf_conncount_cache_free(struct nf_conncount_list *list) { - struct nf_conncount_tuple *conn; - struct hlist_node *n; + struct nf_conncount_tuple *conn, *conn_n; - hlist_for_each_entry_safe(conn, n, hhead, node) + list_for_each_entry_safe(conn, conn_n, &list->head, node) kmem_cache_free(conncount_conn_cachep, conn); } EXPORT_SYMBOL_GPL(nf_conncount_cache_free); @@ -408,7 +427,7 @@ static void destroy_tree(struct rb_root *r) rb_erase(node, r); - nf_conncount_cache_free(&rbconn->hhead); + nf_conncount_cache_free(&rbconn->list); kmem_cache_free(conncount_rb_cachep, rbconn); } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index a832c59f0a9c..4f0491a36a1d 100644 --- 
a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,10 +14,10 @@ #include struct nft_connlimit { - spinlock_t lock; - struct hlist_head hhead; - u32 limit; - bool invert; + spinlock_t lock; + struct nf_conncount_list list; + u32 limit; + bool invert; }; static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, @@ -46,13 +46,13 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, } spin_lock_bh(&priv->lock); - count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone, + count = nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, &addit); if (!addit) goto out; - if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) { + if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; spin_unlock_bh(&priv->lock); return; @@ -88,7 +88,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, } spin_lock_init(&priv->lock); - INIT_HLIST_HEAD(&priv->hhead); + nf_conncount_list_init(&priv->list); priv->limit = limit; priv->invert = invert; @@ -99,7 +99,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, struct nft_connlimit *priv) { nf_ct_netns_put(ctx->net, ctx->family); - nf_conncount_cache_free(&priv->hhead); + nf_conncount_cache_free(&priv->list); } static int nft_connlimit_do_dump(struct sk_buff *skb, @@ -213,7 +213,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_src = nft_expr_priv(src); spin_lock_init(&priv_dst->lock); - INIT_HLIST_HEAD(&priv_dst->hhead); + nf_conncount_list_init(&priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -225,7 +225,7 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, { struct nft_connlimit *priv = nft_expr_priv(expr); - nf_conncount_cache_free(&priv->hhead); + nf_conncount_cache_free(&priv->list); } static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) @@ -234,9 +234,9 @@ 
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) bool addit, ret; spin_lock_bh(&priv->lock); - nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit); + nf_conncount_lookup(net, &priv->list, NULL, &nf_ct_zone_dflt, &addit); - ret = hlist_empty(&priv->hhead); + ret = list_empty(&priv->list.head); spin_unlock_bh(&priv->lock); return ret; -- cgit v1.2.3 From 976afca1ceba53df6f4a543014e15d1c7a962571 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:41 -0700 Subject: netfilter: nf_conncount: Early exit in nf_conncount_lookup() and cleanup This patch is originally from Florian Westphal. This patch does the following three tasks. It applies the same early exit technique for nf_conncount_lookup(). Since now we keep the number of connections in 'struct nf_conncount_list', we no longer need to return the count in nf_conncount_lookup(). Moreover, we expose the garbage collection function nf_conncount_gc_list() for nft_connlimit. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 11 +++++---- net/netfilter/nf_conncount.c | 38 +++++++++++++++++------------- net/netfilter/nft_connlimit.c | 9 +++---- 3 files changed, 33 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index e4884e0e4f69..dbec17f674b7 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -21,10 +21,10 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit); +void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, + const struct 
nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit); void nf_conncount_list_init(struct nf_conncount_list *list); @@ -32,6 +32,9 @@ bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); +void nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list); + void nf_conncount_cache_free(struct nf_conncount_list *list); #endif diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 81b060adefef..7dfd9d5e6a3e 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -144,26 +144,29 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } -unsigned int nf_conncount_lookup(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +void nf_conncount_lookup(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; - unsigned int length = 0; + unsigned int collect = 0; + /* best effort only */ *addit = tuple ? 
true : false; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { + if (collect > CONNCOUNT_GC_MAX_NODES) + break; + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - length++; if (!tuple) continue; @@ -171,8 +174,8 @@ unsigned int nf_conncount_lookup(struct net *net, nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) *addit = false; - } - + } else if (PTR_ERR(found) == -ENOENT) + collect++; continue; } @@ -181,9 +184,10 @@ unsigned int nf_conncount_lookup(struct net *net, if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* - * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". + * + * Attempt to avoid a re-add in this case. */ *addit = false; } else if (already_closed(found_ct)) { @@ -193,14 +197,12 @@ unsigned int nf_conncount_lookup(struct net *net, */ nf_ct_put(found_ct); conn_free(list, conn); + collect++; continue; } nf_ct_put(found_ct); - length++; } - - return length; } EXPORT_SYMBOL_GPL(nf_conncount_lookup); @@ -211,8 +213,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list) } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -static void nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +void nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -244,6 +246,7 @@ static void nf_conncount_gc_list(struct net *net, return; } } +EXPORT_SYMBOL_GPL(nf_conncount_gc_list); static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], @@ -291,8 +294,9 @@ count_tree(struct net *net, struct rb_root *root, /* same source network -> be counted! 
*/ unsigned int count; - count = nf_conncount_lookup(net, &rbconn->list, tuple, - zone, &addit); + nf_conncount_lookup(net, &rbconn->list, tuple, zone, + &addit); + count = rbconn->list.count; tree_nodes_free(root, gc_nodes, gc_count); if (!addit) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 4f0491a36a1d..37c52ae06741 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -46,8 +46,9 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, } spin_lock_bh(&priv->lock); - count = nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, - &addit); + nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, + &addit); + count = priv->list.count; if (!addit) goto out; @@ -231,10 +232,10 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool addit, ret; + bool ret; spin_lock_bh(&priv->lock); - nf_conncount_lookup(net, &priv->list, NULL, &nf_ct_zone_dflt, &addit); + nf_conncount_gc_list(net, &priv->list); ret = list_empty(&priv->list.head); spin_unlock_bh(&priv->lock); -- cgit v1.2.3 From 2ba39118c10ae3a7d3411c073485bba9576684cd Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:42 -0700 Subject: netfilter: nf_conncount: Move locking into count_tree() This patch is originally from Florian Westphal. This is a preparation patch to allow lockless traversal of the tree via RCU. 
Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 52 +++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 7dfd9d5e6a3e..d1a4fd1c0f81 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -262,18 +262,26 @@ static void tree_nodes_free(struct rb_root *root, } static unsigned int -count_tree(struct net *net, struct rb_root *root, - const u32 *key, u8 keylen, +count_tree(struct net *net, + struct nf_conncount_data *data, + const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + struct rb_root *root; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; - unsigned int gc_count; + unsigned int gc_count, hash; bool no_gc = false; + unsigned int count = 0; + u8 keylen = data->keylen; + hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; + root = &data->root[hash]; + + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); restart: gc_count = 0; parent = NULL; @@ -292,20 +300,20 @@ count_tree(struct net *net, struct rb_root *root, rbnode = &((*rbnode)->rb_right); } else { /* same source network -> be counted! 
*/ - unsigned int count; - nf_conncount_lookup(net, &rbconn->list, tuple, zone, &addit); count = rbconn->list.count; tree_nodes_free(root, gc_nodes, gc_count); if (!addit) - return count; + goto out_unlock; if (!nf_conncount_add(&rbconn->list, tuple, zone)) - return 0; /* hotdrop */ + count = 0; /* hotdrop */ + goto out_unlock; - return count + 1; + count++; + goto out_unlock; } if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) @@ -328,18 +336,18 @@ count_tree(struct net *net, struct rb_root *root, goto restart; } + count = 0; if (!tuple) - return 0; - + goto out_unlock; /* no match, need to insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) - return 0; + goto out_unlock; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) { kmem_cache_free(conncount_rb_cachep, rbconn); - return 0; + goto out_unlock; } conn->tuple = *tuple; @@ -348,10 +356,13 @@ count_tree(struct net *net, struct rb_root *root, nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); + count = 1; rb_link_node(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); - return 1; +out_unlock: + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + return count; } /* Count and return number of conntrack entries in 'net' with particular 'key'. 
@@ -363,20 +374,7 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - struct rb_root *root; - int count; - u32 hash; - - hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; - root = &data->root[hash]; - - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - - count = count_tree(net, root, key, data->keylen, tuple, zone); - - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - - return count; + return count_tree(net, data, key, tuple, zone); } EXPORT_SYMBOL_GPL(nf_conncount_count); -- cgit v1.2.3 From 34848d5c896ea1ab4e3c441b9c4fed39928ccbaf Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:43 -0700 Subject: netfilter: nf_conncount: Split insert and traversal This patch is originally from Florian Westphal. When we have a very coarse grouping, e.g. by large subnets, zone id, etc, it's likely that we do not need to do tree rotation because we'll find a node where we can attach new entry. Based on this observation, we split tree traversal and insertion. Later on, we can make traversal lockless (tree protected by RCU), and add extra lock in the individual nodes to protect list insertion/deletion, thereby allowing parallel insert/delete in different tree nodes. 
Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 87 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index d1a4fd1c0f81..3f14806b7271 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -261,6 +261,71 @@ static void tree_nodes_free(struct rb_root *root, } } +static unsigned int +insert_tree(struct rb_root *root, + unsigned int hash, + const u32 *key, + u8 keylen, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + struct rb_node **rbnode, *parent; + struct nf_conncount_rb *rbconn; + struct nf_conncount_tuple *conn; + unsigned int count = 0; + + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); + + parent = *rbnode; + diff = key_diff(key, rbconn->key, keylen); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* unlikely: other cpu added node already */ + if (!nf_conncount_add(&rbconn->list, tuple, zone)) { + count = 0; /* hotdrop */ + goto out_unlock; + } + + count = rbconn->list.count; + goto out_unlock; + } + } + + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + goto out_unlock; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + goto out_unlock; + } + + conn->tuple = *tuple; + conn->zone = *zone; + memcpy(rbconn->key, key, sizeof(u32) * keylen); + + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; + + rb_link_node(&rbconn->node, parent, rbnode); + 
rb_insert_color(&rbconn->node, root); +out_unlock: + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + return count; +} + static unsigned int count_tree(struct net *net, struct nf_conncount_data *data, @@ -272,7 +337,6 @@ count_tree(struct net *net, struct rb_root *root; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; - struct nf_conncount_tuple *conn; unsigned int gc_count, hash; bool no_gc = false; unsigned int count = 0; @@ -339,27 +403,10 @@ count_tree(struct net *net, count = 0; if (!tuple) goto out_unlock; - /* no match, need to insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } - - conn->tuple = *tuple; - conn->zone = *zone; - memcpy(rbconn->key, key, sizeof(u32) * keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + return insert_tree(root, hash, key, keylen, tuple, zone); - rb_link_node(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); out_unlock: spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); return count; -- cgit v1.2.3 From 5c789e131cbb997a528451564ea4613e812fc718 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:44 -0700 Subject: netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search This patch is originally from Florian Westphal. This patch does the following 3 main tasks. 1) Add list lock to 'struct nf_conncount_list' so that we can alter the lists containing the individual connections without holding the main tree lock. It would be useful when we only need to add/remove to/from a list without allocate/remove a node in the tree. 
With this change, we update nft_connlimit accordingly since we no longer need to maintain a list lock in nft_connlimit now. 2) Use RCU for the initial tree search to improve tree look up performance. 3) Add a garbage collection worker. This worker is scheduled when there are excessive tree nodes that need to be recycled. Moreover, the rbnode reclaim logic is moved from search tree to insert tree to avoid a race condition. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 +- net/netfilter/nf_conncount.c | 253 +++++++++++++++++++++-------- net/netfilter/nft_connlimit.c | 17 +- 3 files changed, 196 insertions(+), 91 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index dbec17f674b7..4b2b2baf8ab4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -5,9 +5,17 @@ struct nf_conncount_data; +enum nf_conncount_list_add { + NF_CONNCOUNT_ADDED, /* list add was ok */ + NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ + NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ +}; + struct nf_conncount_list { + spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ + bool dead; }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, @@ -28,11 +36,12 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, void nf_conncount_list_init(struct nf_conncount_list *list); -bool nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +enum nf_conncount_list_add +nf_conncount_add(struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); -void nf_conncount_gc_list(struct net *net, +bool
nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); void nf_conncount_cache_free(struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 3f14806b7271..02ca7df793f5 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -49,12 +49,14 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; + struct rcu_head rcu_head; }; struct nf_conncount_rb { struct rb_node node; struct nf_conncount_list list; u32 key[MAX_KEYLEN]; + struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; @@ -62,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i struct nf_conncount_data { unsigned int keylen; struct rb_root root[CONNCOUNT_SLOTS]; + struct net *net; + struct work_struct gc_work; + unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; + unsigned int gc_tree; }; static u_int32_t conncount_rnd __read_mostly; @@ -82,42 +88,70 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -bool nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +enum nf_conncount_list_add +nf_conncount_add(struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; if (WARN_ON_ONCE(list->count > INT_MAX)) - return false; + return NF_CONNCOUNT_ERR; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) - return false; + return NF_CONNCOUNT_ERR; + conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; + spin_lock(&list->list_lock); + if (list->dead == true) { + kmem_cache_free(conncount_conn_cachep, conn); + spin_unlock(&list->list_lock); + return NF_CONNCOUNT_SKIP; + } 
list_add_tail(&conn->node, &list->head); list->count++; - return true; + spin_unlock(&list->list_lock); + return NF_CONNCOUNT_ADDED; } EXPORT_SYMBOL_GPL(nf_conncount_add); -static void conn_free(struct nf_conncount_list *list, +static void __conn_free(struct rcu_head *h) +{ + struct nf_conncount_tuple *conn; + + conn = container_of(h, struct nf_conncount_tuple, rcu_head); + kmem_cache_free(conncount_conn_cachep, conn); +} + +static bool conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - if (WARN_ON_ONCE(list->count == 0)) - return; + bool free_entry = false; + + spin_lock(&list->list_lock); + + if (list->count == 0) { + spin_unlock(&list->list_lock); + return free_entry; + } list->count--; - list_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + list_del_rcu(&conn->node); + if (list->count == 0) + free_entry = true; + + spin_unlock(&list->list_lock); + call_rcu(&conn->rcu_head, __conn_free); + return free_entry; } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn) + struct nf_conncount_tuple *conn, bool *free_entry) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -137,7 +171,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, */ age = a - b; if (conn->cpu == cpu || age >= 2) { - conn_free(list, conn); + *free_entry = conn_free(list, conn); return ERR_PTR(-ENOENT); } @@ -154,6 +188,7 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; + bool free_entry = false; /* best effort only */ *addit = tuple ? 
true : false; @@ -163,7 +198,7 @@ void nf_conncount_lookup(struct net *net, if (collect > CONNCOUNT_GC_MAX_NODES) break; - found = find_or_evict(net, list, conn); + found = find_or_evict(net, list, conn, &free_entry); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -208,24 +243,31 @@ EXPORT_SYMBOL_GPL(nf_conncount_lookup); void nf_conncount_list_init(struct nf_conncount_list *list) { + spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 1; + list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -void nf_conncount_gc_list(struct net *net, +/* Return true if the list is empty */ +bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; + bool free_entry = false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn); + found = find_or_evict(net, list, conn, &free_entry); if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) + if (PTR_ERR(found) == -ENOENT) { + if (free_entry) + return true; collected++; + } continue; } @@ -236,18 +278,28 @@ void nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - conn_free(list, conn); + if (conn_free(list, conn)) + return true; collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) - return; + return false; } + return false; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); +static void __tree_nodes_free(struct rcu_head *h) +{ + struct nf_conncount_rb *rbconn; + + rbconn = container_of(h, struct nf_conncount_rb, rcu_head); + kmem_cache_free(conncount_rb_cachep, rbconn); +} + static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -256,23 +308,39 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { 
rbconn = gc_nodes[--gc_count]; - rb_erase(&rbconn->node, root); - kmem_cache_free(conncount_rb_cachep, rbconn); + spin_lock(&rbconn->list.list_lock); + if (rbconn->list.count == 0 && rbconn->list.dead == false) { + rbconn->list.dead = true; + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } + spin_unlock(&rbconn->list.list_lock); } } +static void schedule_gc_worker(struct nf_conncount_data *data, int tree) +{ + set_bit(tree, data->pending_trees); + schedule_work(&data->gc_work); +} + static unsigned int -insert_tree(struct rb_root *root, +insert_tree(struct net *net, + struct nf_conncount_data *data, + struct rb_root *root, unsigned int hash, const u32 *key, u8 keylen, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { + enum nf_conncount_list_add ret; + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; - unsigned int count = 0; + unsigned int count = 0, gc_count = 0; + bool node_found = false; spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); @@ -290,16 +358,44 @@ insert_tree(struct rb_root *root, rbnode = &((*rbnode)->rb_right); } else { /* unlikely: other cpu added node already */ - if (!nf_conncount_add(&rbconn->list, tuple, zone)) { + node_found = true; + ret = nf_conncount_add(&rbconn->list, tuple, zone); + if (ret == NF_CONNCOUNT_ERR) { count = 0; /* hotdrop */ - goto out_unlock; + } else if (ret == NF_CONNCOUNT_ADDED) { + count = rbconn->list.count; + } else { + /* NF_CONNCOUNT_SKIP, rbconn is already + * reclaimed by gc, insert a new tree node + */ + node_found = false; } - - count = rbconn->list.count; - goto out_unlock; + break; } + + if (gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + if (nf_conncount_gc_list(net, &rbconn->list)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_node_free before new 
allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0). + */ + + if (gc_count >= ARRAY_SIZE(gc_nodes)) + schedule_gc_worker(data, hash); } + if (node_found) + goto out_unlock; + /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -333,87 +429,97 @@ count_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + enum nf_conncount_list_add ret; struct rb_root *root; - struct rb_node **rbnode, *parent; + struct rb_node *parent; struct nf_conncount_rb *rbconn; - unsigned int gc_count, hash; - bool no_gc = false; - unsigned int count = 0; + unsigned int hash; u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - restart: - gc_count = 0; - parent = NULL; - rbnode = &(root->rb_node); - while (*rbnode) { + parent = rcu_dereference_raw(root->rb_node); + while (parent) { int diff; bool addit; - rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); + rbconn = rb_entry(parent, struct nf_conncount_rb, node); - parent = *rbnode; diff = key_diff(key, rbconn->key, keylen); if (diff < 0) { - rbnode = &((*rbnode)->rb_left); + parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { - rbnode = &((*rbnode)->rb_right); + parent = rcu_dereference_raw(parent->rb_right); } else { /* same source network -> be counted! 
*/ nf_conncount_lookup(net, &rbconn->list, tuple, zone, &addit); - count = rbconn->list.count; - tree_nodes_free(root, gc_nodes, gc_count); if (!addit) - goto out_unlock; + return rbconn->list.count; + + ret = nf_conncount_add(&rbconn->list, tuple, zone); + if (ret == NF_CONNCOUNT_ERR) { + return 0; /* hotdrop */ + } else if (ret == NF_CONNCOUNT_ADDED) { + return rbconn->list.count; + } else { + /* NF_CONNCOUNT_SKIP, rbconn is already + * reclaimed by gc, insert a new tree node + */ + break; + } + } + } - if (!nf_conncount_add(&rbconn->list, tuple, zone)) - count = 0; /* hotdrop */ - goto out_unlock; + if (!tuple) + return 0; - count++; - goto out_unlock; - } + return insert_tree(net, data, root, hash, key, keylen, tuple, zone); +} - if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) - continue; +static void tree_gc_worker(struct work_struct *work) +{ + struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; + struct rb_root *root; + struct rb_node *node; + unsigned int tree, next_tree, gc_count = 0; + + tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; + root = &data->root[tree]; - nf_conncount_gc_list(net, &rbconn->list); - if (list_empty(&rbconn->list.head)) + rcu_read_lock(); + for (node = rb_first(root); node != NULL; node = rb_next(node)) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + if (nf_conncount_gc_list(data->net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } + rcu_read_unlock(); + + spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count) { - no_gc = true; tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). 
- */ - goto restart; } - count = 0; - if (!tuple) - goto out_unlock; + clear_bit(tree, data->pending_trees); - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - return insert_tree(root, hash, key, keylen, tuple, zone); + next_tree = (tree + 1) % CONNCOUNT_SLOTS; + next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); -out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - return count; + if (next_tree < CONNCOUNT_SLOTS) { + data->gc_tree = next_tree; + schedule_work(work); + } + + spin_unlock_bh(&nf_conncount_locks[tree]); } /* Count and return number of conntrack entries in 'net' with particular 'key'. * If 'tuple' is not null, insert it into the accounting data structure. + * Call with RCU read lock. */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, @@ -452,6 +558,8 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family data->root[i] = RB_ROOT; data->keylen = keylen / sizeof(u32); + data->net = net; + INIT_WORK(&data->gc_work, tree_gc_worker); return data; } @@ -487,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family, { unsigned int i; + cancel_work_sync(&data->gc_work); nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 37c52ae06741..b90d96ba4a12 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,7 +14,6 @@ #include struct nft_connlimit { - spinlock_t lock; struct nf_conncount_list list; u32 limit; bool invert; @@ -45,7 +44,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - spin_lock_bh(&priv->lock); nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, &addit); count = priv->list.count; @@ -53,14 +51,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, if (!addit) goto out; - if 
(!nf_conncount_add(&priv->list, tuple_ptr, zone)) { + if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { regs->verdict.code = NF_DROP; - spin_unlock_bh(&priv->lock); return; } count++; out: - spin_unlock_bh(&priv->lock); if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; @@ -88,7 +84,6 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - spin_lock_init(&priv->lock); nf_conncount_list_init(&priv->list); priv->limit = limit; priv->invert = invert; @@ -213,7 +208,6 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - spin_lock_init(&priv_dst->lock); nf_conncount_list_init(&priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -232,15 +226,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - spin_lock_bh(&priv->lock); - nf_conncount_gc_list(net, &priv->list); - - ret = list_empty(&priv->list.head); - spin_unlock_bh(&priv->lock); - - return ret; + return nf_conncount_gc_list(net, &priv->list); } static struct nft_expr_type nft_connlimit_type; -- cgit v1.2.3 From ed07d9a021df6da53456663a76999189badc432a Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Mon, 2 Jul 2018 16:52:14 +0200 Subject: netfilter: nf_conntrack: resolve clash for matching conntracks This patch enables the clash resolution for NAT (disabled in "590b52e10d41") if clashing conntracks match (i.e. both tuples are equal) and a protocol allows it. The clash might happen for a connections-less protocol (e.g. UDP) when two threads in parallel writes to the same socket and consequent calls to "get_unique_tuple" return the same tuples (incl. reply tuples). 
In this case it is safe to perform the resolution, as the losing CT describes the same mangling as the winning CT, so no modifications to the packet are needed, and the result of rules traversal for the loser's packet stays valid. Signed-off-by: Martynas Pumputis Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5123e91b1982..4ced7c7102b6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -632,6 +632,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +static inline bool +nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) +{ + return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && + nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && + nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && + nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && + net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); +} + /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { @@ -825,19 +837,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, /* This is the conntrack entry already in hashes that won race. 
*/ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); const struct nf_conntrack_l4proto *l4proto; + enum ip_conntrack_info oldinfo; + struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && - ((ct->status & IPS_NAT_DONE_MASK) == 0) && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { - enum ip_conntrack_info oldinfo; - struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); - - nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_conntrack_put(&loser_ct->ct_general); - nf_ct_set(skb, ct, oldinfo); - return NF_ACCEPT; + if (((ct->status & IPS_NAT_DONE_MASK) == 0) || + nf_ct_match(ct, loser_ct)) { + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, oldinfo); + return NF_ACCEPT; + } + nf_ct_put(ct); } NF_CT_STAT_INC(net, drop); return NF_DROP; -- cgit v1.2.3 From ec1b28ca9674def4a158808a6493bdb87b993d81 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:52 +0300 Subject: ipvs: provide just conn to ip_vs_state_name In preparation for followup patches, provide just the cp ptr to ip_vs_state_name. 
Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 2 +- net/netfilter/ipvs/ip_vs_conn.c | 8 ++++---- net/netfilter/ipvs/ip_vs_proto.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a0bec23c6d5e..4d76abcf1c41 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1221,7 +1221,7 @@ struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, struct ip_vs_dest *dest, __u32 fwmark); void ip_vs_conn_expire_now(struct ip_vs_conn *cp); -const char *ip_vs_state_name(__u16 proto, int state); +const char *ip_vs_state_name(const struct ip_vs_conn *cp); void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 99e0aa350dc5..de5a64e42ebd 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1107,7 +1107,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), (cp->timer.expires-jiffies)/HZ, pe_data); else #endif @@ -1118,7 +1118,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), (cp->timer.expires-jiffies)/HZ, pe_data); } return 0; @@ -1169,7 +1169,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); else @@ -1181,7 +1181,7 @@ static int 
ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index ca880a3ad033..85c446621758 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -193,13 +193,13 @@ ip_vs_create_timeout_table(int *table, int size) } -const char * ip_vs_state_name(__u16 proto, int state) +const char *ip_vs_state_name(const struct ip_vs_conn *cp) { - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); + return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; + return pp->state_name(cp->state); } -- cgit v1.2.3 From 275411430f892407b885be1de2548b2e632892c3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:53 +0300 Subject: ipvs: add assured state for conn templates cp->state was not used for templates. Add support for state bits and for the first "assured" bit which indicates that some connection controlled by this template was established or assured by the real server. In a followup patch we will use it to drop templates under SYN attack. 
Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 16 ++++++++++++++++ net/netfilter/ipvs/ip_vs_proto.c | 17 +++++++++++++++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 ++ net/netfilter/ipvs/ip_vs_proto_tcp.c | 2 ++ net/netfilter/ipvs/ip_vs_proto_udp.c | 2 ++ net/netfilter/ipvs/ip_vs_sync.c | 18 ++++++------------ 6 files changed, 43 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4d76abcf1c41..a0d2e0bb9a94 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -335,6 +335,11 @@ enum ip_vs_sctp_states { IP_VS_SCTP_S_LAST }; +/* Connection templates use bits from state */ +#define IP_VS_CTPL_S_NONE 0x0000 +#define IP_VS_CTPL_S_ASSURED 0x0001 +#define IP_VS_CTPL_S_LAST 0x0002 + /* Delta sequence info structure * Each ip_vs_conn has 2 (output AND input seq. changes). * Only used in the VS/NAT. @@ -1289,6 +1294,17 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } +/* Mark our template as assured */ +static inline void +ip_vs_control_assure_ct(struct ip_vs_conn *cp) +{ + struct ip_vs_conn *ct = cp->control; + + if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) && + (ct->flags & IP_VS_CONN_F_TEMPLATE)) + ct->state |= IP_VS_CTPL_S_ASSURED; +} + /* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct netns_ipvs *ipvs); int ip_vs_control_net_init(struct netns_ipvs *ipvs); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 85c446621758..54ee84adf0bd 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -42,6 +42,11 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; +/* States for conn templates: NONE or words separated with ",", max 15 chars */ +static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { + [IP_VS_CTPL_S_NONE] = "NONE", + [IP_VS_CTPL_S_ASSURED] = "ASSURED", +}; /* * register an 
ipvs protocol @@ -195,11 +200,19 @@ ip_vs_create_timeout_table(int *table, int size) const char *ip_vs_state_name(const struct ip_vs_conn *cp) { - struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol); + unsigned int state = cp->state; + struct ip_vs_protocol *pp; + + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (state >= IP_VS_CTPL_S_LAST) + return "ERR!"; + return ip_vs_ctpl_state_name_table[state] ? : "?"; + } + pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; - return pp->state_name(cp->state); + return pp->state_name(state); } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 3250c4a1111e..b0cd7d08f2a7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } + if (next_state == IP_VS_SCTP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 80d10ad12a15..1770fc6ce960 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } + if (new_state == IP_VS_TCP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); } if (likely(pd)) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index e0ef11c3691e..0f53c49025f8 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, } cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; + if (direction == IP_VS_DIR_OUTPUT) + ip_vs_control_assure_ct(cp); } static int 
__udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 001501e25625..d4020c5e831d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer continue; } } else { - /* protocol in templates is not used for state/timeout */ - if (state > 0) { - IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", - state); - state = 0; - } + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", + state); } ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, @@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m goto out; } } else { - /* protocol in templates is not used for state/timeout */ - if (state > 0) { - IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", - state); - state = 0; - } + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", + state); } if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data, pe_data_len, pe_name, pe_name_len)) { -- cgit v1.2.3 From 762c40076684771c0efbce6490ded26086441ce6 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:54 +0300 Subject: ipvs: drop conn templates under attack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before now, connection templates were ignored by the random dropentry procedure. But Michal Koutný suggests that we should add exception for connections under SYN attack. He provided patch that implements it for TCP: IPVS includes protection against filling the ip_vs_conn_tab by dropping 1/32 of feasible entries every second.
The template entries (for persistent services) are never directly deleted by this mechanism but when a picked TCP connection entry is being dropped (1), the respective template entry is dropped too (realized by expiring 60 seconds after the connection entry being dropped). There is another mechanism that removes connection entries when they time out (2), in this case the associated template entry is not deleted. Under SYN flood template entries would accumulate (due to their entry longer timeout). The accumulation takes place also with drop_entry being enabled. Roughly 15% ((31/32)^60) of SYN_RECV connections survive the dropping mechanism (1) and are removed by the timeout mechanism (2)(defaults to 60 seconds for SYN_RECV), thus template entries would still accumulate. The patch ensures that when a connection entry times out, we also remove the template entry from the table. To prevent breaking persistent services (since the connection may time out in already established state) we add a new entry flag to protect templates what spawned at least one established TCP connection. We already added ASSURED flag for the templates in previous patch, so that we can use it now to decide which connection templates should be dropped under attack. But we also have some cases that need special handling. We modify the dropentry procedure as follows: - Linux timers currently use LIFO ordering but we can not rely on this to drop controlling connections. So, set cp->timeout to 0 to indicate that connection was dropped and that on expiration we should try to drop our controlling connections. As result, we can now avoid the ip_vs_conn_expire_now call. - move the cp->n_control check above, so that it avoids restarting the timer for controlling connections when not needed. - drop unassured connection templates here if they are not referred by any connections. 
On connection expiration: if connection was dropped (cp->timeout=0) try to drop our controlling connection except if it is a template in assured state. In ip_vs_conn_flush change order of ip_vs_conn_expire_now calls according to the LIFO timer expiration order. It should work faster for controlling connections with single controlled one. Suggested-by: Michal Koutný Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 59 +++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index de5a64e42ebd..0edc62910ebf 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -825,12 +825,23 @@ static void ip_vs_conn_expire(struct timer_list *t) /* Unlink conn if not referenced anymore */ if (likely(ip_vs_conn_unlink(cp))) { + struct ip_vs_conn *ct = cp->control; + /* delete the timer if it is activated by other users */ del_timer(&cp->timer); /* does anybody control me? */ - if (cp->control) + if (ct) { ip_vs_control_del(cp); + /* Drop CTL or non-assured TPL if not used anymore */ + if (!cp->timeout && !atomic_read(&ct->n_control) && + (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || + !(ct->state & IP_VS_CTPL_S_ASSURED))) { + IP_VS_DBG(4, "drop controlling connection\n"); + ct->timeout = 0; + ip_vs_conn_expire_now(ct); + } + } if ((cp->flags & IP_VS_CONN_F_NFCT) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { @@ -872,6 +883,10 @@ static void ip_vs_conn_expire(struct timer_list *t) /* Modify timer, so that it expires as soon as possible. * Can be called without reference only if under RCU lock. + * We can have such chain of conns linked with ->control: DATA->CTL->TPL + * - DATA (eg. 
FTP) and TPL (persistence) can be present depending on setup + * - cp->timeout=0 indicates all conns from chain should be dropped but + * TPL is not dropped if in assured state */ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { @@ -1197,8 +1212,11 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { #endif -/* - * Randomly drop connection entries before running out of memory +/* Randomly drop connection entries before running out of memory + * Can be used for DATA and CTL conns. For TPL conns there are exceptions: + * - traffic for services in OPS mode increases ct->in_pkts, so it is supported + * - traffic for services not in OPS mode does not increase ct->in_pkts in + * all cases, so it is not supported */ static inline int todrop_entry(struct ip_vs_conn *cp) { @@ -1242,7 +1260,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { int idx; - struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn *cp; rcu_read_lock(); /* @@ -1254,13 +1272,15 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->ipvs != ipvs) continue; + if (atomic_read(&cp->n_control)) + continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { - if (atomic_read(&cp->n_control) || - !ip_vs_conn_ops_mode(cp)) - continue; - else - /* connection template of OPS */ + /* connection template of OPS */ + if (ip_vs_conn_ops_mode(cp)) goto try_drop; + if (!(cp->state & IP_VS_CTPL_S_ASSURED)) + goto drop; + continue; } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { @@ -1294,15 +1314,10 @@ try_drop: continue; } - IP_VS_DBG(4, "del connection\n"); +drop: + IP_VS_DBG(4, "drop connection\n"); + cp->timeout = 0; ip_vs_conn_expire_now(cp); - cp_c = cp->control; - /* cp->control is valid only with reference to cp */ - if (cp_c && __ip_vs_conn_get(cp)) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp_c); - __ip_vs_conn_put(cp); - } } 
cond_resched_rcu(); } @@ -1325,15 +1340,19 @@ flush_again: hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (cp->ipvs != ipvs) continue; - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); + /* As timers are expired in LIFO order, restart + * the timer of controlling connection first, so + * that it is expired after us. + */ cp_c = cp->control; /* cp->control is valid only with reference to cp */ if (cp_c && __ip_vs_conn_get(cp)) { - IP_VS_DBG(4, "del conn template\n"); + IP_VS_DBG(4, "del controlling connection\n"); ip_vs_conn_expire_now(cp_c); __ip_vs_conn_put(cp); } + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); } cond_resched_rcu(); } -- cgit v1.2.3 From 440534d3c56be04abfb26850ee882d19d223557a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 9 Jul 2018 18:06:33 +0800 Subject: netfilter: Remove useless param helper of nf_ct_helper_ext_add The param helper of nf_ct_helper_ext_add is useless now, then remove it now. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 4 +--- net/netfilter/nf_conntrack_core.c | 3 +-- net/netfilter/nf_conntrack_helper.c | 5 ++--- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nft_ct.c | 2 +- net/netfilter/xt_CT.c | 2 +- net/openvswitch/conntrack.c | 2 +- 7 files changed, 8 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 32c2a94a219d..2492120b8097 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -103,9 +103,7 @@ int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, unsigned int); -struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, - gfp_t gfp); +struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t 
gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4ced7c7102b6..d97d7e9a9ee7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1401,8 +1401,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; if (exp->helper) { - help = nf_ct_helper_ext_add(ct, exp->helper, - GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) rcu_assign_pointer(help->helper, exp->helper); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a55a58c706a9..d557a425289d 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -192,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); struct nf_conn_help * -nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, gfp_t gfp) +nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; @@ -262,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } if (help == NULL) { - help = nf_ct_helper_ext_add(ct, helper, flags); + help = nf_ct_helper_ext_add(ct, flags); if (help == NULL) return -ENOMEM; } else { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 40152b9ad772..f981bfa8db72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1947,7 +1947,7 @@ ctnetlink_create_conntrack(struct net *net, } else { struct nf_conn_help *help; - help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help == NULL) { err = -ENOMEM; goto err2; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 1435ffc5f57e..3bc82ee5464d 100644 
--- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -870,7 +870,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (test_bit(IPS_HELPER_BIT, &ct->status)) return; - help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 03b9a50ec93b..7ba454e9e3fa 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, return -ENOENT; } - help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); if (help == NULL) { nf_conntrack_helper_put(helper); return -ENOMEM; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e05bd3e53f0f..3e33c382367f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1303,7 +1303,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return -EINVAL; } - help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL); if (!help) { nf_conntrack_helper_put(helper); return -ENOMEM; -- cgit v1.2.3 From 452238e8d5ffd8b77f92387519513839d4ca7379 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:10 +0200 Subject: netfilter: nf_tables: add and use helper for module autoload module autoload is problematic, it requires dropping the mutex that protects the transaction. Once the mutex has been dropped, another client can start a new transaction before we had a chance to abort current transaction log. This helper makes sure we first zap the transaction log, then drop mutex for module autoload. In case autoload is successful, the caller has to replay the entire message anyway.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 81 +++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3f211e1025c1..5e95e92e547b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -455,8 +455,40 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) return NULL; } +/* + * Loading a module requires dropping mutex that guards the + * transaction. + * We first need to abort any pending transactions as once + * mutex is unlocked a different client could start a new + * transaction. It must not see any 'future generation' + * changes * as these changes will never happen. + */ +#ifdef CONFIG_MODULES +static int __nf_tables_abort(struct net *net); + +static void nft_request_module(struct net *net, const char *fmt, ...) +{ + char module_name[MODULE_NAME_LEN]; + va_list args; + int ret; + + __nf_tables_abort(net); + + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); + if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) + return; + + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("%s", module_name); + nfnl_lock(NFNL_SUBSYS_NFTABLES); +} +#endif + static const struct nft_chain_type * -nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) +nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, + u8 family, bool autoload) { const struct nft_chain_type *type; @@ -465,10 +497,8 @@ nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) return type; #ifdef CONFIG_MODULES if (autoload) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-chain-%u-%.*s", 
family, + nla_len(nla), (const char *)nla_data(nla)); type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return ERR_PTR(-EAGAIN); @@ -1412,7 +1442,7 @@ static int nft_chain_parse_hook(struct net *net, type = chain_type[family][NFT_CHAIN_T_DEFAULT]; if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], + type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, create); if (IS_ERR(type)) return PTR_ERR(type); @@ -1875,7 +1905,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, return NULL; } -static const struct nft_expr_type *nft_expr_type_get(u8 family, +static const struct nft_expr_type *nft_expr_type_get(struct net *net, + u8 family, struct nlattr *nla) { const struct nft_expr_type *type; @@ -1889,17 +1920,13 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family, #ifdef CONFIG_MODULES if (type == NULL) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), (char *)nla_data(nla)); if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); } @@ -1968,7 +1995,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); @@ -2744,9 +2771,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-set"); - 
nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(ctx->net, "nft-set"); if (!list_empty(&nf_tables_set_types)) return ERR_PTR(-EAGAIN); } @@ -4779,7 +4804,8 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype) return NULL; } -static const struct nft_object_type *nft_obj_type_get(u32 objtype) +static const struct nft_object_type * +nft_obj_type_get(struct net *net, u32 objtype) { const struct nft_object_type *type; @@ -4789,9 +4815,7 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype) #ifdef CONFIG_MODULES if (type == NULL) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-obj-%u", objtype); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nft-obj-%u", objtype); if (__nft_obj_type_get(objtype)) return ERR_PTR(-EAGAIN); } @@ -4843,7 +4867,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - type = nft_obj_type_get(objtype); + type = nft_obj_type_get(net, objtype); if (IS_ERR(type)) return PTR_ERR(type); @@ -5339,7 +5363,8 @@ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) return NULL; } -static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) +static const struct nf_flowtable_type * +nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; @@ -5349,9 +5374,7 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) #ifdef CONFIG_MODULES if (type == NULL) { - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nf-flowtable-%u", family); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + nft_request_module(net, "nf-flowtable-%u", family); if (__nft_flowtable_type_get(family)) return ERR_PTR(-EAGAIN); } @@ -5431,7 +5454,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err1; } - type = nft_flowtable_type_get(family); + type = nft_flowtable_type_get(net, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err2; -- 
cgit v1.2.3 From ca2f18be792fddd0db2bbf6cbe1ec12d1bb32dd7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:11 +0200 Subject: netfilter: nf_tables: make valid_genid callback mandatory always call this function, followup patch can use this to acquire a per-netns transaction log to guard the entire batch instead of using the nfnl subsys mutex (which is shared among all namespaces). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nfnetlink.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5e95e92e547b..594b395442d6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6591,7 +6591,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) static bool nf_tables_valid_genid(struct net *net, u32 genid) { - return net->nft.base_seq == genid; + return genid == 0 || net->nft.base_seq == genid; } static const struct nfnetlink_subsystem nf_tables_subsys = { diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e1b6be29848d..94f9bcaa0799 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -331,13 +331,13 @@ replay: } } - if (!ss->commit || !ss->abort) { + if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } - if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { + if (!ss->valid_genid(net, genid)) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); -- cgit v1.2.3 From be2ab5b4d5c0bf041a34ec2e1397d50afbfb095e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:12 +0200 Subject: netfilter: nf_tables: take module reference when starting a batch Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso ---
include/linux/netfilter/nfnetlink.h | 1 + net/netfilter/nf_tables_api.c | 1 + net/netfilter/nfnetlink.c | 9 +++++++++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 3ecc3050be0e..4a520d3304a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -29,6 +29,7 @@ struct nfnetlink_subsystem { __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ + struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); void (*cleanup)(struct net *net); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 594b395442d6..c16c481fc52a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6603,6 +6603,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .abort = nf_tables_abort, .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, + .owner = THIS_MODULE, }; int nft_chain_validate_dependency(const struct nft_chain *chain, diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 94f9bcaa0799..dd1d7bc23b03 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -337,7 +337,14 @@ replay: return kfree_skb(skb); } + if (!try_module_get(ss->owner)) { + nfnl_unlock(subsys_id); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); + return kfree_skb(skb); + } + if (!ss->valid_genid(net, genid)) { + module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); @@ -472,6 +479,7 @@ done: nfnl_err_reset(&err_list); nfnl_unlock(subsys_id); kfree_skb(skb); + module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { err = ss->commit(net, oskb); @@ -491,6 +499,7 @@ done: nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); 
kfree_skb(skb); + module_put(ss->owner); } static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { -- cgit v1.2.3 From 2a43ecf96ba6a6eed70dbcd99d0888fc0ad3b82b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:13 +0200 Subject: netfilter: nf_tables: avoid global info storage This works because all accesses are currently serialized by nfnl nf_tables subsys mutex. If we want to have per-netns locking, we need to make this scratch area pernetns or allocate it on demand. This does the latter, its ~28kbyte but we can fallback to vmalloc so it should be fine. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c16c481fc52a..68436edd9cdf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2454,8 +2454,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) #define NFT_RULE_MAXEXPRS 128 -static struct nft_expr_info *info; - static int nf_tables_newrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2463,6 +2461,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + struct nft_expr_info *info = NULL; int family = nfmsg->nfgen_family; struct nft_table *table; struct nft_chain *chain; @@ -2533,6 +2532,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { + info = kvmalloc_array(NFT_RULE_MAXEXPRS, + sizeof(struct nft_expr_info), + GFP_KERNEL); + if (!info) + return -ENOMEM; + nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) @@ -2625,6 +2630,7 @@ 
static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_rcu(&rule->list, &chain->rules); } } + kvfree(info); chain->use++; if (net->nft.validate_state == NFT_VALIDATE_DO) @@ -2638,6 +2644,7 @@ err1: if (info[i].ops != NULL) module_put(info[i].ops->type->owner); } + kvfree(info); return err; } @@ -7203,29 +7210,19 @@ static int __init nf_tables_module_init(void) nft_chain_filter_init(); - info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info), - GFP_KERNEL); - if (info == NULL) { - err = -ENOMEM; - goto err1; - } - err = nf_tables_core_module_init(); if (err < 0) - goto err2; + return err; err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err3; + goto err; register_netdevice_notifier(&nf_tables_flowtable_notifier); return register_pernet_subsys(&nf_tables_net_ops); -err3: +err: nf_tables_core_module_exit(); -err2: - kfree(info); -err1: return err; } @@ -7237,7 +7234,6 @@ static void __exit nf_tables_module_exit(void) unregister_pernet_subsys(&nf_tables_net_ops); rcu_barrier(); nf_tables_core_module_exit(); - kfree(info); } module_init(nf_tables_module_init); -- cgit v1.2.3 From f102d66b335a417d4848da9441f585695a838934 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:14 +0200 Subject: netfilter: nf_tables: use dedicated mutex to guard transactions Continue to use nftnl subsys mutex to protect (un)registration of hook types, expressions and so on, but force batch operations to do their own locking. This allows distinct net namespaces to perform transactions in parallel. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 88 +++++++++++++++++++++++++++++++--------- net/netfilter/nfnetlink.c | 10 ++--- net/netfilter/nft_chain_filter.c | 4 +- net/netfilter/nft_dynset.c | 2 + 5 files changed, 77 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 94767ea3a490..286fd960896f 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,6 +7,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; + struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; u8 validate_state; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 68436edd9cdf..c0fb2bcd30fe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -480,12 +480,19 @@ static void nft_request_module(struct net *net, const char *fmt, ...) if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) return; - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&net->nft.commit_mutex); request_module("%s", module_name); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&net->nft.commit_mutex); } #endif +static void lockdep_nfnl_nft_mutex_not_held(void) +{ +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); +#endif +} + static const struct nft_chain_type * nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, u8 family, bool autoload) @@ -495,6 +502,8 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; + + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { nft_request_module(net, "nft-chain-%u-%.*s", family, @@ -802,6 +811,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int err; + 
lockdep_assert_held(&net->nft.commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask); if (IS_ERR(table)) { @@ -1042,7 +1052,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) return ERR_PTR(-ENOENT); } -static struct nft_chain *nft_chain_lookup(struct nft_table *table, +static bool lockdep_commit_lock_is_held(struct net *net) +{ +#ifdef CONFIG_PROVE_LOCKING + return lockdep_is_held(&net->nft.commit_mutex); +#else + return true; +#endif +} + +static struct nft_chain *nft_chain_lookup(struct net *net, + struct nft_table *table, const struct nlattr *nla, u8 genmask) { char search[NFT_CHAIN_MAXNAMELEN + 1]; @@ -1055,7 +1075,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table, nla_strlcpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && - !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + !lockdep_commit_lock_is_held(net)); chain = ERR_PTR(-ENOENT); rcu_read_lock(); @@ -1295,7 +1315,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); @@ -1428,6 +1448,9 @@ static int nft_chain_parse_hook(struct net *net, struct net_device *dev; int err; + lockdep_assert_held(&net->nft.commit_mutex); + lockdep_nfnl_nft_mutex_not_held(); + err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], nft_hook_policy, NULL); if (err < 0) @@ -1662,7 +1685,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain2 = nft_chain_lookup(ctx->net, table, + nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) return -EEXIST; } @@ -1724,6 +1748,8 @@ static int 
nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + lockdep_assert_held(&net->nft.commit_mutex); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); @@ -1742,7 +1768,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } attr = nla[NFTA_CHAIN_HANDLE]; } else { - chain = nft_chain_lookup(table, attr, genmask); + chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { NL_SET_BAD_ATTR(extack, attr); @@ -1820,7 +1846,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { attr = nla[NFTA_CHAIN_NAME]; - chain = nft_chain_lookup(table, attr, genmask); + chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, attr); @@ -1918,6 +1944,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { nft_request_module(net, "nft-expr-%u-%.*s", family, @@ -2352,7 +2379,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2386,6 +2413,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, { struct nft_expr *expr; + lockdep_assert_held(&ctx->net->nft.commit_mutex); /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). 
@@ -2476,6 +2504,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, bool create; u64 handle, pos_handle; + lockdep_assert_held(&net->nft.commit_mutex); + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); @@ -2484,7 +2514,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2684,7 +2714,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, } if (nla[NFTA_RULE_CHAIN]) { - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2776,6 +2807,8 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_type *type; u32 flags = 0; + lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { nft_request_module(ctx->net, "nft-set"); @@ -4820,6 +4853,7 @@ nft_obj_type_get(struct net *net, u32 objtype) if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { nft_request_module(net, "nft-obj-%u", objtype); @@ -5379,6 +5413,7 @@ nft_flowtable_type_get(struct net *net, u8 family) if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { nft_request_module(net, "nf-flowtable-%u", family); @@ -6232,9 +6267,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha next_genbit = nft_gencursor_next(net); g0 = 
rcu_dereference_protected(chain->rules_gen_0, - lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + lockdep_commit_lock_is_held(net)); g1 = rcu_dereference_protected(chain->rules_gen_1, - lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + lockdep_commit_lock_is_held(net)); /* No changes to this chain? */ if (chain->rules_next == NULL) { @@ -6442,6 +6477,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_commit_release(net); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + mutex_unlock(&net->nft.commit_mutex); return 0; } @@ -6593,12 +6629,25 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb) { - return __nf_tables_abort(net); + int ret = __nf_tables_abort(net); + + mutex_unlock(&net->nft.commit_mutex); + + return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { - return genid == 0 || net->nft.base_seq == genid; + bool genid_ok; + + mutex_lock(&net->nft.commit_mutex); + + genid_ok = genid == 0 || net->nft.base_seq == genid; + if (!genid_ok) + mutex_unlock(&net->nft.commit_mutex); + + /* else, commit mutex has to be released by commit or abort function */ + return genid_ok; } static const struct nfnetlink_subsystem nf_tables_subsys = { @@ -6937,8 +6986,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], - genmask); + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) @@ -7183,6 +7232,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7191,11 +7241,11 @@ static int __net_init nf_tables_init_net(struct net *net) 
static void __net_exit nf_tables_exit_net(struct net *net) { - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net); __nft_release_tables(net); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index dd1d7bc23b03..916913454624 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -350,6 +350,8 @@ replay: return kfree_skb(skb); } + nfnl_unlock(subsys_id); + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -471,13 +473,8 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - const struct nfnetlink_subsystem *ss2; - - ss2 = nfnl_dereference_protected(subsys_id); - if (ss2 == ss) - ss->abort(net, oskb); + ss->abort(net, oskb); nfnl_err_reset(&err_list); - nfnl_unlock(subsys_id); kfree_skb(skb); module_put(ss->owner); goto replay; @@ -497,7 +494,6 @@ done: ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); - nfnl_unlock(subsys_id); kfree_skb(skb); module_put(ss->owner); } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index d21834bed805..ea5b7c4944f6 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -322,7 +322,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, if (!ctx.net) return NOTIFY_DONE; - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&ctx.net->nft.commit_mutex); list_for_each_entry(table, &ctx.net->nft.tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -337,7 +337,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_netdev_event(event, dev, &ctx); } } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&ctx.net->nft.commit_mutex); put_net(ctx.net); return NOTIFY_DONE; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 27d7e4598ab6..81184c244d1a 100644 --- 
a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, u64 timeout; int err; + lockdep_assert_held(&ctx->net->nft.commit_mutex); + if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || tb[NFTA_DYNSET_SREG_KEY] == NULL) -- cgit v1.2.3 From 06ff4aa252303bd2a5d706008210bb49d9889b9d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 13 Jul 2018 14:54:43 +0200 Subject: netfilter: nf_osf: add nf_osf_match_one() This new function allows us to check if there is TCP syn packet matching with a given fingerprint that can be reused from the upcoming new nf_osf_find() function. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_osf.c | 207 ++++++++++++++++++++++++++----------------------- 1 file changed, 111 insertions(+), 96 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c index 5ba5c7bef2f9..bd7b34dd7d87 100644 --- a/net/netfilter/nf_osf.c +++ b/net/netfilter/nf_osf.c @@ -21,15 +21,14 @@ #include static inline int nf_osf_ttl(const struct sk_buff *skb, - const struct nf_osf_info *info, - unsigned char f_ttl) + int ttl_check, unsigned char f_ttl) { const struct iphdr *ip = ip_hdr(skb); - if (info->flags & NF_OSF_TTL) { - if (info->ttl == NF_OSF_TTL_TRUE) + if (ttl_check != -1) { + if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; - if (info->ttl == NF_OSF_TTL_NOCHECK) + if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; @@ -52,6 +51,104 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, return ip->ttl == f_ttl; } +static bool nf_osf_match_one(const struct sk_buff *skb, + const struct nf_osf_user_finger *f, + int ttl_check, u16 totlen, u16 window, + const unsigned char *optp, + unsigned int optsize) +{ + unsigned int check_WSS = 0; + int fmatch = FMATCH_WRONG; + int foptsize, optnum; + u16 mss = 0; + + if (totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) + return false; + 
+ /* + * Should not happen if userspace parser was written correctly. + */ + if (f->wss.wc >= OSF_WSS_MAX) + return false; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + return false; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs((__force __be16)mss); + break; + case OSFOPT_TS: + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. 
+ */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + return fmatch == FMATCH_OK; +} + bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, @@ -59,15 +156,16 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, const struct list_head *nf_osf_fingers) { const unsigned char *optp = NULL, *_optp = NULL; - unsigned int optsize = 0, check_WSS = 0; - int fmatch = FMATCH_WRONG, fcount = 0; const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; - u16 window, totlen, mss = 0; + int fcount = 0, ttl_check; + int fmatch = FMATCH_WRONG; + unsigned int optsize = 0; const struct tcphdr *tcp; struct tcphdr _tcph; + u16 window, totlen; bool df; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); @@ -88,103 +186,20 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, sizeof(struct tcphdr), optsize, opts); } + ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; + list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) { - int foptsize, optnum; f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; - optp = _optp; - fmatch = FMATCH_WRONG; - - if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl)) - continue; - - /* - * Should not happen if userspace parser was written correctly. 
- */ - if (f->wss.wc >= OSF_WSS_MAX) + if (!nf_osf_match_one(skb, f, + ttl_check, totlen, window, optp, optsize)) continue; - /* Check options */ - - foptsize = 0; - for (optnum = 0; optnum < f->opt_num; ++optnum) - foptsize += f->opt[optnum].length; - - if (foptsize > MAX_IPOPTLEN || - optsize > MAX_IPOPTLEN || - optsize != foptsize) - continue; - - check_WSS = f->wss.wc; - - for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == (*optp)) { - __u32 len = f->opt[optnum].length; - const __u8 *optend = optp + len; - - fmatch = FMATCH_OK; - - switch (*optp) { - case OSFOPT_MSS: - mss = optp[3]; - mss <<= 8; - mss |= optp[2]; - - mss = ntohs((__force __be16)mss); - break; - case OSFOPT_TS: - break; - } - - optp = optend; - } else - fmatch = FMATCH_OPT_WRONG; - - if (fmatch != FMATCH_OK) - break; - } - - if (fmatch != FMATCH_OPT_WRONG) { - fmatch = FMATCH_WRONG; - - switch (check_WSS) { - case OSF_WSS_PLAIN: - if (f->wss.val == 0 || window == f->wss.val) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MSS: - /* - * Some smart modems decrease mangle MSS to - * SMART_MSS_2, so we check standard, decreased - * and the one provided in the fingerprint MSS - * values. 
- */ -#define SMART_MSS_1 1460 -#define SMART_MSS_2 1448 - if (window == f->wss.val * mss || - window == f->wss.val * SMART_MSS_1 || - window == f->wss.val * SMART_MSS_2) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MTU: - if (window == f->wss.val * (mss + 40) || - window == f->wss.val * (SMART_MSS_1 + 40) || - window == f->wss.val * (SMART_MSS_2 + 40)) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MODULO: - if ((window % f->wss.val) == 0) - fmatch = FMATCH_OK; - break; - } - } - - if (fmatch != FMATCH_OK) - continue; + fmatch = FMATCH_OK; fcount++; -- cgit v1.2.3 From 31a9c29210e2d8129d2e81acb89babb56916c6c9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 13 Jul 2018 14:54:44 +0200 Subject: netfilter: nf_osf: add struct nf_osf_hdr_ctx Wrap context that allow us to guess the OS into a structure. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_osf.c | 105 +++++++++++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c index bd7b34dd7d87..b44d62d5d9a9 100644 --- a/net/netfilter/nf_osf.c +++ b/net/netfilter/nf_osf.c @@ -51,18 +51,25 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, return ip->ttl == f_ttl; } +struct nf_osf_hdr_ctx { + bool df; + u16 window; + u16 totlen; + const unsigned char *optp; + unsigned int optsize; +}; + static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, - int ttl_check, u16 totlen, u16 window, - const unsigned char *optp, - unsigned int optsize) + int ttl_check, + struct nf_osf_hdr_ctx *ctx) { unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; - if (totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) + if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* @@ -78,24 +85,24 @@ static bool nf_osf_match_one(const struct sk_buff *skb, foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN 
|| - optsize > MAX_IPOPTLEN || - optsize != foptsize) + ctx->optsize > MAX_IPOPTLEN || + ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == (*optp)) { + if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; - const __u8 *optend = optp + len; + const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; - switch (*optp) { + switch (*ctx->optp) { case OSFOPT_MSS: - mss = optp[3]; + mss = ctx->optp[3]; mss <<= 8; - mss |= optp[2]; + mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; @@ -103,7 +110,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb, break; } - optp = optend; + ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; @@ -116,7 +123,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb, switch (check_WSS) { case OSF_WSS_PLAIN: - if (f->wss.val == 0 || window == f->wss.val) + if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: @@ -128,19 +135,19 @@ static bool nf_osf_match_one(const struct sk_buff *skb, */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 - if (window == f->wss.val * mss || - window == f->wss.val * SMART_MSS_1 || - window == f->wss.val * SMART_MSS_2) + if (ctx->window == f->wss.val * mss || + ctx->window == f->wss.val * SMART_MSS_1 || + ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: - if (window == f->wss.val * (mss + 40) || - window == f->wss.val * (SMART_MSS_1 + 40) || - window == f->wss.val * (SMART_MSS_2 + 40)) + if (ctx->window == f->wss.val * (mss + 40) || + ctx->window == f->wss.val * (SMART_MSS_1 + 40) || + ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: - if ((window % f->wss.val) == 0) + if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } @@ -149,54 +156,66 @@ static bool nf_osf_match_one(const struct sk_buff *skb, return fmatch == FMATCH_OK; } +static 
const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, + const struct sk_buff *skb, + const struct iphdr *ip, + unsigned char *opts) +{ + const struct tcphdr *tcp; + struct tcphdr _tcph; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return NULL; + + if (!tcp->syn) + return NULL; + + ctx->totlen = ntohs(ip->tot_len); + ctx->df = ntohs(ip->frag_off) & IP_DF; + ctx->window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), ctx->optsize, opts); + } + + return tcp; +} + bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { - const unsigned char *optp = NULL, *_optp = NULL; const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; - unsigned int optsize = 0; + struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; - struct tcphdr _tcph; - u16 window, totlen; - bool df; - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); - if (!tcp) - return false; + memset(&ctx, 0, sizeof(ctx)); - if (!tcp->syn) + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + if (!tcp) return false; - totlen = ntohs(ip->tot_len); - df = ntohs(ip->frag_off) & IP_DF; - window = ntohs(tcp->window); - - if (tcp->doff * 4 > sizeof(struct tcphdr)) { - optsize = tcp->doff * 4 - sizeof(struct tcphdr); - - _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + - sizeof(struct tcphdr), optsize, opts); - } - ttl_check = (info->flags & NF_OSF_TTL) ? 
info->ttl : -1; - list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) { + list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; - if (!nf_osf_match_one(skb, f, - ttl_check, totlen, window, optp, optsize)) + if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; -- cgit v1.2.3 From 365b5a36f352e9884e85c47aa33026fd4df18633 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 12 Jul 2018 17:18:46 +0200 Subject: netfilter: nft_socket: Break evaluation if no socket found MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Actual implementation stores 0 in the destination register if no socket is found by the lookup, but that is not intentional as it is not really a value of any socket metadata. This patch fixes this and breaks rule evaluation in this case. Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index e43c1939d25f..622ac2012a40 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -43,7 +43,7 @@ static void nft_socket_eval(const struct nft_expr *expr, } if (!sk) { - nft_reg_store8(dest, 0); + regs->verdict.code = NFT_BREAK; return; } -- cgit v1.2.3 From 7d25f8851a2c03319bfa8e56bb40bde2c4621392 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 12 Jul 2018 17:48:06 +0200 Subject: netfilter: nft_socket: Expose socket mark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +++- net/netfilter/nft_socket.c | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 
deletion(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 89438e68dc03..f466860bcf75 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -921,10 +921,12 @@ enum nft_socket_attributes { /* * enum nft_socket_keys - nf_tables socket expression keys * - * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_ + * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option + * @NFT_SOCKET_MARK: Value of the socket mark */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, + NFT_SOCKET_MARK, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 622ac2012a40..d7f3776dfd71 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -54,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr, case NFT_SOCKET_TRANSPARENT: nft_reg_store8(dest, inet_sk_transparent(sk)); break; + case NFT_SOCKET_MARK: + if (sk_fullsock(sk)) { + *dest = sk->sk_mark; + } else { + regs->verdict.code = NFT_BREAK; + return; + } + break; default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -91,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_TRANSPARENT: len = sizeof(u8); break; + case NFT_SOCKET_MARK: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } -- cgit v1.2.3 From 70b095c84326640eeacfd69a411db8fc36e8ab1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 14 Jul 2018 01:14:01 +0200 Subject: ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module IPV6=m DEFRAG_IPV6=m CONNTRACK=y yields: net/netfilter/nf_conntrack_proto.o: In function `nf_ct_netns_do_get': net/netfilter/nf_conntrack_proto.c:802: undefined reference to `nf_defrag_ipv6_enable' net/netfilter/nf_conntrack_proto.o:(.rodata+0x640): undefined reference to `nf_conntrack_l4proto_icmpv6' Setting DEFRAG_IPV6=y causes 
undefined references to ip6_rhash_params ip6_frag_init and ip6_expire_frag_queue so it would be needed to force IPV6=y too. This patch gets rid of the 'followup linker error' by removing the dependency of ipv6.ko symbols from netfilter ipv6 defrag. Shared code is placed into a header, then used from both. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/ipv6.h | 28 -------- include/net/ipv6_frag.h | 104 ++++++++++++++++++++++++++++++ net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 17 +++-- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 +- net/ipv6/reassembly.c | 92 ++------------------------ net/openvswitch/conntrack.c | 1 + 7 files changed, 126 insertions(+), 121 deletions(-) create mode 100644 include/net/ipv6_frag.h (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index aa6fd11a887c..3720958cd4e1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -581,34 +581,6 @@ static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, } #endif -struct inet_frag_queue; - -enum ip6_defrag_users { - IP6_DEFRAG_LOCAL_DELIVER, - IP6_DEFRAG_CONNTRACK_IN, - __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_OUT, - __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_BRIDGE_IN, - __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, -}; - -void ip6_frag_init(struct inet_frag_queue *q, const void *a); -extern const struct rhashtable_params ip6_rhash_params; - -/* - * Equivalent of ipv4 struct ip - */ -struct frag_queue { - struct inet_frag_queue q; - - int iif; - __u16 nhoffset; - u8 ecn; -}; - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); - static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h new file mode 100644 
index 000000000000..6ced1e6899b6 --- /dev/null +++ b/include/net/ipv6_frag.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IPV6_FRAG_H +#define _IPV6_FRAG_H +#include +#include +#include +#include + +enum ip6_defrag_users { + IP6_DEFRAG_LOCAL_DELIVER, + IP6_DEFRAG_CONNTRACK_IN, + __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_OUT, + __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, +}; + +/* + * Equivalent of ipv4 struct ip + */ +struct frag_queue { + struct inet_frag_queue q; + + int iif; + __u16 nhoffset; + u8 ecn; +}; + +#if IS_ENABLED(CONFIG_IPV6) +static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) +{ + struct frag_queue *fq = container_of(q, struct frag_queue, q); + const struct frag_v6_compare_key *key = a; + + q->key.v6 = *key; + fq->ecn = 0; +} + +static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline int +ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static inline void +ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) +{ + struct net_device *dev = NULL; + struct sk_buff *head; + + rcu_read_lock(); + spin_lock(&fq->q.lock); + + if (fq->q.flags & INET_FRAG_COMPLETE) + goto out; + + inet_frag_kill(&fq->q); + + dev = dev_get_by_index_rcu(net, fq->iif); + if (!dev) + goto out; + + __IP6_INC_STATS(net, 
__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + + /* Don't send error if the first segment did not arrive. */ + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; + + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + +out: + spin_unlock(&fq->q.lock); +out_rcu_unlock: + rcu_read_unlock(); + inet_frag_put(&fq->q); +} +#endif +#endif diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2cc224106b69..ec7a5da56129 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include "6lowpan_i.h" diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a452d99c9f52..333ee3256964 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,9 +33,8 @@ #include #include -#include +#include -#include #include #include #include @@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } /* Creation primitives. 
*/ @@ -622,16 +621,24 @@ static struct pernet_operations nf_ct_net_ops = { .exit = nf_ct_net_exit, }; +static const struct rhashtable_params nfct_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, + .automatic_shrinking = true, +}; + int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.constructor = ip6_frag_init; + nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; - nf_frags.rhash_params = ip6_rhash_params; + nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e631be25337e..72dd3e202375 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b939b94e7e91..6edd2ac8ae4b 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include static const char ip6_frag_cache_name[] = "ip6-frags"; @@ -72,61 +72,6 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -void ip6_frag_init(struct inet_frag_queue *q, const void *a) -{ - struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct frag_v6_compare_key *key = a; - - q->key.v6 = *key; - fq->ecn = 0; -} -EXPORT_SYMBOL(ip6_frag_init); - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) -{ - struct net_device *dev = NULL; - struct sk_buff *head; - - rcu_read_lock(); - spin_lock(&fq->q.lock); - - if (fq->q.flags & 
INET_FRAG_COMPLETE) - goto out; - - inet_frag_kill(&fq->q); - - dev = dev_get_by_index_rcu(net, fq->iif); - if (!dev) - goto out; - - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); - - /* Don't send error if the first segment did not arrive. */ - head = fq->q.fragments; - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) - goto out; - - /* But use as source device on which LAST ARRIVED - * segment was received. And do not use fq->dev - * pointer directly, device might already disappeared. - */ - head->dev = dev; - skb_get(head); - spin_unlock(&fq->q.lock); - - icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); - goto out_rcu_unlock; - -out: - spin_unlock(&fq->q.lock); -out_rcu_unlock: - rcu_read_unlock(); - inet_frag_put(&fq->q); -} -EXPORT_SYMBOL(ip6_expire_frag_queue); - static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } static struct frag_queue * @@ -696,42 +641,19 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; -static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) -{ - return jhash2(data, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct inet_frag_queue *fq = data; - - return jhash2((const u32 *)&fq->key.v6, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) -{ - const struct frag_v6_compare_key *key = arg->key; - const struct inet_frag_queue *fq = ptr; - - return !!memcmp(&fq->key, key, sizeof(*key)); -} - -const 
struct rhashtable_params ip6_rhash_params = { +static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), - .hashfn = ip6_key_hashfn, - .obj_hashfn = ip6_obj_hashfn, - .obj_cmpfn = ip6_obj_cmpfn, + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; -EXPORT_SYMBOL(ip6_rhash_params); int __init ipv6_frag_init(void) { int ret; - ip6_frags.constructor = ip6_frag_init; + ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3e33c382367f..86a75105af1a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef CONFIG_NF_NAT_NEEDED #include -- cgit v1.2.3 From b71c69c26b4916d11b8d403d8e667bbd191f1b8f Mon Sep 17 00:00:00 2001 From: Philipp Puschmann Date: Tue, 17 Jul 2018 13:41:12 +0200 Subject: Bluetooth: Use lock_sock_nested in bt_accept_enqueue Fixes this warning that was provoked by a pairing: [60258.016221] WARNING: possible recursive locking detected [60258.021558] 4.15.0-RD1812-BSP #1 Tainted: G O [60258.027146] -------------------------------------------- [60258.032464] kworker/u5:0/70 is trying to acquire lock: [60258.037609] (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.}, at: [<87759073>] bt_accept_enqueue+0x3c/0x74 [60258.046863] [60258.046863] but task is already holding lock: [60258.052704] (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.}, at: [] l2cap_sock_new_connection_cb+0x1c/0x88 [60258.062905] [60258.062905] other info that might help us debug this: [60258.069441] Possible unsafe locking scenario: [60258.069441] [60258.075368] CPU0 [60258.077821] ---- [60258.080272] lock(sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP); [60258.085510] lock(sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP); [60258.090748] 
[60258.090748] *** DEADLOCK *** [60258.090748] [60258.096676] May be due to missing lock nesting notation [60258.096676] [60258.103472] 5 locks held by kworker/u5:0/70: [60258.107747] #0: ((wq_completion)%shdev->name#2){+.+.}, at: [<9460d092>] process_one_work+0x130/0x4fc [60258.117263] #1: ((work_completion)(&hdev->rx_work)){+.+.}, at: [<9460d092>] process_one_work+0x130/0x4fc [60258.126942] #2: (&conn->chan_lock){+.+.}, at: [<7877c8c3>] l2cap_connect+0x80/0x4f8 [60258.134806] #3: (&chan->lock/2){+.+.}, at: [<2e16c724>] l2cap_connect+0x8c/0x4f8 [60258.142410] #4: (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.}, at: [] l2cap_sock_new_connection_cb+0x1c/0x88 [60258.153043] [60258.153043] stack backtrace: [60258.157413] CPU: 1 PID: 70 Comm: kworker/u5:0 Tainted: G O 4.15.0-RD1812-BSP #1 [60258.165945] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [60258.172485] Workqueue: hci0 hci_rx_work [60258.176331] Backtrace: [60258.178797] [<8010c9fc>] (dump_backtrace) from [<8010ccbc>] (show_stack+0x18/0x1c) [60258.186379] r7:80e55fe4 r6:80e55fe4 r5:20050093 r4:00000000 [60258.192058] [<8010cca4>] (show_stack) from [<809864e8>] (dump_stack+0xb0/0xdc) [60258.199301] [<80986438>] (dump_stack) from [<8016ecc8>] (__lock_acquire+0xffc/0x11d4) [60258.207144] r9:5e2bb019 r8:630f974c r7:ba8a5940 r6:ba8a5ed8 r5:815b5220 r4:80fa081c [60258.214901] [<8016dccc>] (__lock_acquire) from [<8016f620>] (lock_acquire+0x78/0x98) [60258.222655] r10:00000040 r9:00000040 r8:808729f0 r7:00000001 r6:00000000 r5:60050013 [60258.230491] r4:00000000 [60258.233045] [<8016f5a8>] (lock_acquire) from [<806ee974>] (lock_sock_nested+0x64/0x88) [60258.240970] r7:00000000 r6:b796e870 r5:00000001 r4:b796e800 [60258.246643] [<806ee910>] (lock_sock_nested) from [<808729f0>] (bt_accept_enqueue+0x3c/0x74) [60258.255004] r8:00000001 r7:ba7d3c00 r6:ba7d3ea4 r5:ba7d2000 r4:b796e800 [60258.261717] [<808729b4>] (bt_accept_enqueue) from [<808aa39c>] (l2cap_sock_new_connection_cb+0x68/0x88) [60258.271117] 
r5:b796e800 r4:ba7d2000 [60258.274708] [<808aa334>] (l2cap_sock_new_connection_cb) from [<808a294c>] (l2cap_connect+0x190/0x4f8) [60258.283933] r5:00000001 r4:ba6dce00 [60258.287524] [<808a27bc>] (l2cap_connect) from [<808a4a14>] (l2cap_recv_frame+0x744/0x2cf8) [60258.295800] r10:ba6dcf24 r9:00000004 r8:b78d8014 r7:00000004 r6:bb05d000 r5:00000004 [60258.303635] r4:bb05d008 [60258.306183] [<808a42d0>] (l2cap_recv_frame) from [<808a7808>] (l2cap_recv_acldata+0x210/0x214) [60258.314805] r10:b78e7800 r9:bb05d960 r8:00000001 r7:bb05d000 r6:0000000c r5:b7957a80 [60258.322641] r4:ba6dce00 [60258.325188] [<808a75f8>] (l2cap_recv_acldata) from [<8087630c>] (hci_rx_work+0x35c/0x4e8) [60258.333374] r6:80e5743c r5:bb05d7c8 r4:b7957a80 [60258.338004] [<80875fb0>] (hci_rx_work) from [<8013dc7c>] (process_one_work+0x1a4/0x4fc) [60258.346018] r10:00000001 r9:00000000 r8:baabfef8 r7:ba997500 r6:baaba800 r5:baaa5d00 [60258.353853] r4:bb05d7c8 [60258.356401] [<8013dad8>] (process_one_work) from [<8013e028>] (worker_thread+0x54/0x5cc) [60258.364503] r10:baabe038 r9:baaba834 r8:80e05900 r7:00000088 r6:baaa5d18 r5:baaba800 [60258.372338] r4:baaa5d00 [60258.374888] [<8013dfd4>] (worker_thread) from [<801448f8>] (kthread+0x134/0x160) [60258.382295] r10:ba8310b8 r9:bb07dbfc r8:8013dfd4 r7:baaa5d00 r6:00000000 r5:baaa8ac0 [60258.390130] r4:ba831080 [60258.392682] [<801447c4>] (kthread) from [<801080b4>] (ret_from_fork+0x14/0x20) [60258.399915] r10:00000000 r9:00000000 r8:00000000 r7:00000000 r6:00000000 r5:801447c4 [60258.407751] r4:baaa8ac0 r3:baabe000 Signed-off-by: Philipp Puschmann Signed-off-by: Marcel Holtmann --- net/bluetooth/af_bluetooth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3264e1873219..deacc52d7ff1 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -159,7 +159,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk) 
BT_DBG("parent %p, sk %p", parent, sk); sock_hold(sk); - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; release_sock(sk); -- cgit v1.2.3 From 202aabe84a8fd809e8f401bc05e20f35a5102ece Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Jul 2018 19:08:50 -0700 Subject: xdp: fix uninitialized 'err' variable Smatch caught an uninitialized variable error which GCC seems to miss. Fixes: a25717d2b604 ("xdp: support simultaneous driver and hw XDP attachment") Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e03258e954c8..92b6fa5d5f6e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1414,14 +1414,17 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) prog_id = 0; mode = XDP_ATTACHED_NONE; - if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, - IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb)) + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); + if (err) goto err_cancel; - if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, - IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv)) + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); + if (err) goto err_cancel; - if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, - IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw)) + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); + if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); -- cgit v1.2.3 From a48d189ef53146a8df132a327a637c4182f50a16 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 17 Jul 
2018 11:52:57 +0200 Subject: net: Move skb decrypted field, avoid explicit copy Commit 784abe24c903 ("net: Add decrypted field to skb") introduced a 'decrypted' field that is explicitly copied on skb copy and clone. Move it between headers_start[0] and headers_end[0], so that we don't need to copy it explicitly as it's copied by the memcpy() in __copy_skb_header(). While at it, drop the assignment in __skb_clone(), it was already redundant. This doesn't change the size of sk_buff or cacheline boundaries. The 15-bits hole before tc_index becomes a 14-bits hole, and will be again a 15-bits hole when this change is merged with commit 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_header()"). v2: as reported by kbuild test robot (oops, I forgot to build with CONFIG_TLS_DEVICE it seems), we can't use CHECK_SKB_FIELD() on a bit-field member. Just drop the check for the time being, perhaps we could think of some magic to also check bit-field members one day. Fixes: 784abe24c903 ("net: Add decrypted field to skb") Signed-off-by: Stefano Brivio Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 9 ++++----- net/core/skbuff.c | 6 ------ 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ceb8dcc54da..14bc9ebe30f2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,7 +630,6 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue - * @decrypted: Decrypted SKB * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -641,6 +640,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @dst_pending_confirm: need to confirm neighbour + * @decrypted: Decrypted SKB * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark @@ -737,11 +737,7 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, -#ifdef CONFIG_TLS_DEVICE - decrypted:1; -#else __unused:1; -#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -797,6 +793,9 @@ struct sk_buff { __u8 tc_redirected:1; __u8 tc_from_ingress:1; #endif +#ifdef CONFIG_TLS_DEVICE + __u8 decrypted:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cfd6c6f35f9c..c4e24ac27464 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -805,9 +805,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; -#ifdef CONFIG_TLS_DEVICE - new->decrypted = old->decrypted; -#endif memcpy(&new->headers_start, 
&old->headers_start, offsetof(struct sk_buff, headers_end) - @@ -868,9 +865,6 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(head_frag); C(data); C(truesize); -#ifdef CONFIG_TLS_DEVICE - C(decrypted); -#endif refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); -- cgit v1.2.3 From c94b1ac73244ff7eafb1a5df0b1e9c64f1b46113 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 17 Jul 2018 21:58:46 +0800 Subject: tipc: remove unused tipc_link_is_active tipc_link_is_active is no longer used and can be removed. Signed-off-by: YueHaibing Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index df763be38541..6987ffc8e7a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l) return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l)); } -int tipc_link_is_active(struct tipc_link *l) -{ - return l->active; -} - void tipc_link_set_active(struct tipc_link *l, bool active) { l->active = active; -- cgit v1.2.3 From d81d25e66a0f218e7d6b6d81b2d57dacf8924195 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 17 Jul 2018 22:11:23 +0800 Subject: tipc: remove unused tipc_group_size After commit eb929a91b213 ("tipc: improve poll() for group member socket"), it is no longer used. Signed-off-by: YueHaibing Acked-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/group.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index cbe39e8db39c..8f43e7d6046b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp) return 0; } -int tipc_group_size(struct tipc_group *grp) -{ - return grp->member_cnt; -} - struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) -- cgit v1.2.3 From 0015b80abccecca82622d9e9d48eb210572a0c3b Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Mon, 16 Jul 2018 21:10:34 -0700 Subject: net: dsa: Remove VLA usage We avoid 2 VLAs by using a pre-allocated field in dsa_switch. We also try to avoid dynamic allocation whenever possible (when using fewer than bits-per-long ports, which is the common case). Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch Signed-off-by: Salvatore Mesoraca [kees: tweak commit subject and message slightly] Signed-off-by: Kees Cook Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/net/dsa.h | 3 +++ net/dsa/dsa2.c | 14 ++++++++++++++ net/dsa/switch.c | 22 ++++++++++------------ 3 files changed, 27 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fdbd6082945d..461e8a7661b7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -259,6 +259,9 @@ struct dsa_switch { /* Number of switch port queues */ unsigned int num_tx_queues; + unsigned long *bitmap; + unsigned long _bitmap; + /* Dynamically allocated ports, keep last */ size_t num_ports; struct dsa_port ports[]; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dc5d9af3dc80..a1917025e155 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) if (!ds) return NULL; + /* We avoid allocating memory outside dsa_switch + * if it is not needed. + */ + if (n <= sizeof(ds->_bitmap) * 8) { + ds->bitmap = &ds->_bitmap; + } else { + ds->bitmap = devm_kcalloc(dev, + BITS_TO_LONGS(n), + sizeof(unsigned long), + GFP_KERNEL); + if (unlikely(!ds->bitmap)) + return NULL; + } + ds->dev = dev; ds->num_ports = n; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index b93511726069..142b294d3446 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, { const struct switchdev_obj_port_mdb *mdb = info->mdb; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(group, ds->num_ports); int port; /* Build a mask of Multicast group members */ - bitmap_zero(group, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, group); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_dsa_port(ds, port)) - set_bit(port, group); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); + return dsa_switch_mdb_prepare_bitmap(ds, mdb, 
ds->bitmap); - dsa_switch_mdb_add_bitmap(ds, mdb, group); + dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap); return 0; } @@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, { const struct switchdev_obj_port_vlan *vlan = info->vlan; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(members, ds->num_ports); int port; /* Build a mask of VLAN members */ - bitmap_zero(members, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, members); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) - set_bit(port, members); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); + return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap); - dsa_switch_vlan_add_bitmap(ds, vlan, members); + dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap); return 0; } -- cgit v1.2.3 From f15f084ff11519172c67fac4dde61daf4e9ad345 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 14:32:24 -0700 Subject: pktgen: convert safe uses of strncpy() to strcpy() to avoid string truncation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 8 complains: net/core/pktgen.c: In function ‘pktgen_if_write’: net/core/pktgen.c:1419:4: warning: ‘strncpy’ output may be truncated copying between 0 and 31 bytes from a string of length 127 [-Wstringop-truncation] strncpy(pkt_dev->src_max, buf, len); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/pktgen.c:1399:4: warning: ‘strncpy’ output may be truncated copying between 0 and 31 bytes from a string of length 127 [-Wstringop-truncation] strncpy(pkt_dev->src_min, buf, len); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/pktgen.c:1290:4: warning: ‘strncpy’ output may be truncated copying between 0 and 31 bytes from a string of length 127 
[-Wstringop-truncation] strncpy(pkt_dev->dst_max, buf, len); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/pktgen.c:1268:4: warning: ‘strncpy’ output may be truncated copying between 0 and 31 bytes from a string of length 127 [-Wstringop-truncation] strncpy(pkt_dev->dst_min, buf, len); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There is no bug here, but the code is not perfect either. It copies sizeof(pkt_dev->/member/) - 1 from user space into buf, and then does a strcmp(pkt_dev->/member/, buf) hence assuming buf will be null-terminated and shorter than pkt_dev->/member/ (pkt_dev->/member/ is never explicitly null-terminated, and strncpy() doesn't have to null-terminate so the assumption must be on buf). The use of strncpy() without explicit null-termination looks suspicious. Convert to use straight strcpy(). strncpy() would also null-pad the output, but that's clearly unnecessary since the author calls memset(pkt_dev->/member/, 0, sizeof(..)); prior to strncpy(), anyway. While at it format the code for "dst_min", "dst_max", "src_min" and "src_max" in the same way by removing extra new lines in one case. Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 49368e21d228..308ed04984de 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strncpy(pkt_dev->dst_min, buf, len); + strcpy(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } @@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; - if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; - buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strncpy(pkt_dev->dst_max, buf, len); + strcpy(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } @@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strncpy(pkt_dev->src_min, buf, len); + strcpy(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } @@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strncpy(pkt_dev->src_max, buf, len); + strcpy(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } -- cgit v1.2.3 From fcb662deeb83bbc6df58b472a3bfe76981a8cc36 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 26 Jun 2018 14:19:10 -0700 Subject: xfrm: don't check offload_handle for nonzero The offload_handle should be an opaque data cookie for the driver to 
use, much like the data cookie for a timer or alarm callback. Thus, the XFRM stack should not be checking for non-zero, because the driver might use that to store an array reference, which could be zero, or some other zero but meaningful value. We can remove the checks for non-zero because there are plenty other attributes also being checked to see if there is an offload in place for the SA in question. Signed-off-by: Shannon Nelson Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 6 ++---- net/ipv6/esp6_offload.c | 6 ++---- net/xfrm/xfrm_device.c | 6 +++--- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..133589d693a9 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~NETIF_F_CSUM_MASK; @@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ if (!xo) return -EINVAL; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) { + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 27f59b61f70f..96af267835c3 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -162,8 +162,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) esp_features = 
features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~NETIF_F_CSUM_MASK; @@ -207,8 +206,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features if (!xo) return -EINVAL; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) { + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 11d56a44e9e8..5611b7521020 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -56,7 +56,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (skb_is_gso(skb)) { struct net_device *dev = skb->dev; - if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) { + if (unlikely(x->xso.dev != dev)) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ @@ -211,8 +211,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (!x->type_offload || x->encap) return false; - if ((!dev || (x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev))) && - (!xdst->child->xfrm && x->type->get_mtu)) { + if ((!dev || (dev == xfrm_dst_path(dst)->dev)) && + (!xdst->child->xfrm && x->type->get_mtu)) { mtu = x->type->get_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) -- cgit v1.2.3 From 07a557f47d7e09b2c60ad4d51b1ac8b035b75f73 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:16 +0300 Subject: net/sched: tunnel_key: Allow to set tos and ttl for tc based ip tunnels Allow user-space to provide tos and ttl to be set for the tunnel headers. Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 2 ++ net/sched/act_tunnel_key.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index e284fec8c467..be384d63e1b5 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -39,6 +39,8 @@ enum { TCA_TUNNEL_KEY_ENC_OPTS, /* Nested TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_TOS, /* u8 */ + TCA_TUNNEL_KEY_ENC_TTL, /* u8 */ __TCA_TUNNEL_KEY_MAX, }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 3ec585d58762..22f26e9ea8f1 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -197,6 +197,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, @@ -216,6 +218,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, int opts_len = 0; __be64 key_id; __be16 flags; + u8 tos, ttl; int ret = 0; int err; @@ -273,6 +276,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } } + tos = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TOS]) + tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]); + ttl = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TTL]) + ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]); + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -281,7 +291,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); - metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + metadata = 
__ip_tun_set_dst(saddr, daddr, tos, ttl, dst_port, flags, key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && @@ -292,7 +302,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); - metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, + metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, key_id, 0); } else { @@ -504,6 +514,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, !(key->tun_flags & TUNNEL_CSUM)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; + + if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos)) + goto nla_put_failure; + + if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl)) + goto nla_put_failure; } tcf_tm_dump(&tm, &t->tcf_tm); -- cgit v1.2.3 From 5544adb9707fda5d54494c37940701894c16b9a0 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:17 +0300 Subject: flow_dissector: Dissect tos and ttl from the tunnel info Add dissection of the tos and ttl from the ip tunnel headers fields in case a match is needed on them. Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 2 +- net/core/flow_dissector.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index c64406717eee..2a17f041f7a1 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -207,7 +207,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ - + FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b555fc229e96..08a5184f4b34 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -152,7 +152,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_PORTS)) + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP)) return; info = skb_tunnel_info(skb); @@ -212,6 +214,16 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->src = key->tp_src; tp->dst = key->tp_dst; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; + + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); -- cgit v1.2.3 From 0e2c17b64d5c7f57bcd7054ef87797376dcdee26 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:18 +0300 Subject: net/sched: cls_flower: Support matching on ip tos and ttl for tunnels Allow users to set rules matching on ipv4 tos and ttl or ipv6 traffic-class and hoplimit of tunnel headers. 
Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c4262d911596..b4512254036b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -473,6 +473,11 @@ enum { TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c53fdd411f90..38d74803e2df 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -52,6 +52,7 @@ struct fl_flow_key { struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; + struct flow_dissector_key_ip enc_ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -453,6 +454,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -561,17 +566,17 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } -static void fl_set_key_ip(struct nlattr **tb, +static void fl_set_key_ip(struct nlattr **tb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, - &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, - sizeof(key->tos)); + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? 
TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; - fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, - &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, - sizeof(key->ttl)); + fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)); + fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)); } static int fl_set_key(struct net *net, struct nlattr **tb, @@ -633,7 +638,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); - fl_set_key_ip(tb, &key->ip, &mask->ip); + fl_set_key_ip(tb, false, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -768,6 +773,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -860,6 +867,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask) enc_control); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); skb_flow_dissector_init(&mask->dissector, keys, cnt); } @@ -1208,14 +1217,17 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } -static int fl_dump_key_ip(struct sk_buff *skb, +static int fl_dump_key_ip(struct sk_buff *skb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, - TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || - fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, - TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + int tos_key = encap ? 
TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; + + if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl))) return -1; return 0; @@ -1361,7 +1373,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)) || - fl_dump_key_ip(skb, &key->ip, &mask->ip))) + fl_dump_key_ip(skb, false, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && @@ -1486,7 +1498,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, - sizeof(key->enc_tp.dst))) + sizeof(key->enc_tp.dst)) || + fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip)) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) -- cgit v1.2.3 From bc56b33404599edc412b91933d74b36873e8ea25 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Thu, 19 Jul 2018 10:50:44 -0700 Subject: xfrm: Remove xfrmi interface ID from flowi In order to remove performance impact of having the extra u32 in every single flowi, this change removes the flowi_xfrm struct, preferring to take the if_id as a method parameter where needed. In the inbound direction, if_id is only needed during the __xfrm_policy_check() function, and the if_id can be determined at that point based on the skb. As such, xfrmi_decode_session() is only called with the skb in __xfrm_policy_check(). 
In the outbound direction, the only place where if_id is needed is the xfrm_lookup() call in xfrmi_xmit2(). With this change, the if_id is directly passed into the xfrm_lookup_with_ifid() call. All existing callers can still call xfrm_lookup(), which uses a default if_id of 0. This change does not change any behavior of XFRMIs except for improving overall system performance via flowi size reduction. This change has been tested against the Android Kernel Networking Tests: https://android.googlesource.com/kernel/tests/+/master/net/test Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- include/net/dst.h | 14 +++++++ include/net/flow.h | 9 ----- include/net/xfrm.h | 2 +- net/xfrm/xfrm_interface.c | 4 +- net/xfrm/xfrm_policy.c | 98 +++++++++++++++++++++++++++++++---------------- net/xfrm/xfrm_state.c | 3 +- 6 files changed, 83 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index b3219cd8a5a1..7f735e76ca73 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -475,6 +475,14 @@ static inline struct dst_entry *xfrm_lookup(struct net *net, return dst_orig; } +static inline struct dst_entry * +xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags, u32 if_id) +{ + return dst_orig; +} + static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, @@ -494,6 +502,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, int flags, + u32 if_id); + struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); diff --git a/include/net/flow.h b/include/net/flow.h index 
187c9bef672f..8ce21793094e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,10 +26,6 @@ struct flowi_tunnel { __be64 tun_id; }; -struct flowi_xfrm { - __u32 if_id; -}; - struct flowi_common { int flowic_oif; int flowic_iif; @@ -43,7 +39,6 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; - struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -83,7 +78,6 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid -#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -115,7 +109,6 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; - fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -145,7 +138,6 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid -#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. 
*/ @@ -193,7 +185,6 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid -#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1350e2cf0749..ca820945f30c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1557,7 +1557,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family); + unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 31cb1c7e3881..ccfe18d67e98 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -307,10 +307,8 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (!dst) goto tx_err_link_failure; - fl->flowi_xfrm.if_id = xi->p.if_id; - dst_hold(dst); - dst = xfrm_lookup(xi->net, dst, fl, NULL, 0); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5d2f734f4309..2f70fe68b9b0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1068,14 +1068,14 @@ EXPORT_SYMBOL(xfrm_policy_walk_done); */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, - u8 type, u16 family, int dir) + u8 type, u16 family, int dir, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || - pol->if_id != fl->flowi_xfrm.if_id || + pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; @@ 
-1090,7 +1090,8 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, - u16 family, u8 dir) + u16 family, u8 dir, + u32 if_id) { int err; struct xfrm_policy *pol, *ret; @@ -1114,7 +1115,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, priority = ~0U; ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { - err = xfrm_policy_match(pol, fl, type, family, dir); + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; @@ -1133,7 +1134,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if ((pol->priority >= priority) && ret) break; - err = xfrm_policy_match(pol, fl, type, family, dir); + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; @@ -1158,21 +1159,25 @@ fail: return ret; } -static struct xfrm_policy * -xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir) +static struct xfrm_policy *xfrm_policy_lookup(struct net *net, + const struct flowi *fl, + u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, + dir, if_id); if (pol != NULL) return pol; #endif - return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, + dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, - const struct flowi *fl, u16 family) + const struct flowi *fl, + u16 family, u32 if_id) { struct xfrm_policy *pol; @@ -1191,7 +1196,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { 
if ((sk->sk_mark & pol->mark.m) != pol->mark.v || - pol->if_id != fl->flowi_xfrm.if_id) { + pol->if_id != if_id) { pol = NULL; goto out; } @@ -1405,7 +1410,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, } } - x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); + x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, + family, policy->if_id); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; @@ -1708,7 +1714,8 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, - XFRM_POLICY_OUT); + XFRM_POLICY_OUT, + pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); @@ -1942,8 +1949,10 @@ free_dst: goto out; } -static struct xfrm_dst * -xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo) +static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, + const struct flowi *fl, + u16 family, u8 dir, + struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; @@ -1952,7 +1961,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; - pols[0] = xfrm_policy_lookup(net, fl, family, dir); + pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2020,14 +2029,19 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, return ret; } -/* Main function: finds/creates a bundle for given flow. +/* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. 
+ * + * xfrm_lookup uses an if_id of 0 by default, and is provided for + * compatibility */ -struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, - const struct flowi *fl, - const struct sock *sk, int flags) +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, + int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; @@ -2043,7 +2057,8 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; - pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family); + pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, + if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2087,7 +2102,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo); + xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { @@ -2168,6 +2183,19 @@ dropdst: xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } +EXPORT_SYMBOL(xfrm_lookup_with_ifid); + +/* Main function: finds/creates a bundle for given flow. + * + * At the moment we eat a raw IP route. Mostly to speed up lookups + * on interfaces with disabled IPsec. + */ +struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags) +{ + return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); +} EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). 
@@ -2257,19 +2285,12 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - const struct xfrm_if_cb *ifcb = xfrm_if_get_cb(); - struct xfrm_if *xi; int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); - if (ifcb) { - xi = ifcb->decode_session(skb); - if (xi) - fl->flowi_xfrm.if_id = xi->p.if_id; - } err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); @@ -2301,6 +2322,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int reverse; struct flowi fl; int xerr_idx = -1; + const struct xfrm_if_cb *ifcb; + struct xfrm_if *xi; + u32 if_id = 0; + + rcu_read_lock(); + ifcb = xfrm_if_get_cb(); + + if (ifcb) { + xi = ifcb->decode_session(skb); + if (xi) + if_id = xi->p.if_id; + } + rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; @@ -2328,7 +2362,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { - pol = xfrm_sk_policy_lookup(sk, dir, &fl, family); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; @@ -2336,7 +2370,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) - pol = xfrm_policy_lookup(net, &fl, family, dir); + pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); @@ -2360,7 +2394,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, - XFRM_POLICY_IN); + XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); diff --git a/net/xfrm/xfrm_state.c 
b/net/xfrm/xfrm_state.c index 27c84e63c7ff..bd5cb7ad2447 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -930,7 +930,7 @@ struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family) + unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); @@ -940,7 +940,6 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; - u32 if_id = fl->flowi_xfrm.if_id; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; -- cgit v1.2.3 From 5baf4f9c0035f3e33bb693a1a1e87599f6e804e6 Mon Sep 17 00:00:00 2001 From: Nathan Harold Date: Thu, 19 Jul 2018 19:07:47 -0700 Subject: xfrm: Allow xfrmi if_id to be updated by UPDSA Allow attaching an SA to an xfrm interface id after the creation of the SA, so that tasks such as keying which must be done as the SA is created, can remain separate from the decision on how to route traffic from an SA. 
This permits SA creation to be decomposed into three separate steps: 1) allocation of a SPI 2) algorithm and key negotiation 3) insertion into the data path Signed-off-by: Nathan Harold Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bd5cb7ad2447..b669262682c9 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1561,10 +1561,14 @@ out: if (x1->curlft.use_time) xfrm_state_check_expire(x1); - if (x->props.smark.m || x->props.smark.v) { + if (x->props.smark.m || x->props.smark.v || x->if_id) { spin_lock_bh(&net->xfrm.xfrm_state_lock); - x1->props.smark = x->props.smark; + if (x->props.smark.m || x->props.smark.v) + x1->props.smark = x->props.smark; + + if (x->if_id) + x1->if_id = x->if_id; __xfrm_state_bump_genids(x1); spin_unlock_bh(&net->xfrm.xfrm_state_lock); -- cgit v1.2.3 From eecd6857709e08781e41f3eb0e0c669d9ca07d87 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 18 Jul 2018 08:27:41 -0500 Subject: tls: Fix copy-paste error in tls_device_reencrypt It seems that the proper structure to use in this particular case is *skb_iter* instead of skb. Addresses-Coverity-ID: 1471906 ("Copy-paste error") Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. 
Miller --- net/tls/tls_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 4995d84d228d..1e968d238adf 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -615,7 +615,7 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) TLS_CIPHER_AES_GCM_128_TAG_SIZE); if (skb_iter->decrypted) - skb_store_bits(skb, offset, buf, copy); + skb_store_bits(skb_iter, offset, buf, copy); offset += copy; buf += copy; -- cgit v1.2.3 From 40999f11ce677ce3c5d0e8f5f76c40192a26b479 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 18 Jul 2018 19:50:06 +0200 Subject: tipc: make link capability update thread safe The commit referred to below introduced an update of the link capabilities field that is not safe. Given the recently added feature to remove idle node and link items after 5 minutes, there is a small risk that the update will happen at the very moment the targeted link is being removed. To avoid this we have to perform the update inside the node item's write lock protection. Fixes: 9012de508956 ("tipc: add sequence number check for link STATE messages") Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/node.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 52fd80b0e728..3819ab14e073 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -370,13 +370,17 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); if (n) { + if (n->capabilities == capabilities) + goto exit; /* Same node may come back with new capabilities */ + write_lock_bh(&n->lock); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } + write_unlock_bh(&n->lock); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); -- cgit v1.2.3 From 7c4ec749a3bd89237d7195ccd621bf5d4124d6b5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 20 Jul 2018 23:37:55 -0700 Subject: net: Init backlog NAPI's gro_hash. Based upon a patch by Sean Tranchetti. Fixes: d4546c2509b1 ("net: Convert GRO SKB handling to list_head.") Signed-off-by: David S. 
Miller --- net/core/dev.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4f8b92d81d10..87c42c8249ae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6115,19 +6115,24 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +static void init_gro_hash(struct napi_struct *napi) { int i; - INIT_LIST_HEAD(&napi->poll_list); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; - napi->gro_bitmask = 0; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; } + napi->gro_bitmask = 0; +} + +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; + init_gro_hash(napi); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -9554,6 +9559,7 @@ static int __init net_dev_init(void) sd->cpu = i; #endif + init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; } -- cgit v1.2.3 From 3033fced2f689d4a870b3ba6a8a676db1261d262 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:51 +0000 Subject: net-sysfs: require net admin in the init ns for setting tx_maxrate An upcoming change will allow container root to open some /sys/class/net files for writing. The tx_maxrate attribute can result in changes to actual hardware devices so err on the side of caution by requiring CAP_NET_ADMIN in the init namespace in the corresponding attribute store operation. Signed-off-by: Tyler Hicks Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ffa1d18f2c2c..405c41ecb20b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1087,6 +1087,9 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, int err, index = get_netdev_queue_index(queue); u32 rate = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = kstrtou32(buf, 10, &rate); if (err < 0) return err; -- cgit v1.2.3 From b0e37c0d8a6abed0cd1b611314a7ebf50b0a8ed4 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:52 +0000 Subject: net-sysfs: make sure objects belong to container's owner When creating various objects in /sys/class/net/... make sure that they belong to container's owner instead of global root (if they belong to a container/namespace). Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 405c41ecb20b..ada065fc685e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -656,6 +656,24 @@ static const struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ +static void net_ns_get_ownership(const struct net *net, + kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} + #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) @@ -905,11 +923,20 @@ static const void *rx_queue_namespace(struct kobject *kobj) return 
ns; } +static void rx_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = rx_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + static struct kobj_type rx_queue_ktype __ro_after_init = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_attrs = rx_queue_default_attrs, - .namespace = rx_queue_namespace + .namespace = rx_queue_namespace, + .get_ownership = rx_queue_get_ownership, }; static int rx_queue_add_kobject(struct net_device *dev, int index) @@ -1431,11 +1458,20 @@ static const void *netdev_queue_namespace(struct kobject *kobj) return ns; } +static void netdev_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = netdev_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + static struct kobj_type netdev_queue_ktype __ro_after_init = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_attrs = netdev_queue_default_attrs, .namespace = netdev_queue_namespace, + .get_ownership = netdev_queue_get_ownership, }; static int netdev_queue_add_kobject(struct net_device *dev, int index) @@ -1625,6 +1661,14 @@ static const void *net_namespace(struct device *d) return dev_net(dev); } +static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid) +{ + struct net_device *dev = to_net_dev(d); + const struct net *net = dev_net(dev); + + net_ns_get_ownership(net, uid, gid); +} + static struct class net_class __ro_after_init = { .name = "net", .dev_release = netdev_release, @@ -1632,6 +1676,7 @@ static struct class net_class __ro_after_init = { .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, + .get_ownership = net_get_ownership, }; #ifdef CONFIG_OF_NET -- cgit v1.2.3 From fbdeaed408cf2728c62640c10848ddb1b67e63d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:53 +0000 Subject: net: create reusable function for getting ownership 
info of sysfs inodes Make net_ns_get_ownership() reusable by networking code outside of core. This is useful, for example, to allow bridge related sysfs files to be owned by container root. Add a function comment since this is a potentially dangerous function to use given the way that kobject_get_ownership() works by initializing uid and gid before calling .get_ownership(). Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ net/core/net-sysfs.c | 18 ------------------ net/core/net_namespace.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a71264d75d7f..9b5fdc50519a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,8 @@ extern struct net init_net; struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); + void net_ns_barrier(void); #else /* CONFIG_NET_NS */ #include @@ -182,6 +185,13 @@ static inline struct net *copy_net_ns(unsigned long flags, return old_net; } +static inline void net_ns_get_ownership(const struct net *net, + kuid_t *uid, kgid_t *gid) +{ + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; +} + static inline void net_ns_barrier(void) {} #endif /* CONFIG_NET_NS */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ada065fc685e..0a95bcf64cdc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -656,24 +656,6 @@ static const struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ -static void net_ns_get_ownership(const struct net *net, - kuid_t *uid, kgid_t *gid) -{ - if (net) { - kuid_t ns_root_uid = make_kuid(net->user_ns, 0); - kgid_t ns_root_gid = make_kgid(net->user_ns, 
0); - - if (uid_valid(ns_root_uid)) - *uid = ns_root_uid; - - if (gid_valid(ns_root_gid)) - *gid = ns_root_gid; - } else { - *uid = GLOBAL_ROOT_UID; - *gid = GLOBAL_ROOT_GID; - } -} - #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a11e03f920d3..738871af5efa 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -448,6 +449,33 @@ dec_ucounts: return net; } +/** + * net_ns_get_ownership - get sysfs ownership data for @net + * @net: network namespace in question (can be NULL) + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns the uid/gid pair of root in the user namespace associated with the + * given network namespace. + */ +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} +EXPORT_SYMBOL_GPL(net_ns_get_ownership); + static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; -- cgit v1.2.3 From 705e0dea4d52ef420a7d37fd9cc6725092e5e1ff Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:54 +0000 Subject: bridge: make sure objects belong to container's owner When creating various bridge objects in /sys/class/net/... make sure that they belong to the container's owner instead of global root (if they belong to a container/namespace). Signed-off-by: Tyler Hicks Signed-off-by: David S. 
Miller --- net/bridge/br_if.c | 9 +++++++++ net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_if.c | 5 ++--- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 05e42d86882d..e7c8d55212aa 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "br_private.h" @@ -204,11 +205,19 @@ static void release_nbp(struct kobject *kobj) kfree(p); } +static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +{ + struct net_bridge_port *p = kobj_to_brport(kobj); + + net_ns_get_ownership(dev_net(p->dev), uid, gid); +} + static struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif .release = release_nbp, + .get_ownership = brport_get_ownership, }; static void destroy_nbp(struct net_bridge_port *p) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5216a524b537..cf0005d2a4d0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -283,6 +283,8 @@ struct net_bridge_port { u16 group_fwd_mask; }; +#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) + #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) #define br_promisc_port(p) ((p)->flags & BR_PROMISC) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index f99c5bf5c906..ab4c7f8adf68 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -249,13 +249,12 @@ static const struct brport_attribute *brport_attrs[] = { }; #define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) -#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj) static ssize_t brport_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct brport_attribute *brport_attr = to_brport_attr(attr); - struct net_bridge_port *p = to_brport(kobj); + struct net_bridge_port *p = kobj_to_brport(kobj); if (!brport_attr->show) return -EINVAL; @@ 
-268,7 +267,7 @@ static ssize_t brport_store(struct kobject *kobj, const char *buf, size_t count) { struct brport_attribute *brport_attr = to_brport_attr(attr); - struct net_bridge_port *p = to_brport(kobj); + struct net_bridge_port *p = kobj_to_brport(kobj); ssize_t ret = -EINVAL; char *endp; unsigned long val; -- cgit v1.2.3 From 9bcc66e1983d10861deb6920fb0c151c5b01772a Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 19 Jul 2018 11:14:42 +1000 Subject: tcp: convert icsk_user_timeout from jiffies to msecs This is a preparatory commit, part of the series that improves the accuracy of the TCP_USER_TIMEOUT socket option. Implement Eric Dumazet's idea to convert icsk->icsk_user_timeout from jiffies to msecs, eliminating the msecs_to_jiffies() and jiffies_to_msecs() dance in future. Signed-off-by: Jon Maxwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_timer.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bce53b1728a6..514aaac1626f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2989,7 +2989,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (val < 0) err = -EINVAL; else - icsk->icsk_user_timeout = msecs_to_jiffies(val); + icsk->icsk_user_timeout = val; break; case TCP_FASTOPEN: @@ -3445,7 +3445,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_USER_TIMEOUT: - val = jiffies_to_msecs(icsk->icsk_user_timeout); + val = icsk->icsk_user_timeout; break; case TCP_FASTOPEN: diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3b3611729928..fa34984d0b12 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -183,8 +183,9 @@ static bool retransmits_timed_out(struct sock *sk, else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts)
>= jiffies_to_msecs(timeout); + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; } /* A write timeout has occurred. Process the after effects. */ @@ -337,8 +338,7 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > - jiffies_to_msecs(icsk->icsk_user_timeout)) + (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -672,7 +672,7 @@ static void tcp_keepalive_timer (struct timer_list *t) * to determine when to timeout instead. */ if ((icsk->icsk_user_timeout != 0 && - elapsed >= icsk->icsk_user_timeout && + elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && icsk->icsk_probes_out > 0) || (icsk->icsk_user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { -- cgit v1.2.3 From a7fa37703d495310819d0a6747e5b32362305374 Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 19 Jul 2018 11:14:43 +1000 Subject: tcp: Add tcp_retransmit_stamp() helper routine Create a separate helper routine as per Neal Cardwell's suggestion, to be used by the final commit in this series and by retransmits_timed_out(). Signed-off-by: Jon Maxwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index fa34984d0b12..d212f183dd2d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,20 @@ #include #include +u32 tcp_retransmit_stamp(const struct sock *sk) +{ + u32 start_ts = tcp_sk(sk)->retrans_stamp; + + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return 0; + start_ts = tcp_skb_timestamp(head); + } + return start_ts; +} + /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on.
@@ -166,14 +180,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return false; - start_ts = tcp_skb_timestamp(head); - } + start_ts = tcp_retransmit_stamp(sk); + if (!start_ts) + return false; if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); -- cgit v1.2.3 From b701a99e431db784714c32fc6b68123045714679 Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 19 Jul 2018 11:14:44 +1000 Subject: tcp: Add tcp_clamp_rto_to_user_timeout() helper to improve accuracy Create the tcp_clamp_rto_to_user_timeout() helper routine. To calculate the correct rto, so that the TCP_USER_TIMEOUT socket option is more accurate. Taking suggestions and feedback into account from Eric Dumazet, Neal Cardwell and David Laight. Due to the 1st commit we can avoid the msecs_to_jiffies() and jiffies_to_msecs() dance. Signed-off-by: Jon Maxwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_timer.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d212f183dd2d..a242f8874629 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -36,6 +36,21 @@ u32 tcp_retransmit_stamp(const struct sock *sk) return start_ts; } +static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + u32 elapsed, start_ts; + + start_ts = tcp_retransmit_stamp(sk); + if (!icsk->icsk_user_timeout || !start_ts) + return icsk->icsk_rto; + elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; + if (elapsed >= icsk->icsk_user_timeout) + return 1; /* user timeout has passed; fire ASAP */ + else + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); +} + /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -544,7 +559,8 @@ out_reset_timer: /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); -- cgit v1.2.3 From baa2d2b17ee93e2a5b8accc2dd5328db17caf90e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 18 Jul 2018 23:14:17 -0500 Subject: net: sched: use PTR_ERR_OR_ZERO macro in tcf_block_cb_register This line makes up what macro PTR_ERR_OR_ZERO already does. So, make use of PTR_ERR_OR_ZERO rather than an open-code version. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 623fe2cfe529..620067209ba8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -818,7 +818,7 @@ int tcf_block_cb_register(struct tcf_block *block, block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, extack); - return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; + return PTR_ERR_OR_ZERO(block_cb); } EXPORT_SYMBOL(tcf_block_cb_register); -- cgit v1.2.3 From e064cce130497023806e2ae6a4114f1fed28eacd Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 19 Jul 2018 17:16:59 +0800 Subject: tipc: make some functions static Fixes the following sparse warnings: net/tipc/link.c:376:5: warning: symbol 'link_bc_rcv_gap' was not declared. Should it be static? net/tipc/link.c:823:6: warning: symbol 'link_prepare_wakeup' was not declared. Should it be static? net/tipc/link.c:959:6: warning: symbol 'tipc_link_advance_backlog' was not declared. Should it be static? net/tipc/link.c:1009:5: warning: symbol 'tipc_link_retrans' was not declared. Should it be static? net/tipc/monitor.c:687:5: warning: symbol '__tipc_nl_add_monitor_peer' was not declared. Should it be static? net/tipc/group.c:230:20: warning: symbol 'tipc_group_find_member' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- net/tipc/group.c | 4 ++-- net/tipc/link.c | 11 ++++++----- net/tipc/monitor.c | 3 ++- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index 8f43e7d6046b..e82f13cb2dc5 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -227,8 +227,8 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp) kfree(grp); } -struct tipc_member *tipc_group_find_member(struct tipc_group *grp, - u32 node, u32 port) +static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, + u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; diff --git a/net/tipc/link.c b/net/tipc/link.c index 6987ffc8e7a1..b1f0bee54eac 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -373,7 +373,7 @@ int tipc_link_bc_peers(struct tipc_link *l) return l->ackers; } -u16 link_bc_rcv_gap(struct tipc_link *l) +static u16 link_bc_rcv_gap(struct tipc_link *l) { struct sk_buff *skb = skb_peek(&l->deferdq); u16 gap = 0; @@ -820,7 +820,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) * Wake up a number of waiting users, as permitted by available space * in the send queue */ -void link_prepare_wakeup(struct tipc_link *l) +static void link_prepare_wakeup(struct tipc_link *l) { struct sk_buff *skb, *tmp; int imp, i = 0; @@ -956,7 +956,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return rc; } -void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) +static void tipc_link_advance_backlog(struct tipc_link *l, + struct sk_buff_head *xmitq) { struct sk_buff *skb, *_skb; struct tipc_msg *hdr; @@ -1006,8 +1007,8 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) * @to: retransmit to (inclusive) this sequence number * xmitq: queue for accumulating the retransmitted packets */ -int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct 
sk_buff_head *xmitq) +static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 5453e564da82..67f69389ec17 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -684,7 +684,8 @@ int tipc_nl_monitor_get_threshold(struct net *net) return tn->mon_threshold; } -int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) +static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, + struct tipc_nl_msg *msg) { struct tipc_mon_domain *dom = peer->domain; struct nlattr *attrs; -- cgit v1.2.3 From ef32477971b50c1fa11f52f5ed44cfbc98075030 Mon Sep 17 00:00:00 2001 From: Mark Railton Date: Fri, 20 Jul 2018 00:11:46 +0100 Subject: net: wimax: stack: fixed multi line comment issue Moved end of comment to its own line per guide Signed-off-by: Mark Railton Signed-off-by: David S. Miller --- net/wimax/stack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 5db731512014..73dba9c077bb 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -486,7 +486,8 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); /* Do the RFKILL setup before locking, as RFKILL will call - * into our functions. */ + * into our functions. + */ wimax_dev->net_dev = net_dev; result = wimax_rfkill_add(wimax_dev); if (result < 0) -- cgit v1.2.3 From 0ae0d60a379c11d6f3b11d9b9e8dbdd1fc683a1a Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 20 Jul 2018 14:07:42 +0800 Subject: multicast: remove useless parameter for group add Remove the mode parameter for igmp/igmp6_group_added as we can get it from the first parameter.
Fixes: 6e2059b53f988 (ipv4/igmp: init group mode as INCLUDE when join source group) Fixes: c7ea20c9da5b9 (ipv6/mcast: init as INCLUDE when join SSM INCLUDE group) Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 10 +++++----- net/ipv6/mcast.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b3c899a630a0..598333b123b9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1289,7 +1289,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #endif } -static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) +static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1321,7 +1321,7 @@ static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) * not send filter-mode change record as the mode should be from * IN() to IN(A). */ - if (mode == MCAST_EXCLUDE) + if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_event(in_dev); @@ -1432,7 +1432,7 @@ void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif - igmp_group_added(im, mode); + igmp_group_added(im); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: @@ -1699,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc, pmc->sfmode); + igmp_group_added(pmc); } } @@ -1762,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc, pmc->sfmode); + igmp_group_added(pmc); } } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 2699be7202be..195ed2db2207 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -660,7 +660,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } 
-static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode) +static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; @@ -690,7 +690,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode) * should not send filter-mode change record as the mode * should be from IN() to IN(A). */ - if (mode == MCAST_EXCLUDE) + if (mc->mca_sfmode == MCAST_EXCLUDE) mc->mca_crcount = mc->idev->mc_qrv; mld_ifc_event(mc->idev); @@ -932,7 +932,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, write_unlock_bh(&idev->lock); mld_del_delrec(idev, mc); - igmp6_group_added(mc, mode); + igmp6_group_added(mc); ma_put(mc); return 0; } @@ -2572,7 +2572,7 @@ void ipv6_mc_up(struct inet6_dev *idev) ipv6_mc_reset(idev); for (i = idev->mc_list; i; i = i->next) { mld_del_delrec(idev, i); - igmp6_group_added(i, i->mca_sfmode); + igmp6_group_added(i); } read_unlock_bh(&idev->lock); } -- cgit v1.2.3 From a5f3ea54f3ccd881562d47f34b4a83441796bf19 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 23 Jul 2018 11:16:58 +0300 Subject: net: bridge: add support for raw sysfs port options This patch adds a new alternative store callback for port sysfs options which takes a raw value (buf) and can use it directly. It is needed for the backup port sysfs support since we have to pass the device by its name. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_sysfs_if.c | 56 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index ab4c7f8adf68..4ac940067754 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -25,6 +25,15 @@ struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); int (*store)(struct net_bridge_port *, unsigned long); + int (*store_raw)(struct net_bridge_port *, char *); +}; + +#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \ +const struct brport_attribute brport_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = _mode }, \ + .show = _show, \ + .store_raw = _store, \ }; #define BRPORT_ATTR(_name, _mode, _show, _store) \ @@ -269,27 +278,46 @@ static ssize_t brport_store(struct kobject *kobj, struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = kobj_to_brport(kobj); ssize_t ret = -EINVAL; - char *endp; unsigned long val; + char *endp; if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - val = simple_strtoul(buf, &endp, 0); - if (endp != buf) { - if (!rtnl_trylock()) - return restart_syscall(); - if (p->dev && p->br && brport_attr->store) { - spin_lock_bh(&p->br->lock); - ret = brport_attr->store(p, val); - spin_unlock_bh(&p->br->lock); - if (!ret) { - br_ifinfo_notify(RTM_NEWLINK, NULL, p); - ret = count; - } + if (!rtnl_trylock()) + return restart_syscall(); + + if (!p->dev || !p->br) + goto out_unlock; + + if (brport_attr->store_raw) { + char *buf_copy; + + buf_copy = kstrndup(buf, count, GFP_KERNEL); + if (!buf_copy) { + ret = -ENOMEM; + goto out_unlock; } - rtnl_unlock(); + spin_lock_bh(&p->br->lock); + ret = brport_attr->store_raw(p, buf_copy); + spin_unlock_bh(&p->br->lock); + kfree(buf_copy); + } else if (brport_attr->store) { + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + goto out_unlock; 
+ spin_lock_bh(&p->br->lock); + ret = brport_attr->store(p, val); + spin_unlock_bh(&p->br->lock); } + + if (!ret) { + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + ret = count; + } +out_unlock: + rtnl_unlock(); + return ret; } -- cgit v1.2.3 From 2756f68c314917d03eb348084edb08bb929139d9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 23 Jul 2018 11:16:59 +0300 Subject: net: bridge: add support for backup port This patch adds a new port attribute - IFLA_BRPORT_BACKUP_PORT, which allows to set a backup port to be used for known unicast traffic if the port has gone carrier down. The backup pointer is rcu protected and set only under RTNL, a counter is maintained so when deleting a port we know how many other ports reference it as a backup and we remove it from all. Also the pointer is in the first cache line which is hot at the time of the check and thus in the common case we only add one more test. The backup port will be used only for the non-flooding case since it's a part of the bridge and the flooded packets will be forwarded to it anyway. To remove the forwarding just send a 0/non-existing backup port. This is used to avoid numerous scalability problems when using MLAG most notably if we have thousands of fdbs one would need to change all of them on port carrier going down which takes too long and causes a storm of fdb notifications (and again when the port comes back up). In a Multi-chassis Link Aggregation setup usually hosts are connected to two different switches which act as a single logical switch. Those switches usually have a control and backup link between them called peerlink which might be used for communication in case a host loses connectivity to one of them. 
We need a fast way to fail over in case a host port goes down and currently none of the solutions (like bond) can fulfill the requirements because the participating ports are actually the "master" devices and must have the same peerlink as their backup interface and at the same time all of them must participate in the bridge device. As Roopa noted it's normal practice in routing called fast re-route where a precalculated backup path is used when the main one is down. Another use case of this is with EVPN, having a single vxlan device which is backup of every port. Due to the nature of master devices it's not currently possible to use one device as a backup for many and still have all of them participate in the bridge (which is master itself). More detailed information about MLAG is available at the link below. https://docs.cumulusnetworks.com/display/DOCS/Multi-Chassis+Link+Aggregation+-+MLAG Further explanation and a diagram by Roopa: Two switches acting in an MLAG pair are connected by the peerlink interface which is a bridge port. The config on one of the switches looks like the below. The other switch also has a similar config. eth0 is connected to one port on the server. And the server is connected to both switches. br0 -- team0---eth0 | -- switch-peerlink Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 16 ++++++++++++- net/bridge/br_if.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_netlink.c | 30 ++++++++++++++++++++++++- net/bridge/br_private.h | 3 +++ net/bridge/br_sysfs_if.c | 33 +++++++++++++++++++++++++++ 6 files changed, 134 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8759cfb8aa2e..01b5069a73a5 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -334,6 +334,7 @@ enum { IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 9019f326fe81..5372e2042adf 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev, void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig) { - if (to && should_deliver(to, skb)) { + if (unlikely(!to)) + goto out; + + /* redirect to backup link if the destination port is down */ + if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) { + struct net_bridge_port *backup_port; + + backup_port = rcu_dereference(to->backup_port); + if (unlikely(!backup_port)) + goto out; + to = backup_port; + } + + if (should_deliver(to, skb)) { if (local_rcv) deliver_clone(to, skb, local_orig); else @@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to, return; } +out: if (!local_rcv) kfree_skb(skb); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e7c8d55212aa..0363f1bdc401 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -170,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br) } } +int nbp_backup_change(struct net_bridge_port *p, + struct net_device *backup_dev) +{ + struct 
net_bridge_port *old_backup = rtnl_dereference(p->backup_port); + struct net_bridge_port *backup_p = NULL; + + ASSERT_RTNL(); + + if (backup_dev) { + if (!br_port_exists(backup_dev)) + return -ENOENT; + + backup_p = br_port_get_rtnl(backup_dev); + if (backup_p->br != p->br) + return -EINVAL; + } + + if (p == backup_p) + return -EINVAL; + + if (old_backup == backup_p) + return 0; + + /* if the backup link is already set, clear it */ + if (old_backup) + old_backup->backup_redirected_cnt--; + + if (backup_p) + backup_p->backup_redirected_cnt++; + rcu_assign_pointer(p->backup_port, backup_p); + + return 0; +} + +static void nbp_backup_clear(struct net_bridge_port *p) +{ + nbp_backup_change(p, NULL); + if (p->backup_redirected_cnt) { + struct net_bridge_port *cur_p; + + list_for_each_entry(cur_p, &p->br->port_list, list) { + struct net_bridge_port *backup_p; + + backup_p = rtnl_dereference(cur_p->backup_port); + if (backup_p == p) + nbp_backup_change(cur_p, NULL); + } + } + + WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt); +} + static void nbp_update_port_count(struct net_bridge *br) { struct net_bridge_port *p; @@ -295,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p) nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); switchdev_deferred_process(); + nbp_backup_clear(p); nbp_update_port_count(br); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 9f5eb05b0373..ec2b58a09f76 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask) + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ + nla_total_size(br_get_link_af_size_filtered(dev, - filter_mask)); /* IFLA_AF_SPEC */ + filter_mask)) /* IFLA_AF_SPEC */ + + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */ } static int br_port_fill_attrs(struct sk_buff *skb, const struct net_bridge_port *p) { u8 mode = !!(p->flags & 
BR_HAIRPIN_MODE); + struct net_bridge_port *backup_p; u64 timerval; if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) || @@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb, return -EMSGSIZE; #endif + /* we might be called only with br->lock */ + rcu_read_lock(); + backup_p = rcu_dereference(p->backup_port); + if (backup_p) + nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT, + backup_p->dev->ifindex); + rcu_read_unlock(); + return 0; } @@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, + [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; + if (tb[IFLA_BRPORT_BACKUP_PORT]) { + struct net_device *backup_dev = NULL; + u32 backup_ifindex; + + backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]); + if (backup_ifindex) { + backup_dev = __dev_get_by_index(dev_net(p->dev), + backup_ifindex); + if (!backup_dev) + return -ENOENT; + } + + err = nbp_backup_change(p, backup_dev); + if (err) + return err; + } + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index cf0005d2a4d0..11ed2029985f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -237,6 +237,7 @@ struct net_bridge_port { #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif + struct net_bridge_port __rcu *backup_port; /* STP */ u8 priority; @@ -281,6 +282,7 @@ struct net_bridge_port { int offload_fwd_mark; #endif u16 group_fwd_mask; + u16 backup_redirected_cnt; }; #define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) @@ -597,6 +599,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br, 
netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge *br); +int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 4ac940067754..7c87a2fe5248 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -191,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p, static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); +static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) +{ + struct net_bridge_port *backup_p; + int ret = 0; + + rcu_read_lock(); + backup_p = rcu_dereference(p->backup_port); + if (backup_p) + ret = sprintf(buf, "%s\n", backup_p->dev->name); + rcu_read_unlock(); + + return ret; +} + +static int store_backup_port(struct net_bridge_port *p, char *buf) +{ + struct net_device *backup_dev = NULL; + char *nl = strchr(buf, '\n'); + + if (nl) + *nl = '\0'; + + if (strlen(buf) > 0) { + backup_dev = __dev_get_by_name(dev_net(p->dev), buf); + if (!backup_dev) + return -ENOENT; + } + + return nbp_backup_change(p, backup_dev); +} +static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -254,6 +286,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, &brport_attr_isolated, + &brport_attr_backup_port, NULL }; -- cgit v1.2.3 From 7fa41efac14ffbe8db7660ad2da3928969d10caf Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Jul 2018 16:33:19 +0800 Subject: ipv6: sr: Use kmemdup instead of duplicating it in parse_nla_srh Replace calls to kmalloc followed by a memcpy with a direct 
call to kmemdup. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/seg6_local.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index cd6e4cab63f6..e1025b493a18 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -637,12 +637,10 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) if (!seg6_validate_srh(srh, len)) return -EINVAL; - slwt->srh = kmalloc(len, GFP_KERNEL); + slwt->srh = kmemdup(srh, len, GFP_KERNEL); if (!slwt->srh) return -ENOMEM; - memcpy(slwt->srh, srh, len); - slwt->headroom += len; return 0; -- cgit v1.2.3 From c601171d7a60b5b09d7c2fe0579953323a80744e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 23 Jul 2018 13:53:08 +0200 Subject: net/smc: provide smc mode in smc_diag.c Rename field diag_fallback into diag_mode and set the smc mode of a connection explicitly. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- include/uapi/linux/smc_diag.h | 9 ++++++++- net/smc/smc_diag.c | 7 ++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 92be255e534c..48ae3ee22b2d 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -20,7 +20,7 @@ struct smc_diag_req { struct smc_diag_msg { __u8 diag_family; __u8 diag_state; - __u8 diag_fallback; + __u8 diag_mode; __u8 diag_shutdown; struct inet_diag_sockid id; @@ -28,6 +28,13 @@ struct smc_diag_msg { __u64 diag_inode; }; +/* Mode of a connection */ +enum { + SMC_DIAG_MODE_SMCR, + SMC_DIAG_MODE_FALLBACK_TCP, + SMC_DIAG_MODE_SMCD, +}; + /* Extensions */ enum { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 6d83eef1b743..d772cd10297e 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -91,7 +91,12 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, r = nlmsg_data(nlh); smc_diag_msg_common_fill(r, sk); r->diag_state = sk->sk_state; - r->diag_fallback = smc->use_fallback; + if (smc->use_fallback) + r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; + else if (smc->conn.lgr && smc->conn.lgr->is_smcd) + r->diag_mode = SMC_DIAG_MODE_SMCD; + else + r->diag_mode = SMC_DIAG_MODE_SMCR; user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; -- cgit v1.2.3 From bac6de7b637018f4caacfdf2b4ad8c8749de7420 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jul 2018 13:53:09 +0200 Subject: net/smc: eliminate cursor read and write calls The functions to read and write cursors are exclusively used to copy cursors. Therefore switch to a respective function instead. Signed-off-by: Stefan Raspl Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 8 ++------ net/smc/smc_cdc.c | 33 ++++++++++++--------------------- net/smc/smc_cdc.h | 43 +++++++++++++++---------------------------- net/smc/smc_rx.c | 15 ++++----------- net/smc/smc_tx.c | 46 +++++++++++++--------------------------------- net/smc/smc_tx.h | 4 ++-- 6 files changed, 48 insertions(+), 101 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 143b2220c0c8..7fc810ec31c5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1755,12 +1755,8 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, smc->sk.sk_state == SMC_CLOSED) { answ = 0; } else { - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); - smc_curs_write(&urg, - smc_curs_read(&conn->urg_curs, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + smc_curs_copy(&urg, &conn->urg_curs, conn); answ = smc_curs_diff(conn->rmb_desc->len, &cons, &urg) == 1; } diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 621d8cca570b..f3a1497953ee 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -34,14 +34,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, enum ib_wc_status wc_status) { struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; + struct smc_connection *conn = cdcpend->conn; struct smc_sock *smc; int diff; - if (!cdcpend->conn) + if (!conn) /* already dismissed */ return; - smc = container_of(cdcpend->conn, struct smc_sock, conn); + smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, @@ -52,9 +53,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, atomic_add(diff, &cdcpend->conn->sndbuf_space); /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); - smc_curs_write(&cdcpend->conn->tx_curs_fin, - smc_curs_read(&cdcpend->cursor, cdcpend->conn), - cdcpend->conn); + 
smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); } smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); @@ -110,9 +109,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, &conn->local_tx_ctrl, conn); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) - smc_curs_write(&conn->rx_curs_confirmed, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&conn->rx_curs_confirmed, + &conn->local_tx_ctrl.cons, conn); return rc; } @@ -194,8 +192,8 @@ int smcd_cdc_msg_send(struct smc_connection *conn) rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); if (rc) return rc; - smc_curs_write(&conn->rx_curs_confirmed, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn); + smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons, + conn); /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -204,8 +202,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn) atomic_add(diff, &conn->sndbuf_space); /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); - smc_curs_write(&conn->tx_curs_fin, - smc_curs_read(&conn->tx_curs_sent, conn), conn); + smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); smc_tx_sndbuf_nonfull(smc); return rc; @@ -225,9 +222,7 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, char *base; /* new data included urgent business */ - smc_curs_write(&conn->urg_curs, - smc_curs_read(&conn->local_rx_ctrl.prod, conn), - conn); + smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn); conn->urg_state = SMC_URG_VALID; if (!sock_flag(&smc->sk, SOCK_URGINLINE)) /* we'll skip the urgent byte, so don't account for it */ @@ -247,12 +242,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_connection *conn = &smc->conn; int diff_cons, diff_prod; - smc_curs_write(&prod_old, - smc_curs_read(&conn->local_rx_ctrl.prod, conn), - 
conn); - smc_curs_write(&cons_old, - smc_curs_read(&conn->local_rx_ctrl.cons, conn), - conn); + smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); + smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn); diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old, diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 8fbce4fee3e4..934df4473a7c 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -104,47 +104,34 @@ static inline u64 smc_curs_read(union smc_host_cursor *curs, #endif } -static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - -static inline void smc_curs_write(union smc_host_cursor *curs, u64 val, - struct smc_connection *conn) +/* Copy cursor src into tgt */ +static inline void smc_curs_copy(union smc_host_cursor *tgt, + union smc_host_cursor *src, + struct smc_connection *conn) { #ifndef KERNEL_HAS_ATOMIC64 unsigned long flags; spin_lock_irqsave(&conn->acurs_lock, flags); - curs->acurs = val; + tgt->acurs = src->acurs; spin_unlock_irqrestore(&conn->acurs_lock, flags); #else - atomic64_set(&curs->acurs, val); + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); #endif } -static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val, - struct smc_connection *conn) +static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt, + union smc_cdc_cursor *src, + struct smc_connection *conn) { #ifndef KERNEL_HAS_ATOMIC64 unsigned long flags; spin_lock_irqsave(&conn->acurs_lock, flags); - curs->acurs = val; + tgt->acurs = src->acurs; spin_unlock_irqrestore(&conn->acurs_lock, flags); #else - atomic64_set(&curs->acurs, val); + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); 
#endif } @@ -179,7 +166,7 @@ static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, { union smc_host_cursor temp; - smc_curs_write(&temp, smc_curs_read(local, conn), conn); + smc_curs_copy(&temp, local, conn); peer->count = htonl(temp.count); peer->wrap = htons(temp.wrap); /* peer->reserved = htons(0); must be ensured by caller */ @@ -206,8 +193,8 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, union smc_host_cursor temp, old; union smc_cdc_cursor net; - smc_curs_write(&old, smc_curs_read(local, conn), conn); - smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn); + smc_curs_copy(&old, local, conn); + smc_curs_copy_net(&net, peer, conn); temp.count = ntohl(net.count); temp.wrap = ntohs(net.wrap); if ((old.wrap > temp.wrap) && temp.wrap) @@ -215,7 +202,7 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, if ((old.wrap == temp.wrap) && (old.count > temp.count)) return; - smc_curs_write(local, smc_curs_read(&temp, conn), conn); + smc_curs_copy(local, &temp, conn); } static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b329803c8339..c99c987097b1 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -82,8 +82,7 @@ static int smc_rx_update_consumer(struct smc_sock *smc, } } - smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn), - conn); + smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn); /* send consumer cursor update if required */ /* similar to advertising new TCP rcv_wnd if required */ @@ -97,8 +96,7 @@ static void smc_rx_update_cons(struct smc_sock *smc, size_t len) struct smc_connection *conn = &smc->conn; union smc_host_cursor cons; - smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_rx_update_consumer(smc, cons, len); } @@ -245,10 +243,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr 
*msg, int len, if (!(flags & MSG_TRUNC)) rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); len = 1; - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, - conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); if (smc_curs_diff(conn->rmb_desc->len, &cons, &conn->urg_curs) > 1) conn->urg_rx_skip_pend = true; @@ -370,9 +365,7 @@ copy: continue; } - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); /* subsequent splice() calls pick up where previous left */ if (splbytes) smc_curs_add(conn->rmb_desc->len, &cons, splbytes); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 142bcb134dd6..2f5e324e54b9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -181,9 +181,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) copylen = min_t(size_t, send_remaining, writespace); /* determine start of sndbuf */ sndbuf_base = conn->sndbuf_desc->cpu_addr; - smc_curs_write(&prep, - smc_curs_read(&conn->tx_curs_prep, conn), - conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ @@ -214,9 +212,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); - smc_curs_write(&conn->tx_curs_prep, - smc_curs_read(&prep, conn), - conn); + smc_curs_copy(&conn->tx_curs_prep, &prep, conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); @@ -417,8 +413,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) int rc; /* source: sndbuf */ - smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); - smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + smc_curs_copy(&sent, 
&conn->tx_curs_sent, conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); /* cf. wmem_alloc - (snd_max - snd_una) */ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) @@ -429,12 +425,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) rmbespace = atomic_read(&conn->peer_rmbe_space); if (rmbespace <= 0) return 0; - smc_curs_write(&prod, - smc_curs_read(&conn->local_tx_ctrl.prod, conn), - conn); - smc_curs_write(&cons, - smc_curs_read(&conn->local_rx_ctrl.cons, conn), - conn); + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ pflags = &conn->local_tx_ctrl.prod_flags; @@ -481,14 +473,9 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ - smc_curs_write(&conn->local_tx_ctrl.prod, - smc_curs_read(&prod, conn), - conn); + smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn); /* dst: peer RMBE */ - smc_curs_write(&conn->tx_curs_sent, - smc_curs_read(&sent, conn), - conn); - /* src: local sndbuf */ + smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */ return 0; } @@ -606,17 +593,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) int sender_free = conn->rmb_desc->len; int to_confirm; - smc_curs_write(&cons, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); - smc_curs_write(&cfed, - smc_curs_read(&conn->rx_curs_confirmed, conn), - conn); + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (to_confirm > conn->rmbe_update_limit) { - smc_curs_write(&prod, - smc_curs_read(&conn->local_rx_ctrl.prod, conn), - conn); + smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); 
sender_free = conn->rmb_desc->len - smc_curs_diff(conn->rmb_desc->len, &prod, &cfed); } @@ -632,9 +613,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) SMC_TX_WORK_DELAY); return; } - smc_curs_write(&conn->rx_curs_confirmed, - smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); + smc_curs_copy(&conn->rx_curs_confirmed, + &conn->local_tx_ctrl.cons, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } if (conn->local_rx_ctrl.prod_flags.write_blocked && diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index b22bdc5694c4..07e6ad76224a 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -22,8 +22,8 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) { union smc_host_cursor sent, prep; - smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); - smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + smc_curs_copy(&sent, &conn->tx_curs_sent, conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } -- cgit v1.2.3 From 00e5fb263f9f5f2af60754b79b7dcec0d5e88154 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 23 Jul 2018 13:53:10 +0200 Subject: net/smc: add function to get link group from link Replace a frequently used construct with a more readable variant, reducing the code. Also might come in handy when we start to support more than a single link per link group. Signed-off-by: Stefan Raspl Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_cdc.c | 2 +- net/smc/smc_core.h | 5 +++++ net/smc/smc_ib.c | 3 +-- net/smc/smc_llc.c | 30 ++++++++---------------------- net/smc/smc_wr.c | 27 +++++---------------------- 5 files changed, 20 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f3a1497953ee..a7af2289cdff 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -365,7 +365,7 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) return; /* invalid message */ /* lookup connection */ - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8b47e0168fc3..8807865483bb 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -266,4 +266,9 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, u64 peer_gid); void smcd_conn_free(struct smc_connection *conn); void smc_core_exit(void); + +static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) +{ + return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); +} #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 36de2fd76170..4706ab7092a9 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -112,8 +112,7 @@ int smc_ib_modify_qp_reset(struct smc_link *lnk) int smc_ib_ready_link(struct smc_link *lnk) { - struct smc_link_group *lgr = - container_of(lnk, struct smc_link_group, lnk[0]); + struct smc_link_group *lgr = smc_get_lgr(lnk); int rc = 0; rc = smc_ib_modify_qp_init(lnk); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5800a6b43d83..b7944aa1ffc3 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -186,8 +186,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = container_of(link, 
struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; @@ -381,11 +380,9 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { - struct smc_link_group *lgr; + struct smc_link_group *lgr = smc_get_lgr(link); int conf_rc; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); - /* RMBE eyecatchers are not supported */ if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) conf_rc = 0; @@ -411,8 +408,7 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, static void smc_llc_rx_add_link(struct smc_link *link, struct smc_llc_msg_add_link *llc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); if (llc->hd.flags & SMC_LLC_FLAG_RESP) { if (link->state == SMC_LNK_ACTIVATING) @@ -442,8 +438,7 @@ static void smc_llc_rx_add_link(struct smc_link *link, static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); if (llc->hd.flags & SMC_LLC_FLAG_RESP) { if (lgr->role == SMC_SERV) @@ -476,17 +471,14 @@ static void smc_llc_rx_test_link(struct smc_link *link, static void smc_llc_rx_confirm_rkey(struct smc_link *link, struct smc_llc_msg_confirm_rkey *llc) { - struct smc_link_group *lgr; int rc; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { link->llc_confirm_rkey_rc = llc->hd.flags & SMC_LLC_FLAG_RKEY_NEG; complete(&link->llc_confirm_rkey); } else { - rc = smc_rtoken_add(lgr, + rc = smc_rtoken_add(smc_get_lgr(link), llc->rtoken[0].rmb_vaddr, 
llc->rtoken[0].rmb_key); @@ -514,18 +506,15 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, static void smc_llc_rx_delete_rkey(struct smc_link *link, struct smc_llc_msg_delete_rkey *llc) { - struct smc_link_group *lgr; u8 err_mask = 0; int i, max; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { /* unused as long as we don't send this type of msg */ } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { - if (smc_rtoken_delete(lgr, llc->rkey[i])) + if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } @@ -583,12 +572,10 @@ static void smc_llc_testlink_work(struct work_struct *work) struct smc_link *link = container_of(to_delayed_work(work), struct smc_link, llc_testlink_wrk); unsigned long next_interval; - struct smc_link_group *lgr; unsigned long expire_time; u8 user_data[16] = { 0 }; int rc; - lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); if (link->state != SMC_LNK_ACTIVE) return; /* don't reschedule worker */ expire_time = link->wr_rx_tstamp + link->llc_testlink_time; @@ -602,7 +589,7 @@ static void smc_llc_testlink_work(struct work_struct *work) rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); if (rc <= 0) { - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); return; } next_interval = link->llc_testlink_time; @@ -613,8 +600,7 @@ out: int smc_llc_link_init(struct smc_link *link) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + struct smc_link_group *lgr = smc_get_lgr(link); link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, *((u32 *)lgr->id), link->link_id); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index dbd2605d1962..b6df69756bef 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -92,8 +92,6 @@ static inline void 
smc_wr_tx_process_cqe(struct ib_wc *wc) if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) return; if (wc->status) { - struct smc_link_group *lgr; - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { /* clear full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[i], 0, @@ -103,9 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this link group abnormally */ - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -188,8 +184,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { struct smc_link_group *lgr; - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); + lgr = smc_get_lgr(link); rc = wait_event_timeout( link->wr_tx_wait, list_empty(&lgr->list) || /* lgr terminated */ @@ -250,12 +245,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); if (rc) { - struct smc_link_group *lgr = - container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); } return rc; } @@ -283,11 +274,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - struct smc_link_group *lgr; - - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -380,8 +367,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) smc_wr_rx_demultiplex(&wc[i]); smc_wr_rx_post(link); /* refill WR RX */ } else { - struct smc_link_group *lgr; - /* handle status errors */ switch 
(wc[i].status) { case IB_WC_RETRY_EXC_ERR: @@ -390,9 +375,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link group * abnormally */ - lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ -- cgit v1.2.3 From 144ce4b9b5a788953b5373162a1921267497fb38 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 23 Jul 2018 13:53:11 +0200 Subject: net/smc: use DECLARE_BITMAP for rtokens_used_mask Link group field rtokens_used_mask is a bitmap. Use macro DECLARE_BITMAP for its definition. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8807865483bb..1e8974c50550 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -192,8 +192,7 @@ struct smc_link_group { struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] [SMC_LINKS_PER_LGR_MAX]; /* remote addr/key pairs */ - unsigned long rtokens_used_mask[BITS_TO_LONGS - (SMC_RMBS_PER_LGR_MAX)]; + DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ }; struct { /* SMC-D */ -- cgit v1.2.3 From 48bf5231771c7e3961c8326353b6027b1bed6eb5 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 23 Jul 2018 13:53:12 +0200 Subject: net/smc: remove local variable page in smc_rx_splice() The page map address is already stored in the RMB descriptor. There is no need to derive it from the cpu_addr value. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_rx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index c99c987097b1..bbcf0fe4ae10 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -155,10 +155,8 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct splice_pipe_desc spd; struct partial_page partial; struct smc_spd_priv *priv; - struct page *page; int bytes; - page = virt_to_page(smc->conn.rmb_desc->cpu_addr); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -170,7 +168,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, spd.nr_pages_max = 1; spd.nr_pages = 1; - spd.pages = &page; + spd.pages = &smc->conn.rmb_desc->pages; spd.partial = &partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; -- cgit v1.2.3 From f34e8bff58f071e1a3452a5a6e29dac218c83f69 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:04 +0200 Subject: net: sched: push ops lookup bits into tcf_proto_lookup_ops() Push all bits that take care of ops lookup, including module loading outside tcf_proto_create() function, into tcf_proto_lookup_ops() Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 620067209ba8..1e8d69790d82 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,7 +39,7 @@ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) +static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) { const struct tcf_proto_ops *t, *res = NULL; @@ -57,6 +57,33 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) return res; } +static const struct tcf_proto_ops * +tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + + ops = __tcf_proto_lookup_ops(kind); + if (ops) + return ops; +#ifdef CONFIG_MODULES + rtnl_unlock(); + request_module("cls_%s", kind); + rtnl_lock(); + ops = __tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to perform + * the module load. So, even if we succeeded in loading + * the module we have to replay the request. We indicate + * this using -EAGAIN. + */ + if (ops) { + module_put(ops->owner); + return ERR_PTR(-EAGAIN); + } +#endif + NL_SET_ERR_MSG(extack, "TC classifier not found"); + return ERR_PTR(-ENOENT); +} + /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) @@ -133,27 +160,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, if (!tp) return ERR_PTR(-ENOBUFS); - err = -ENOENT; - tp->ops = tcf_proto_lookup_ops(kind); - if (!tp->ops) { -#ifdef CONFIG_MODULES - rtnl_unlock(); - request_module("cls_%s", kind); - rtnl_lock(); - tp->ops = tcf_proto_lookup_ops(kind); - /* We dropped the RTNL semaphore in order to perform - * the module load. 
So, even if we succeeded in loading - * the module we have to replay the request. We indicate - * this using -EAGAIN. - */ - if (tp->ops) { - module_put(tp->ops->owner); - err = -EAGAIN; - } else { - NL_SET_ERR_MSG(extack, "TC classifier not found"); - err = -ENOENT; - } -#endif + tp->ops = tcf_proto_lookup_ops(kind, extack); + if (IS_ERR(tp->ops)) { + err = PTR_ERR(tp->ops); goto errout; } tp->classify = tp->ops->classify; -- cgit v1.2.3 From f71e0ca4db187af7c44987e9d21e9042c3046070 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:05 +0200 Subject: net: sched: Avoid implicit chain 0 creation Currently, chain 0 is implicitly created during block creation. However that does not align with chain object exposure, creation and destruction api introduced later on. So make the chain 0 behave the same way as any other chain and only create it when it is needed. Since chain 0 is somehow special as the qdiscs need to hold pointer to the first chain tp, this requires to move the chain head change callback infra to the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 5 ++- net/sched/cls_api.c | 86 +++++++++++++++++++++-------------------------- 2 files changed, 43 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7432100027b7..86f4651784e8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -300,7 +300,6 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct list_head filter_chain_list; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ @@ -318,6 +317,10 @@ struct tcf_block { bool keep_dst; unsigned int offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ + struct { + struct tcf_chain *chain; + struct list_head filter_chain_list; + } chain0; }; static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1e8d69790d82..eb0bf9037ef9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -204,11 +204,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; - INIT_LIST_HEAD(&chain->filter_chain_list); list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; chain->refcnt = 1; + if (!chain->index) + block->chain0.chain = chain; return chain; } @@ -218,12 +219,16 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, if (item->chain_head_change) item->chain_head_change(tp_head, item->chain_head_change_priv); } -static void tcf_chain_head_change(struct tcf_chain *chain, - struct tcf_proto *tp_head) + +static void tcf_chain0_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) { struct tcf_filter_chain_list_item *item; + struct tcf_block *block = chain->block; - 
list_for_each_entry(item, &chain->filter_chain_list, list) + if (chain->index) + return; + list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); } @@ -231,7 +236,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - tcf_chain_head_change(chain, NULL); + tcf_chain0_head_change(chain, NULL); while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp, NULL); @@ -245,8 +250,10 @@ static void tcf_chain_destroy(struct tcf_chain *chain) struct tcf_block *block = chain->block; list_del(&chain->list); + if (!chain->index) + block->chain0.chain = NULL; kfree(chain); - if (list_empty(&block->chain_list)) + if (list_empty(&block->chain_list) && block->refcnt == 0) kfree(block); } @@ -346,10 +353,11 @@ no_offload_dev_dec: } static int -tcf_chain_head_change_cb_add(struct tcf_chain *chain, - struct tcf_block_ext_info *ei, - struct netlink_ext_ack *extack) +tcf_chain0_head_change_cb_add(struct tcf_block *block, + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -359,23 +367,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain, } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; - if (chain->filter_chain) - tcf_chain_head_change_item(item, chain->filter_chain); - list_add(&item->list, &chain->filter_chain_list); + if (chain0 && chain0->filter_chain) + tcf_chain_head_change_item(item, chain0->filter_chain); + list_add(&item->list, &block->chain0.filter_chain_list); return 0; } static void -tcf_chain_head_change_cb_del(struct tcf_chain *chain, - struct tcf_block_ext_info *ei) +tcf_chain0_head_change_cb_del(struct tcf_block *block, + struct tcf_block_ext_info *ei) { + struct tcf_chain *chain0 = block->chain0.chain; struct 
tcf_filter_chain_list_item *item; - list_for_each_entry(item, &chain->filter_chain_list, list) { + list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { - tcf_chain_head_change_item(item, NULL); + if (chain0) + tcf_chain_head_change_item(item, NULL); list_del(&item->list); kfree(item); return; @@ -411,8 +421,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, struct netlink_ext_ack *extack) { struct tcf_block *block; - struct tcf_chain *chain; - int err; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) { @@ -422,14 +430,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->cb_list); INIT_LIST_HEAD(&block->owner_list); + INIT_LIST_HEAD(&block->chain0.filter_chain_list); - /* Create chain 0 by default, it has to be always present. 
*/ - chain = tcf_chain_create(block, 0); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create new tcf chain"); - err = -ENOMEM; - goto err_chain_create; - } block->refcnt = 1; block->net = net; block->index = block_index; @@ -438,10 +440,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, if (!tcf_block_shared(block)) block->q = q; return block; - -err_chain_create: - kfree(block); - return ERR_PTR(err); } static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) @@ -523,11 +521,6 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, return block; } -static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) -{ - return list_first_entry(&block->chain_list, struct tcf_chain, list); -} - struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; @@ -621,10 +614,9 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); - err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block), - ei, extack); + err = tcf_chain0_head_change_cb_add(block, ei, extack); if (err) - goto err_chain_head_change_cb_add; + goto err_chain0_head_change_cb_add; err = tcf_block_offload_bind(block, q, ei, extack); if (err) @@ -634,15 +626,14 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, return 0; err_block_offload_bind: - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); -err_chain_head_change_cb_add: + tcf_chain0_head_change_cb_del(block, ei); +err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: if (created) { if (tcf_block_shared(block)) tcf_block_remove(block, net); err_block_insert: - kfree(tcf_block_chain_zero(block)); kfree(block); } else { block->refcnt--; @@ -682,10 +673,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (!block) return; - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); + 
tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - if (--block->refcnt == 0) { + if (block->refcnt == 1) { if (tcf_block_shared(block)) tcf_block_remove(block, block->net); @@ -701,13 +692,14 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, tcf_block_offload_unbind(block, q, ei); - if (block->refcnt == 0) { + if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_put(chain); - /* Finally, put chain 0 and allow block to be freed. */ - tcf_chain_put(tcf_block_chain_zero(block)); + block->refcnt--; + if (list_empty(&block->chain_list)) + kfree(block); } } EXPORT_SYMBOL(tcf_block_put_ext); @@ -947,7 +939,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_proto *tp) { if (*chain_info->pprev == chain->filter_chain) - tcf_chain_head_change(chain, tp); + tcf_chain0_head_change(chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -960,7 +952,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_proto *next = rtnl_dereference(chain_info->next); if (tp == chain->filter_chain) - tcf_chain_head_change(chain, next); + tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } -- cgit v1.2.3 From 32a4f5ecd7381f30ae3bb36dea77a150ba68af2e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:06 +0200 Subject: net: sched: introduce chain object to uapi Allow user to create, destroy, get and dump chain objects. Do that by extending rtnl commands by the chain-specific ones. User will now be able to explicitly create or destroy chains (so far this was done only automatically according to the filter/act needs and refcounting). Also, the user will receive notification about any chain creation or destruction.
Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 7 + net/sched/cls_api.c | 308 +++++++++++++++++++++++++++++++++++++++-- security/selinux/nlmsgtab.c | 2 +- 4 files changed, 309 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 86f4651784e8..81ec8276db9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -304,6 +304,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + bool explicitly_created; }; struct tcf_block { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 7d8502313c99..46399367627f 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -150,6 +150,13 @@ enum { RTM_NEWCACHEREPORT = 96, #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT + RTM_NEWCHAIN = 100, +#define RTM_NEWCHAIN RTM_NEWCHAIN + RTM_DELCHAIN, +#define RTM_DELCHAIN RTM_DELCHAIN + RTM_GETCHAIN, +#define RTM_GETCHAIN RTM_GETCHAIN + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index eb0bf9037ef9..e65b390336aa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,29 +262,57 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, + u32 chain_index) { struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - tcf_chain_hold(chain); + if (chain->index == chain_index) return chain; - } + } + return NULL; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast); + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) 
+{ + struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); + + if (chain) { + tcf_chain_hold(chain); + return chain; } - return create ? tcf_chain_create(block, chain_index) : NULL; + if (!create) + return NULL; + chain = tcf_chain_create(block, chain_index); + if (!chain) + return NULL; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + return chain; } EXPORT_SYMBOL(tcf_chain_get); void tcf_chain_put(struct tcf_chain *chain) { - if (--chain->refcnt == 0) + if (--chain->refcnt == 0) { + tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); tcf_chain_destroy(chain); + } } EXPORT_SYMBOL(tcf_chain_put); +static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) +{ + if (chain->explicitly_created) + tcf_chain_put(chain); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -694,8 +722,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. 
*/ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_put_explicitly_created(chain); tcf_chain_put(chain); + } block->refcnt--; if (list_empty(&block->chain_list)) @@ -1609,6 +1639,264 @@ out: return skb->len; } +static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, + struct sk_buff *skb, struct tcf_block *block, + u32 portid, u32 seq, u16 flags, int event) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlmsghdr *nlh; + struct tcmsg *tcm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_handle = 0; + if (block->q) { + tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; + tcm->tcm_parent = block->q->handle; + } else { + tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; + tcm->tcm_block_index = block->index; + } + + if (nla_put_u32(skb, TCA_CHAIN, chain->index)) + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast) +{ + u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; + struct tcf_block *block = chain->block; + struct net *net = block->net; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_chain_fill_node(chain, net, skb, block, portid, + seq, flags, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + if (unicast) + return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); +} + +/* Add/delete/get a chain */ + +static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct tcmsg *t; + u32 parent; + u32 chain_index; + struct Qdisc *q = NULL; + struct tcf_chain *chain = NULL; + struct tcf_block *block; + unsigned long cl; + int err; + + if (n->nlmsg_type != RTM_GETCHAIN && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + +replay: + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + if (err < 0) + return err; + + t = nlmsg_data(n); + parent = t->tcm_parent; + cl = 0; + + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) + return PTR_ERR(block); + + chain_index = tca[TCA_CHAIN] ? 
nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); + return -EINVAL; + } + chain = tcf_chain_lookup(block, chain_index); + if (n->nlmsg_type == RTM_NEWCHAIN) { + if (chain) { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } + } else { + if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + return -EINVAL; + } + tcf_chain_hold(chain); + } + + switch (n->nlmsg_type) { + case RTM_NEWCHAIN: + /* In case the chain was successfully added, take a reference + * to the chain. This ensures that an empty chain + * does not disappear at the end of this function. + */ + tcf_chain_hold(chain); + chain->explicitly_created = true; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + break; + case RTM_DELCHAIN: + /* Flush the chain first as the user requested chain removal. */ + tcf_chain_flush(chain); + /* In case the chain was successfully deleted, put a reference + * to the chain previously taken during addition. + */ + tcf_chain_put_explicitly_created(chain); + break; + case RTM_GETCHAIN: + break; + err = tc_chain_notify(chain, skb, n->nlmsg_seq, + n->nlmsg_seq, n->nlmsg_type, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); + break; + default: + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Unsupported message type"); + goto errout; + } + +errout: + tcf_chain_put(chain); + if (err == -EAGAIN) + /* Replay the request. 
*/ + goto replay; + return err; +} + +/* called with RTNL */ +static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct Qdisc *q = NULL; + struct tcf_block *block; + struct tcf_chain *chain; + struct tcmsg *tcm = nlmsg_data(cb->nlh); + long index_start; + long index; + u32 parent; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, tcm->tcm_block_index); + if (!block) + goto out; + /* If we work with block index, q is NULL and parent value + * will never be used in the following code. The check + * in tcf_fill_node prevents it. However, compiler does not + * see that far, so set parent to zero to silence the warning + * about parent being uninitialized. + */ + parent = 0; + } else { + const struct Qdisc_class_ops *cops; + struct net_device *dev; + unsigned long cl = 0; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + parent = tcm->tcm_parent; + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } + if (!q) + goto out; + cops = q->ops->cl_ops; + if (!cops) + goto out; + if (!cops->tcf_block) + goto out; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->find(q, tcm->tcm_parent); + if (cl == 0) + goto out; + } + block = cops->tcf_block(q, cl, NULL); + if (!block) + goto out; + if (tcf_block_shared(block)) + q = NULL; + } + + index_start = cb->args[0]; + index = 0; + + list_for_each_entry(chain, &block->chain_list, list) { + if ((tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index)) + continue; + if (index < index_start) { + index++; + continue; + } + err = tc_chain_fill_node(chain, net, skb, block, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 
NLM_F_MULTI, + RTM_NEWCHAIN); + if (err <= 0) + break; + index++; + } + + cb->args[0] = index; + +out: + /* If we did no progress, the error (EMSGSIZE) is real */ + if (skb->len == 0 && err) + return err; + return skb->len; +} + void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT @@ -1825,6 +2113,10 @@ static int __init tc_filter_init(void) rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, tc_dump_tfilter, 0); + rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, + tc_dump_chain, 0); return 0; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 7b7433a1a34c..74b951f55608 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -159,7 +159,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) switch (sclass) { case SECCLASS_NETLINK_ROUTE_SOCKET: /* RTM_MAX always point to RTM_SETxxxx, ie RTM_NEWxxx + 3 */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWCACHEREPORT + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 9f407f1768d3e1a5ddd7bd49fa4d1f5a26e10ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:07 +0200 Subject: net: sched: introduce chain templates Allow user to set a template for newly created chains. A template locks down the chain for particular classifier type/options combinations. The classifier needs to support templates, otherwise the kernel replies with an error. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/sch_generic.h | 12 +++++++++ net/sched/cls_api.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 81ec8276db9c..085c509c8674 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -238,6 +238,8 @@ struct tcf_result { }; }; +struct tcf_chain; + struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; @@ -263,10 +265,18 @@ struct tcf_proto_ops { tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*bind_class)(void *, u32, unsigned long); + void * (*tmplt_create)(struct net *net, + struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack); + void (*tmplt_destroy)(void *tmplt_priv); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*); + int (*tmplt_dump)(struct sk_buff *skb, + struct net *net, + void *tmplt_priv); struct module *owner; }; @@ -305,6 +315,8 @@ struct tcf_chain { u32 index; /* chain index */ unsigned int refcnt; bool explicitly_created; + const struct tcf_proto_ops *tmplt_ops; + void *tmplt_priv; }; struct tcf_block { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e65b390336aa..5f7098b5405e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -298,10 +298,13 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +static void tc_chain_tmplt_del(struct tcf_chain *chain); + void tcf_chain_put(struct tcf_chain *chain) { if (--chain->refcnt == 0) { tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + tc_chain_tmplt_del(chain); tcf_chain_destroy(chain); } } @@ -1258,6 +1261,12 @@ replay: goto errout; } + if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); + err = -EINVAL; + goto errout; + } + err = 
tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, extack); @@ -1644,8 +1653,13 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, u32 portid, u32 seq, u16 flags, int event) { unsigned char *b = skb_tail_pointer(skb); + const struct tcf_proto_ops *ops; struct nlmsghdr *nlh; struct tcmsg *tcm; + void *priv; + + ops = chain->tmplt_ops; + priv = chain->tmplt_priv; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -1666,6 +1680,13 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, if (nla_put_u32(skb, TCA_CHAIN, chain->index)) goto nla_put_failure; + if (ops) { + if (nla_put_string(skb, TCA_KIND, ops->kind)) + goto nla_put_failure; + if (ops->tmplt_dump(skb, net, priv) < 0) + goto nla_put_failure; + } + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; @@ -1699,6 +1720,47 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } +static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + void *tmplt_priv; + + /* If kind is not set, user did not specify template. 
*/ + if (!tca[TCA_KIND]) + return 0; + + ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); + if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { + NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + return -EOPNOTSUPP; + } + + tmplt_priv = ops->tmplt_create(net, chain, tca, extack); + if (IS_ERR(tmplt_priv)) { + module_put(ops->owner); + return PTR_ERR(tmplt_priv); + } + chain->tmplt_ops = ops; + chain->tmplt_priv = tmplt_priv; + return 0; +} + +static void tc_chain_tmplt_del(struct tcf_chain *chain) +{ + const struct tcf_proto_ops *ops = chain->tmplt_ops; + + /* If template ops are set, no work to do for us. */ + if (!ops) + return; + + ops->tmplt_destroy(chain->tmplt_priv); + module_put(ops->owner); +} + /* Add/delete/get a chain */ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, @@ -1763,6 +1825,9 @@ replay: switch (n->nlmsg_type) { case RTM_NEWCHAIN: + err = tc_chain_tmplt_add(chain, net, tca, extack); + if (err) + goto errout; /* In case the chain was successfully added, take a reference * to the chain. This ensures that an empty chain * does not disappear at the end of this function. -- cgit v1.2.3 From f5749081f0d48ae585233232df6cfc4c7c9642f9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:08 +0200 Subject: net: sched: cls_flower: move key/mask dumping into a separate function Push key/mask dumping from fl_dump() into a separate function fl_dump_key(), that will be reused for template dumping. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 38d74803e2df..ab10a7c88359 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1296,29 +1296,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } -static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) +static int fl_dump_key(struct sk_buff *skb, struct net *net, + struct fl_flow_key *key, struct fl_flow_key *mask) { - struct cls_fl_filter *f = fh; - struct nlattr *nest; - struct fl_flow_key *key, *mask; - - if (!f) - return skb->len; - - t->tcm_handle = f->handle; - - nest = nla_nest_start(skb, TCA_OPTIONS); - if (!nest) - goto nla_put_failure; - - if (f->res.classid && - nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) - goto nla_put_failure; - - key = &f->key; - mask = &f->mask->key; - if (mask->indev_ifindex) { struct net_device *dev; @@ -1327,9 +1307,6 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, goto nla_put_failure; } - if (!tc_skip_hw(f->flags)) - fl_hw_update_stats(tp, f); - if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || @@ -1505,6 +1482,41 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + 
if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &f->mask->key; + + if (fl_dump_key(skb, net, key, mask)) + goto nla_put_failure; + + if (!tc_skip_hw(f->flags)) + fl_hw_update_stats(tp, f); + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; -- cgit v1.2.3 From 33fb5cba11ff639c32f4f0104b04b2415fcd9ecc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:09 +0200 Subject: net: sched: cls_flower: change fl_init_dissector to accept mask and dissector This function is going to be used for templates as well, so we need to pass the pointer separately. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ab10a7c88359..bb7aa1e9d281 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -826,51 +826,52 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask) FL_KEY_SET(keys, cnt, id, member); \ } while(0); -static void fl_init_dissector(struct fl_flow_mask *mask) +static void fl_init_dissector(struct flow_dissector *dissector, + struct fl_flow_key *mask) { struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + 
FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ARP, arp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_MPLS, mpls); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CVLAN, cvlan); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6); - if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) || - FL_KEY_IS_MASKED(&mask->key, enc_ipv6)) + if (FL_KEY_IS_MASKED(mask, enc_ipv4) || + FL_KEY_IS_MASKED(mask, enc_ipv6)) FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); - skb_flow_dissector_init(&mask->dissector, keys, cnt); + skb_flow_dissector_init(dissector, keys, cnt); } static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, @@ -889,7 +890,7 @@ static struct fl_flow_mask 
*fl_create_new_mask(struct cls_fl_head *head, if (err) goto errout_free; - fl_init_dissector(newmask); + fl_init_dissector(&newmask->dissector, &newmask->key); INIT_LIST_HEAD_RCU(&newmask->filters); -- cgit v1.2.3 From b95ec7eb3b4d2f158dd15c912cf670b546f09571 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:10 +0200 Subject: net: sched: cls_flower: implement chain templates Use the previously introduced template extension and implement callback to create, destroy and dump chain template. The existing parsing and dumping functions are re-used. Also, check if newly added filters fit the template if it is set. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index bb7aa1e9d281..f0c80758a594 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -72,6 +72,13 @@ struct fl_flow_mask { struct list_head list; }; +struct fl_flow_tmplt { + struct fl_flow_key dummy_key; + struct fl_flow_key mask; + struct flow_dissector dissector; + struct tcf_chain *chain; +}; + struct cls_fl_head { struct rhashtable ht; struct list_head masks; @@ -147,6 +154,23 @@ static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, *lmkey++ = *lkey++ & *lmask++; } +static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt, + struct fl_flow_mask *mask) +{ + const long *lmask = fl_key_get_start(&mask->key, mask); + const long *ltmplt; + int i; + + if (!tmplt) + return true; + ltmplt = fl_key_get_start(&tmplt->mask, mask); + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) { + if (~*ltmplt++ & *lmask++) + return false; + } + return true; +} + static void fl_clear_masked_range(struct fl_flow_key *key, struct fl_flow_mask *mask) { @@ -939,6 +963,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct 
cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr, + struct fl_flow_tmplt *tmplt, struct netlink_ext_ack *extack) { int err; @@ -959,6 +984,11 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, fl_mask_update_range(mask); fl_set_masked_key(&f->mkey, &f->key, mask); + if (!fl_mask_fits_tmplt(tmplt, mask)) { + NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template"); + return -EINVAL; + } + return 0; } @@ -1024,7 +1054,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, - extack); + tp->chain->tmplt_priv, extack); if (err) goto errout_idr; @@ -1164,6 +1194,52 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } +static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + struct fl_flow_tmplt *tmplt; + struct nlattr **tb; + int err; + + if (!tca[TCA_OPTIONS]) + return ERR_PTR(-EINVAL); + + tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + if (!tb) + return ERR_PTR(-ENOBUFS); + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], + fl_policy, NULL); + if (err) + goto errout_tb; + + tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL); + if (!tmplt) + goto errout_tb; + tmplt->chain = chain; + err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); + if (err) + goto errout_tmplt; + kfree(tb); + + fl_init_dissector(&tmplt->dissector, &tmplt->mask); + + return tmplt; + +errout_tmplt: + kfree(tmplt); +errout_tb: + kfree(tb); + return ERR_PTR(err); +} + +static void fl_tmplt_destroy(void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + + kfree(tmplt); +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1536,6 +1612,31 @@ nla_put_failure: return -1; } +static int fl_tmplt_dump(struct 
sk_buff *skb, struct net *net, void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + struct fl_flow_key *key, *mask; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + key = &tmplt->dummy_key; + mask = &tmplt->mask; + + if (fl_dump_key(skb, net, key, mask)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static void fl_bind_class(void *fh, u32 classid, unsigned long cl) { struct cls_fl_filter *f = fh; @@ -1556,6 +1657,9 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, + .tmplt_create = fl_tmplt_create, + .tmplt_destroy = fl_tmplt_destroy, + .tmplt_dump = fl_tmplt_dump, .owner = THIS_MODULE, }; -- cgit v1.2.3 From 34738452739069947e528123810533f28dd8332b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:11 +0200 Subject: net: sched: cls_flower: propagate chain template creation and destruction to drivers Introduce a couple of flower offload commands in order to propagate template creation/destruction events down to device drivers. Drivers may use this information to prepare HW in an optimal way for future filter insertions. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4f405ca8346f..a3101582f642 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -721,6 +721,8 @@ enum tc_fl_command { TC_CLSFLOWER_REPLACE, TC_CLSFLOWER_DESTROY, TC_CLSFLOWER_STATS, + TC_CLSFLOWER_TMPLT_CREATE, + TC_CLSFLOWER_TMPLT_DESTROY, }; struct tc_cls_flower_offload { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f0c80758a594..6ccf60364297 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1194,6 +1194,42 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } +static void fl_hw_create_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + struct tcf_exts dummy_exts = { 0, }; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE; + cls_flower.cookie = (unsigned long) tmplt; + cls_flower.dissector = &tmplt->dissector; + cls_flower.mask = &tmplt->mask; + cls_flower.key = &tmplt->dummy_key; + cls_flower.exts = &dummy_exts; + + /* We don't care if driver (any of them) fails to handle this + * call. It serves just as a hint for it. 
+ */ + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + +static void fl_hw_destroy_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; + cls_flower.cookie = (unsigned long) tmplt; + + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack) @@ -1224,6 +1260,8 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, fl_init_dissector(&tmplt->dissector, &tmplt->mask); + fl_hw_create_tmplt(chain, tmplt); + return tmplt; errout_tmplt: @@ -1237,6 +1275,7 @@ static void fl_tmplt_destroy(void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; + fl_hw_destroy_tmplt(tmplt->chain, tmplt); kfree(tmplt); } -- cgit v1.2.3 From eee2fa6ab3225192d6d894c54a6fb02ac9efdff6 Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Mon, 23 Jul 2018 20:51:21 -0700 Subject: rds: Changing IP address internal representation to struct in6_addr This patch changes the internal representation of an IP address to use struct in6_addr. IPv4 address is stored as an IPv4 mapped address. All the functions which take an IP address as argument are also changed to use struct in6_addr. But RDS socket layer is not modified such that it still does not accept IPv6 address from an application. And RDS layer does not accept nor initiate IPv6 connections. v2: Fixed sparse warnings. Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/af_rds.c | 138 +++++++++++++++------- net/rds/bind.c | 91 ++++++++++----- net/rds/cong.c | 23 ++-- net/rds/connection.c | 132 +++++++++++++-------- net/rds/ib.c | 17 +-- net/rds/ib.h | 51 ++++++-- net/rds/ib_cm.c | 299 ++++++++++++++++++++++++++++++++++------------- net/rds/ib_rdma.c | 15 +-- net/rds/ib_recv.c | 18 +-- net/rds/ib_send.c | 10 +- net/rds/loop.c | 7 +- net/rds/rdma.c | 6 +- net/rds/rdma_transport.c | 56 ++++++--- net/rds/rds.h | 70 +++++++---- net/rds/recv.c | 51 +++++--- net/rds/send.c | 67 ++++++++--- net/rds/tcp.c | 32 ++++- net/rds/tcp_connect.c | 34 +++--- net/rds/tcp_listen.c | 18 +-- net/rds/tcp_recv.c | 9 +- net/rds/tcp_send.c | 4 +- net/rds/threads.c | 69 +++++++++-- net/rds/transport.c | 15 ++- 23 files changed, 863 insertions(+), 369 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index ab751a150f70..fc1a5c63b783 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -113,26 +114,63 @@ void rds_wake_sk_sleep(struct rds_sock *rs) static int rds_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sock->sk); - - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int uaddr_len; /* racey, don't care */ if (peer) { - if (!rs->rs_conn_addr) + if (ipv6_addr_any(&rs->rs_conn_addr)) return -ENOTCONN; - sin->sin_port = rs->rs_conn_port; - sin->sin_addr.s_addr = rs->rs_conn_addr; + if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin->sin_family = AF_INET; + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr_v4; + uaddr_len = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)uaddr; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = rs->rs_conn_port; + sin6->sin6_addr = rs->rs_conn_addr; + sin6->sin6_flowinfo = 0; + /* scope_id is the same as in the bound address. */ + sin6->sin6_scope_id = rs->rs_bound_scope_id; + uaddr_len = sizeof(*sin6); + } } else { - sin->sin_port = rs->rs_bound_port; - sin->sin_addr.s_addr = rs->rs_bound_addr; + /* If socket is not yet bound, set the return address family + * to be AF_UNSPEC (value 0) and the address size to be that + * of an IPv4 address. 
+ */ + if (ipv6_addr_any(&rs->rs_bound_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_UNSPEC; + return sizeof(*sin); + } + if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin->sin_family = AF_INET; + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr_v4; + uaddr_len = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)uaddr; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = rs->rs_bound_port; + sin6->sin6_addr = rs->rs_bound_addr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + uaddr_len = sizeof(*sin6); + } } - sin->sin_family = AF_INET; - - return sizeof(*sin); + return uaddr_len; } /* @@ -203,11 +241,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, int len) { + struct sockaddr_in6 sin6; struct sockaddr_in sin; int ret = 0; /* racing with another thread binding seems ok here */ - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -215,14 +254,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, if (len < sizeof(struct sockaddr_in)) { ret = -EINVAL; goto out; + } else if (len < sizeof(struct sockaddr_in6)) { + /* Assume IPv4 */ + if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { + ret = -EFAULT; + goto out; + } + ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); + sin6.sin6_port = sin.sin_port; + } else { + if (copy_from_user(&sin6, optval, + sizeof(struct sockaddr_in6))) { + ret = -EFAULT; + goto out; + } } - if (copy_from_user(&sin, optval, sizeof(sin))) { - ret = -EFAULT; - goto out; - } - - rds_send_drop_to(rs, &sin); + rds_send_drop_to(rs, &sin6); out: return ret; } @@ -435,31 +483,41 @@ static int rds_connect(struct 
socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct sockaddr_in *sin; struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; lock_sock(sk); - if (addr_len != sizeof(struct sockaddr_in)) { - ret = -EINVAL; - goto out; - } + switch (addr_len) { + case sizeof(struct sockaddr_in): + sin = (struct sockaddr_in *)uaddr; + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + break; + } + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + break; + } + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { + ret = -EINVAL; + break; + } + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); + rs->rs_conn_port = sin->sin_port; + break; - if (sin->sin_family != AF_INET) { - ret = -EAFNOSUPPORT; - goto out; - } + case sizeof(struct sockaddr_in6): + ret = -EPROTONOSUPPORT; + break; - if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { - ret = -EDESTADDRREQ; - goto out; + default: + ret = -EINVAL; + break; } - rs->rs_conn_addr = sin->sin_addr.s_addr; - rs->rs_conn_port = sin->sin_port; - -out: release_sock(sk); return ret; } @@ -578,8 +636,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) - rds_inc_info_copy(inc, iter, inc->i_saddr, - rs->rs_bound_addr, 1); + rds_inc_info_copy(inc, iter, + inc->i_saddr.s6_addr32[3], + rs->rs_bound_addr_v4, + 1); } read_unlock(&rs->rs_recv_lock); @@ -608,8 +668,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len, list_for_each_entry(rs, &rds_sock_list, rs_item) { sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); - sinfo.bound_addr = rs->rs_bound_addr; - sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_addr = rs->rs_bound_addr_v4; + sinfo.connected_addr = rs->rs_conn_addr_v4; sinfo.bound_port = rs->rs_bound_port; 
sinfo.connected_port = rs->rs_conn_port; sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); diff --git a/net/rds/bind.c b/net/rds/bind.c index 5aa3a64aa4f0..c401776ad938 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table; static const struct rhashtable_params ht_parms = { .nelem_hint = 768, - .key_len = sizeof(u64), + .key_len = RDS_BOUND_KEY_LEN, .key_offset = offsetof(struct rds_sock, rs_bound_key), .head_offset = offsetof(struct rds_sock, rs_bound_node), .max_size = 16384, .min_size = 1024, }; +/* Create a key for the bind hash table manipulation. Port is in network byte + * order. + */ +static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, + __be16 port, __u32 scope_id) +{ + memcpy(key, addr, sizeof(*addr)); + key += sizeof(*addr); + memcpy(key, &port, sizeof(port)); + key += sizeof(port); + memcpy(key, &scope_id, sizeof(scope_id)); +} + /* * Return the rds_sock bound at the given local address. * * The rx path can race with rds_release. We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. 
*/ -struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, + __u32 scope_id) { - u64 key = ((u64)addr << 32) | port; + u8 key[RDS_BOUND_KEY_LEN]; struct rds_sock *rs; - rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); + __rds_create_bind_key(key, addr, port, scope_id); + rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, - ntohs(port)); + rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, + ntohs(port)); return rs; } /* returns -ve errno or +ve port */ -static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, + __be16 *port, __u32 scope_id) { int ret = -EADDRINUSE; u16 rover, last; - u64 key; + u8 key[RDS_BOUND_KEY_LEN]; if (*port != 0) { rover = be16_to_cpu(*port); @@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (rover == RDS_FLAG_PROBE_PORT) continue; - key = ((u64)addr << 32) | cpu_to_be16(rover); - if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) + __rds_create_bind_key(key, addr, cpu_to_be16(rover), + scope_id); + if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) continue; - rs->rs_bound_key = key; - rs->rs_bound_addr = addr; + memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); + rs->rs_bound_addr = *addr; net_get_random_once(&rs->rs_hash_initval, sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); @@ -114,7 +132,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rs, &addr, (int)ntohs(*port)); break; } else { - rs->rs_bound_addr = 0; + rs->rs_bound_addr = in6addr_any; rds_sock_put(rs); ret = -ENOMEM; break; @@ -127,44 +145,61 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) void 
rds_remove_bound(struct rds_sock *rs) { - if (!rs->rs_bound_addr) + if (ipv6_addr_any(&rs->rs_bound_addr)) return; - rdsdebug("rs %p unbinding from %pI4:%d\n", + rdsdebug("rs %p unbinding from %pI6c:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); rds_sock_put(rs); - rs->rs_bound_addr = 0; + rs->rs_bound_addr = in6addr_any; } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sk); + struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; + __u32 scope_id = 0; int ret = 0; + __be16 port; + /* We only allow an RDS socket to be bound to an IPv4 address. IPv6 + * address support will be added later. + */ + if (addr_len == sizeof(struct sockaddr_in)) { + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + + if (sin->sin_family != AF_INET || + sin->sin_addr.s_addr == htonl(INADDR_ANY)) + return -EINVAL; + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); + binding_addr = &v6addr; + port = sin->sin_port; + } else if (addr_len == sizeof(struct sockaddr_in6)) { + return -EPROTONOSUPPORT; + } else { + return -EINVAL; + } lock_sock(sk); - if (addr_len != sizeof(struct sockaddr_in) || - sin->sin_family != AF_INET || - rs->rs_bound_addr || - sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + /* RDS socket does not allow re-binding. 
*/ + if (!ipv6_addr_any(&rs->rs_bound_addr)) { ret = -EINVAL; goto out; } - ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) goto out; if (rs->rs_transport) { /* previously bound */ trans = rs->rs_transport; if (trans->laddr_check(sock_net(sock->sk), - sin->sin_addr.s_addr) != 0) { + binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; rds_remove_bound(rs); } else { @@ -172,13 +207,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } goto out; } - trans = rds_trans_get_preferred(sock_net(sock->sk), - sin->sin_addr.s_addr); + trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, + scope_id); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); - pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", - __func__, &sin->sin_addr.s_addr); + pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", + __func__, binding_addr); goto out; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 63da9d2f142d..ccdff09a79c8 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Oracle. All rights reserved. + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock); static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; -static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, +static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; @@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, struct rds_cong_map *map; while (*p) { + int diff; + parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); - if (addr < map->m_addr) + diff = rds_addr_cmp(addr, &map->m_addr); + if (diff < 0) p = &(*p)->rb_left; - else if (addr > map->m_addr) + else if (diff > 0) p = &(*p)->rb_right; else return map; @@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. 
*/ -static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; @@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) if (!map) return NULL; - map->m_addr = addr; + map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); @@ -171,7 +174,7 @@ out: kfree(map); } - rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } @@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn) int rds_cong_get_maps(struct rds_connection *conn) { - conn->c_lcong = rds_cong_from_addr(conn->c_laddr); - conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; @@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs) /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); - map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { diff --git a/net/rds/connection.c b/net/rds/connection.c index cfb05953b0e5..3176ead0ab4d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -34,7 +34,8 @@ #include #include #include -#include +#include +#include #include "rds.h" #include "loop.h" @@ -49,18 +50,21 @@ static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; -static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, + const struct in6_addr *faddr) { + static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; - unsigned long hash; + u32 lhash, fhash, hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); + hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); - /* Pass NULL, don't need struct net for hash */ - hash = __inet_ehashfn(be32_to_cpu(laddr), 0, - be32_to_cpu(faddr), 0, - rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } @@ -72,20 +76,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, - __be32 laddr, __be32 faddr, - struct rds_transport *trans) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { - if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans && net == rds_conn_net(conn)) { + if (ipv6_addr_equal(&conn->c_faddr, faddr) && + ipv6_addr_equal(&conn->c_laddr, laddr) && + conn->c_trans == trans && + net == rds_conn_net(conn) && + conn->c_dev_if == dev_if) { ret = conn; break; } } - rdsdebug("returning conn %p for 
%pI4 -> %pI4\n", ret, - &laddr, &faddr); + rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, + laddr, faddr); return ret; } @@ -99,8 +108,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; - rdsdebug("connection %pI4 to %pI4 reset\n", - &conn->c_laddr, &conn->c_faddr); + rdsdebug("connection %pI6c to %pI6c reset\n", + &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); @@ -142,9 +151,12 @@ static void __rds_conn_path_init(struct rds_connection *conn, * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp, - int is_outgoing) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, + int is_outgoing, + int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); @@ -154,9 +166,12 @@ static struct rds_connection *__rds_conn_create(struct net *net, int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); - conn = rds_conn_lookup(net, head, laddr, faddr, trans); - if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && - laddr == faddr && !is_outgoing) { + conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); + if (conn && + conn->c_loopback && + conn->c_trans != &rds_loop_transport && + ipv6_addr_equal(laddr, faddr) && + !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. 
* We need a second connection object into which we @@ -181,8 +196,10 @@ static struct rds_connection *__rds_conn_create(struct net *net, } INIT_HLIST_NODE(&conn->c_hash_node); - conn->c_laddr = laddr; - conn->c_faddr = faddr; + conn->c_laddr = *laddr; + conn->c_isv6 = !ipv6_addr_v4mapped(laddr); + conn->c_faddr = *faddr; + conn->c_dev_if = dev_if; rds_conn_net_set(conn, net); @@ -199,7 +216,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - loop_trans = rds_trans_get_preferred(net, faddr); + loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; @@ -233,10 +250,10 @@ static struct rds_connection *__rds_conn_create(struct net *net, goto out; } - rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", - conn, &laddr, &faddr, - strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : - "[unknown]", is_outgoing ? "(outgoing)" : ""); + rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", + conn, laddr, faddr, + strnlen(trans->t_name, sizeof(trans->t_name)) ? + trans->t_name : "[unknown]", is_outgoing ? 
"(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could @@ -262,7 +279,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(net, head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans, + dev_if); if (found) { struct rds_conn_path *cp; int i; @@ -295,18 +313,22 @@ out: } struct rds_connection *rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, gfp_t gfp, + int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); @@ -502,12 +524,17 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, /* XXX too lazy to maintain counts.. 
*/ list_for_each_entry(rm, list, m_conn_item) { + __be32 laddr; + __be32 faddr; + total++; + laddr = conn->c_laddr.s6_addr32[3]; + faddr = conn->c_faddr.s6_addr32[3]; if (total <= len) rds_inc_info_copy(&rm->m_inc, iter, - conn->c_laddr, - conn->c_faddr, + laddr, + faddr, 0); } @@ -584,7 +611,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct hlist_head *head; struct rds_connection *conn; size_t i; - int j; rcu_read_lock(); @@ -595,17 +621,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; - int npaths; - npaths = (conn->c_trans->t_mp_capable ? - RDS_MPATH_WORKERS : 1); - for (j = 0; j < npaths; j++) { - cp = &conn->c_path[j]; + /* XXX We only copy the information from the first + * path for now. The problem is that if there are + * more than one underlying paths, we cannot report + * information of all of them using the existing + * API. For example, there is only one next_tx_seq, + * which path's next_tx_seq should we report? It is + * a bug in the design of MPRDS. + */ + cp = conn->c_path; - /* XXX no cp_lock usage.. */ - if (!visitor(cp, buffer)) - continue; - } + /* XXX no cp_lock usage.. 
*/ + if (!visitor(cp, buffer)) + continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller @@ -624,12 +653,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; + struct rds_connection *conn = cp->cp_conn; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; - cinfo->laddr = cp->cp_conn->c_laddr; - cinfo->faddr = cp->cp_conn->c_faddr; - strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, + cinfo->laddr = conn->c_laddr.s6_addr32[3]; + cinfo->faddr = conn->c_faddr.s6_addr32[3]; + strncpy(cinfo->transport, conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; diff --git a/net/rds/ib.c b/net/rds/ib.c index b6ad38e48f62..c712a848957d 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -296,8 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, if (conn->c_trans != &rds_ib_transport) return 0; - iinfo->src_addr = conn->c_laddr; - iinfo->dst_addr = conn->c_faddr; + iinfo->src_addr = conn->c_laddr.s6_addr32[3]; + iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); @@ -341,7 +341,8 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". 
*/ -static int rds_ib_laddr_check(struct net *net, __be32 addr) +static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, + __u32 scope_id) { int ret; struct rdma_cm_id *cm_id; @@ -357,7 +358,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr) memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; + sin.sin_addr.s_addr = addr->s6_addr32[3]; /* rdma_bind_addr will only succeed for IB & iWARP devices */ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); @@ -367,9 +368,9 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr) cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI4 ret %d node type %d\n", - &addr, ret, - cm_id->device ? cm_id->device->node_type : -1); + rdsdebug("addr %pI6c ret %d node type %d\n", + addr, ret, + cm_id->device ? cm_id->device->node_type : -1); rdma_destroy_id(cm_id); diff --git a/net/rds/ib.h b/net/rds/ib.h index a6f4d7d68e95..beb95b893f78 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -57,16 +57,44 @@ struct rds_ib_refill_cache { struct list_head *ready; }; +/* This is the common structure for the IB private data exchange in setting up + * an RDS connection. The exchange is different for IPv4 and IPv6 connections. + * The reason is that the address size is different and the addresses + * exchanged are in the beginning of the structure. Hence it is not possible + * for interoperability if same structure is used. + */ +struct rds_ib_conn_priv_cmn { + u8 ricpc_protocol_major; + u8 ricpc_protocol_minor; + __be16 ricpc_protocol_minor_mask; /* bitmask */ + __be32 ricpc_reserved1; + __be64 ricpc_ack_seq; + __be32 ricpc_credit; /* non-zero enables flow ctl */ +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. 
*/ - __be32 dp_saddr; - __be32 dp_daddr; - u8 dp_protocol_major; - u8 dp_protocol_minor; - __be16 dp_protocol_minor_mask; /* bitmask */ - __be32 dp_reserved1; - __be64 dp_ack_seq; - __be32 dp_credit; /* non-zero enables flow ctl */ + __be32 dp_saddr; + __be32 dp_daddr; + struct rds_ib_conn_priv_cmn dp_cmn; +}; + +struct rds6_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + struct in6_addr dp_saddr; + struct in6_addr dp_daddr; + struct rds_ib_conn_priv_cmn dp_cmn; +}; + +#define dp_protocol_major dp_cmn.ricpc_protocol_major +#define dp_protocol_minor dp_cmn.ricpc_protocol_minor +#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask +#define dp_ack_seq dp_cmn.ricpc_ack_seq +#define dp_credit dp_cmn.ricpc_credit + +union rds_ib_conn_priv { + struct rds_ib_connect_private ricp_v4; + struct rds6_ib_connect_private ricp_v6; }; struct rds_ib_send_work { @@ -351,8 +379,8 @@ void rds_ib_listen_stop(void); __printf(2, 3) void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); -int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); + struct rdma_cm_event *event, bool isv6); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); @@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) /* ib_rdma.c */ -int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, + struct in6_addr *ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 
f1684ae6abfd..dd8a867e5a9c 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,6 +35,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "rds.h" @@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) */ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) { - const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; + const union rds_ib_conn_priv *dp = NULL; struct ib_qp_attr qp_attr; + __be64 ack_seq = 0; + __be32 credit = 0; + u8 major = 0; + u8 minor = 0; int err; - if (event->param.conn.private_data_len >= sizeof(*dp)) { - dp = event->param.conn.private_data; - - /* make sure it isn't empty data */ - if (dp->dp_protocol_major) { - rds_ib_set_protocol(conn, - RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + dp = event->param.conn.private_data; + if (conn->c_isv6) { + if (event->param.conn.private_data_len >= + sizeof(struct rds6_ib_connect_private)) { + major = dp->ricp_v6.dp_protocol_major; + minor = dp->ricp_v6.dp_protocol_minor; + credit = dp->ricp_v6.dp_credit; + /* dp structure start is not guaranteed to be 8 bytes + * aligned. Since dp_ack_seq is 64-bit extended load + * operations can be used so go through get_unaligned + * to avoid unaligned errors. 
+ */ + ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq); } + } else if (event->param.conn.private_data_len >= + sizeof(struct rds_ib_connect_private)) { + major = dp->ricp_v4.dp_protocol_major; + minor = dp->ricp_v4.dp_protocol_minor; + credit = dp->ricp_v4.dp_credit; + ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq); + } + + /* make sure it isn't empty data */ + if (major) { + rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(credit)); } if (conn->c_version < RDS_PROTOCOL(3, 1)) { - pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", + pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n", &conn->c_laddr, &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version)); @@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_conn_destroy(conn); return; } else { - pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", + pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", ic->i_active_side ? "Active" : "Passive", &conn->c_laddr, &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), @@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); /* update ib_device with this local ipaddr */ - err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); + err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr); if (err) printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); @@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ if (dp) { - /* dp structure start is not guaranteed to be 8 bytes aligned. 
- * Since dp_ack_seq is 64-bit extended load operations can be - * used so go through get_unaligned to avoid unaligned errors. - */ - __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); - - if (dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + if (ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(ack_seq), NULL); } @@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, - struct rdma_conn_param *conn_param, - struct rds_ib_connect_private *dp, - u32 protocol_version, - u32 max_responder_resources, - u32 max_initiator_depth) + struct rdma_conn_param *conn_param, + union rds_ib_conn_priv *dp, + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth, + bool isv6) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_device *rds_ibdev = ic->rds_ibdev; @@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, if (dp) { memset(dp, 0, sizeof(*dp)); - dp->dp_saddr = conn->c_laddr; - dp->dp_daddr = conn->c_faddr; - dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); + if (isv6) { + dp->ricp_v6.dp_saddr = conn->c_laddr; + dp->ricp_v6.dp_daddr = conn->c_faddr; + dp->ricp_v6.dp_protocol_major = + RDS_PROTOCOL_MAJOR(protocol_version); + dp->ricp_v6.dp_protocol_minor = + RDS_PROTOCOL_MINOR(protocol_version); + dp->ricp_v6.dp_protocol_minor_mask = + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->ricp_v6.dp_ack_seq = + cpu_to_be64(rds_ib_piggyb_ack(ic)); + + conn_param->private_data = &dp->ricp_v6; + conn_param->private_data_len = sizeof(dp->ricp_v6); + } else { + dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3]; + dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3]; + 
dp->ricp_v4.dp_protocol_major = + RDS_PROTOCOL_MAJOR(protocol_version); + dp->ricp_v4.dp_protocol_minor = + RDS_PROTOCOL_MINOR(protocol_version); + dp->ricp_v4.dp_protocol_minor_mask = + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->ricp_v4.dp_ack_seq = + cpu_to_be64(rds_ib_piggyb_ack(ic)); + + conn_param->private_data = &dp->ricp_v4; + conn_param->private_data_len = sizeof(dp->ricp_v4); + } /* Advertise flow control */ if (ic->i_flowctl) { unsigned int credits; - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); - dp->dp_credit = cpu_to_be32(credits); - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + credits = IB_GET_POST_CREDITS + (atomic_read(&ic->i_credits)); + if (isv6) + dp->ricp_v6.dp_credit = cpu_to_be32(credits); + else + dp->ricp_v4.dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), + &ic->i_credits); } - - conn_param->private_data = dp; - conn_param->private_data_len = sizeof(*dp); } } @@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) break; default: rdsdebug("Fatal QP Event %u (%s) " - "- connection %pI4->%pI4, reconnecting\n", + "- connection %pI6c->%pI6c, reconnecting\n", event->event, ib_event_msg(event->event), &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); @@ -580,11 +621,13 @@ out: return ret; } -static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) { - const struct rds_ib_connect_private *dp = event->param.conn.private_data; - u16 common; + const union rds_ib_conn_priv *dp = event->param.conn.private_data; + u8 data_len, major, minor; u32 version = 0; + __be16 mask; + u16 common; /* * rdma_cm private data is odd - when there is any private data in the @@ -603,51 +646,126 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) return 0; } + if (isv6) { + data_len = sizeof(struct rds6_ib_connect_private); + major = dp->ricp_v6.dp_protocol_major; + 
minor = dp->ricp_v6.dp_protocol_minor; + mask = dp->ricp_v6.dp_protocol_minor_mask; + } else { + data_len = sizeof(struct rds_ib_connect_private); + major = dp->ricp_v4.dp_protocol_major; + minor = dp->ricp_v4.dp_protocol_minor; + mask = dp->ricp_v4.dp_protocol_minor_mask; + } + /* Even if len is crap *now* I still want to check it. -ASG */ - if (event->param.conn.private_data_len < sizeof (*dp) || - dp->dp_protocol_major == 0) + if (event->param.conn.private_data_len < data_len || major == 0) return RDS_PROTOCOL_3_0; - common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; - if (dp->dp_protocol_major == 3 && common) { + common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (major == 3 && common) { version = RDS_PROTOCOL_3_0; while ((common >>= 1) != 0) version++; - } else - printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); + } else { + if (isv6) + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n", + &dp->ricp_v6.dp_saddr, major, minor); + else + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->ricp_v4.dp_saddr, major, minor); + } return version; } +/* Given an IPv6 address, find the IB net_device which hosts that address and + * return its index. This is used by the rds_ib_cm_handle_connect() code to + * find the interface index of where an incoming request comes from when + * the request is using a link local address. + * + * Note one problem in this search. It is possible that two interfaces have + * the same link local address. Unfortunately, this cannot be solved unless + * the underlying layer gives us the interface which an incoming RDMA connect + * request comes from. 
+ */ +static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) +{ + struct net_device *dev; + int idx = 0; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->type == ARPHRD_INFINIBAND && + ipv6_chk_addr(net, addr, dev, 0)) { + idx = dev->ifindex; + break; + } + } + rcu_read_unlock(); + + return idx; +} + int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) + struct rdma_cm_event *event, bool isv6) { __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; - const struct rds_ib_connect_private *dp = event->param.conn.private_data; - struct rds_ib_connect_private dp_rep; + const struct rds_ib_conn_priv_cmn *dp_cmn; struct rds_connection *conn = NULL; struct rds_ib_connection *ic = NULL; struct rdma_conn_param conn_param; + const union rds_ib_conn_priv *dp; + union rds_ib_conn_priv dp_rep; + struct in6_addr s_mapped_addr; + struct in6_addr d_mapped_addr; + const struct in6_addr *saddr6; + const struct in6_addr *daddr6; + int destroy = 1; + u32 ifindex = 0; u32 version; - int err = 1, destroy = 1; + int err = 1; /* Check whether the remote protocol version matches ours. */ - version = rds_ib_protocol_compatible(event); + version = rds_ib_protocol_compatible(event, isv6); if (!version) goto out; - rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " - "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + dp = event->param.conn.private_data; + if (isv6) { + dp_cmn = &dp->ricp_v6.dp_cmn; + saddr6 = &dp->ricp_v6.dp_saddr; + daddr6 = &dp->ricp_v6.dp_daddr; + /* If the local address is link local, need to find the + * interface index in order to create a proper RDS + * connection. + */ + if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) { + /* Using init_net for now .. */ + ifindex = __rds_find_ifindex(&init_net, daddr6); + /* No index found... Need to bail out. 
*/ + if (ifindex == 0) { + err = -EOPNOTSUPP; + goto out; + } + } + } else { + dp_cmn = &dp->ricp_v4.dp_cmn; + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr); + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr); + saddr6 = &s_mapped_addr; + daddr6 = &d_mapped_addr; + } + + rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", saddr6, daddr6, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid)); /* RDS/IB is not currently netns aware, thus init_net */ - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, - &rds_ib_transport, GFP_KERNEL); + conn = rds_conn_create(&init_net, daddr6, saddr6, + &rds_ib_transport, GFP_KERNEL, ifindex); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -678,12 +796,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, ic = conn->c_transport_data; rds_ib_set_protocol(conn, version); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit)); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. 
*/ - if (dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (dp_cmn->ricpc_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq), + NULL); BUG_ON(cm_id->context); BUG_ON(ic->i_cm_id); @@ -702,8 +821,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, } rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, - event->param.conn.responder_resources, - event->param.conn.initiator_depth); + event->param.conn.responder_resources, + event->param.conn.initiator_depth, isv6); /* rdma_accept() calls rdma_reject() internally if it fails */ if (rdma_accept(cm_id, &conn_param)) @@ -718,12 +837,12 @@ out: } -int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) { struct rds_connection *conn = cm_id->context; struct rds_ib_connection *ic = conn->c_transport_data; struct rdma_conn_param conn_param; - struct rds_ib_connect_private dp; + union rds_ib_conn_priv dp; int ret; /* If the peer doesn't do protocol negotiation, we must @@ -738,7 +857,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) } rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, - UINT_MAX, UINT_MAX); + UINT_MAX, UINT_MAX, isv6); ret = rdma_connect(cm_id, &conn_param); if (ret) rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); @@ -758,13 +877,17 @@ out: int rds_ib_conn_path_connect(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; - struct rds_ib_connection *ic = conn->c_transport_data; - struct sockaddr_in src, dest; + struct sockaddr_storage src, dest; + rdma_cm_event_handler handler; + struct rds_ib_connection *ic; int ret; + ic = conn->c_transport_data; + /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, + handler = rds_rdma_cm_event_handler; + ic->i_cm_id = rdma_create_id(&init_net, 
handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); @@ -775,13 +898,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + if (ipv6_addr_v4mapped(&conn->c_faddr)) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&src; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin->sin_port = 0; - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_PORT); + sin = (struct sockaddr_in *)&dest; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin->sin_port = htons(RDS_PORT); + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&src; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = conn->c_laddr; + sin6->sin6_port = 0; + sin6->sin6_scope_id = conn->c_dev_if; + + sin6 = (struct sockaddr_in6 *)&dest; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = conn->c_faddr; + sin6->sin6_port = htons(RDS_CM_PORT); + sin6->sin6_scope_id = conn->c_dev_if; + } ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, (struct sockaddr *)&dest, diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e678699268a2..0ec9df043dd0 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) kfree_rcu(to_free, rcu); } -int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, + struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; - rds_ibdev_old = rds_ib_get_device(ipaddr); + rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { - rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); @@ -544,7 +545,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; int ret; - rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 1eaf2550a9f8..557ccbb1ce00 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); - rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr); return ibinc; } @@ -418,7 +418,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); if (ret) { rds_ib_conn_error(conn, "recv post on " - "%pI4 returned %d, disconnecting and " + "%pI6c returned %d, disconnecting and " "reconnecting\n", &conn->c_faddr, ret); break; @@ -848,7 +848,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (data_len < sizeof(struct rds_header)) { rds_ib_conn_error(conn, "incoming message " - "from %pI4 didn't include a " + "from %pI6c didn't include a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); @@ -861,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, /* Validate the checksum. 
*/ if (!rds_message_verify_checksum(ihdr)) { rds_ib_conn_error(conn, "incoming message " - "from %pI4 has corrupted header - " + "from %pI6c has corrupted header - " "forcing a reconnect\n", &conn->c_faddr); rds_stats_inc(s_recv_drop_bad_checksum); @@ -941,10 +941,10 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_recv_data_rem = 0; ic->i_ibinc = NULL; - if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) { rds_ib_cong_recv(conn, ibinc); - else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + } else { + rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr, &ibinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; @@ -988,7 +988,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 8557a1cae041..c4cdfe491d96 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } @@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, first, &first->s_wr, ret, failed_wr); BUG_ON(failed_wr != &first->s_wr); if (ret) { - printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); @@ -827,7 +827,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send, &send->s_atomic_wr, ret, failed_wr); BUG_ON(failed_wr != &send->s_atomic_wr.wr); if (ret) { - printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); @@ -967,7 +967,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) first, &first->s_rdma_wr.wr, ret, failed_wr); BUG_ON(failed_wr != &first->s_rdma_wr.wr); if (ret) { - printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); diff --git a/net/rds/loop.c b/net/rds/loop.c index feea1f96ee2a..1d73ad79c847 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,6 +35,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "rds.h" @@ -88,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(hdr_off || sg || off); - rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + rds_inc_init(&rm->m_inc, conn, &conn->c_laddr); /* For the embedded inc. Matching put is in loop_inc_free() */ rds_message_addref(rm); - rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc, GFP_KERNEL); rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 634cfcb7bba6..7b3998026825 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Oracle. All rights reserved. + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, long i; int ret; - if (rs->rs_bound_addr == 0 || !rs->rs_transport) { + if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -574,7 +574,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, args = CMSG_DATA(cmsg); - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index fc59821f0a27..f49abef69550 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Oracle. All rights reserved. + * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,8 +39,9 @@ static struct rdma_cm_id *rds_rdma_listen_id; -int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) +static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event, + bool isv6) { /* this can be null in the listening path */ struct rds_connection *conn = cm_id->context; @@ -72,7 +73,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - ret = trans->cm_handle_connect(cm_id, event); + ret = trans->cm_handle_connect(cm_id, event, isv6); break; case RDMA_CM_EVENT_ADDR_RESOLVED: @@ -90,7 +91,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, ibic = conn->c_transport_data; if (ibic && ibic->i_cm_id == cm_id) - ret = trans->cm_initiate_connect(cm_id); + ret = trans->cm_initiate_connect(cm_id, isv6); else rds_conn_drop(conn); } @@ -116,14 +117,14 @@ int 
rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_DISCONNECTED: rdsdebug("DISCONNECT event - dropping connection " - "%pI4->%pI4\n", &conn->c_laddr, + "%pI6c->%pI6c\n", &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: if (conn) { - pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n", &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); } @@ -146,13 +147,20 @@ out: return ret; } -static int rds_rdma_listen_init(void) +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, false); +} + +static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, + struct sockaddr *sa, + struct rdma_cm_id **ret_cm_id) { - struct sockaddr_in sin; struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, + cm_id = rdma_create_id(&init_net, handler, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); @@ -161,15 +169,11 @@ static int rds_rdma_listen_init(void) return ret; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_PORT); - /* * XXX I bet this binds the cm_id to a device. If we want to support * fail-over we'll have to take this into consideration. */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); if (ret) { printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_bind_addr() returned %d\n", ret); @@ -185,7 +189,7 @@ static int rds_rdma_listen_init(void) rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); - rds_rdma_listen_id = cm_id; + *ret_cm_id = cm_id; cm_id = NULL; out: if (cm_id) @@ -193,6 +197,26 @@ out: return ret; } +/* Initialize the RDS RDMA listeners. 
We create two listeners for + * compatibility reason. The one on RDS_PORT is used for IPv4 + * requests only. The one on RDS_CM_PORT is used for IPv6 requests + * only. So only IPv6 enabled RDS module will communicate using this + * port. + */ +static int rds_rdma_listen_init(void) +{ + int ret; + struct sockaddr_in sin; + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(RDS_PORT); + ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, + (struct sockaddr *)&sin, + &rds_rdma_listen_id); + return ret; +} + static void rds_rdma_listen_stop(void) { if (rds_rdma_listen_id) { diff --git a/net/rds/rds.h b/net/rds/rds.h index f2272fb8cd45..1bff26988a5e 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "info.h" @@ -30,6 +31,7 @@ * userspace from listening. */ #define RDS_PORT 18634 +#define RDS_CM_PORT 16385 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 @@ -61,7 +63,7 @@ void rdsdebug(char *fmt, ...) 
struct rds_cong_map { struct rb_node m_rb_node; - __be32 m_addr; + struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; @@ -136,11 +138,13 @@ struct rds_conn_path { /* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; - __be32 c_laddr; - __be32 c_faddr; + struct in6_addr c_laddr; + struct in6_addr c_faddr; + int c_dev_if; /* c_laddrs's interface index */ unsigned int c_loopback:1, + c_isv6:1, c_ping_triggered:1, - c_pad_to_32:30; + c_pad_to_32:29; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -269,7 +273,7 @@ struct rds_incoming { struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; - __be32 i_saddr; + struct in6_addr i_saddr; rds_rdma_cookie_t i_rdma_cookie; struct timeval i_rx_tstamp; @@ -386,7 +390,7 @@ struct rds_message { struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; - __be32 m_daddr; + struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. 
@@ -519,7 +523,8 @@ struct rds_transport { t_mp_capable:1; unsigned int t_type; - int (*laddr_check)(struct net *net, __be32 addr); + int (*laddr_check)(struct net *net, const struct in6_addr *addr, + __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); @@ -535,8 +540,8 @@ struct rds_transport { void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); - int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + struct rdma_cm_event *event, bool isv6); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); @@ -551,6 +556,12 @@ struct rds_transport { bool (*t_unloading)(struct rds_connection *conn); }; +/* Bind hash table key length. It is the sum of the size of a struct + * in6_addr, a scope_id and a port. + */ +#define RDS_BOUND_KEY_LEN \ + (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) + struct rds_sock { struct sock rs_sk; @@ -562,10 +573,14 @@ struct rds_sock { * support. 
*/ struct rhash_head rs_bound_node; - u64 rs_bound_key; - __be32 rs_bound_addr; - __be32 rs_conn_addr; - __be16 rs_bound_port; + u8 rs_bound_key[RDS_BOUND_KEY_LEN]; + struct sockaddr_in6 rs_bound_sin6; +#define rs_bound_addr rs_bound_sin6.sin6_addr +#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] +#define rs_bound_port rs_bound_sin6.sin6_port +#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id + struct in6_addr rs_conn_addr; +#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; @@ -701,7 +716,8 @@ extern wait_queue_head_t rds_poll_waitq; /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); -struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, + __u32 scope_id); int rds_bind_lock_init(void); void rds_bind_lock_destroy(void); @@ -725,11 +741,15 @@ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp); + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, gfp_t gfp, + int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp); + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); @@ -840,11 +860,12 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, - __be32 saddr); + struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, - __be32 saddr); + 
struct in6_addr *saddr); void rds_inc_put(struct rds_incoming *inc); -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); @@ -859,7 +880,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); @@ -946,11 +967,14 @@ void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); +int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); /* transport.c */ void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); -struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); +struct rds_transport *rds_trans_get_preferred(struct net *net, + const struct in6_addr *addr, + __u32 scope_id); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/recv.c b/net/rds/recv.c index 192ac6f78ded..4217961fd130 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -41,14 +41,14 @@ #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, - __be32 saddr) + struct in6_addr *saddr) { int i; refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; - inc->i_saddr = saddr; + inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; @@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, EXPORT_SYMBOL_GPL(rds_inc_init); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, - __be32 saddr) + struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; - inc->i_saddr = saddr; + inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; @@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); - rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " "now_cong %d delta %d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, @@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn) struct rds_conn_path *cp; if (conn->c_npaths > 1 && - IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { for (i = 0; i < conn->c_npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_connect_if_down(cp); @@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn) * conn. This lets loopback, who only has one conn for both directions, * tell us which roles the addrs in the conn are playing for this message. 
*/ -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; @@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (inc->i_hdr.h_sport == 0) { - rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + rdsdebug("ignore ping with 0 sport from %pI6c\n", + saddr); goto out; } rds_stats_inc(s_recv_ping); @@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, goto out; } - rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; @@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; @@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, break; } - rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); @@ -707,12 +710,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, rds_stats_inc(s_recv_delivered); - if (sin) { - sin->sin_family = AF_INET; - sin->sin_port = inc->i_hdr.h_sport; - sin->sin_addr.s_addr = inc->i_saddr; - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - msg->msg_namelen = sizeof(*sin); + if (msg->msg_name) { + if (ipv6_addr_v4mapped(&inc->i_saddr)) { + sin = (struct sockaddr_in 
*)msg->msg_name; + + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = + inc->i_saddr.s6_addr32[3]; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)msg->msg_name; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = inc->i_hdr.h_sport; + sin6->sin6_addr = inc->i_saddr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + msg->msg_namelen = sizeof(*sin6); + } } break; } diff --git a/net/rds/send.c b/net/rds/send.c index 94c7f74909be..6ed2e925c36a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, } EXPORT_SYMBOL_GPL(rds_send_drop_acked); -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; @@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) spin_lock_irqsave(&rs->rs_lock, flags); list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { - if (dest && (dest->sin_addr.s_addr != rm->m_daddr || - dest->sin_port != rm->m_inc.i_hdr.h_dport)) + if (dest && + (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || + dest->sin6_port != rm->m_inc.i_hdr.h_dport)) continue; list_move(&rm->m_sock_item, &list); @@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); - __be32 daddr; __be16 dport; struct rds_message *rm = NULL; struct rds_connection *conn; @@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, 
nonblock); struct rds_conn_path *cpath; + struct in6_addr daddr; + __u32 scope_id = 0; size_t total_payload_len = payload_len, rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = ceil(payload_len, PAGE_SIZE); + int namelen; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1081,27 +1085,59 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - if (msg->msg_namelen) { - /* XXX fail non-unicast destination IPs? */ - if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + namelen = msg->msg_namelen; + if (namelen != 0) { + if (namelen < sizeof(*usin)) { + ret = -EINVAL; + goto out; + } + switch (namelen) { + case sizeof(*usin): + if (usin->sin_family != AF_INET || + usin->sin_addr.s_addr == htonl(INADDR_ANY) || + usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { + ret = -EINVAL; + goto out; + } + ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); + dport = usin->sin_port; + break; + + case sizeof(*sin6): { + ret = -EPROTONOSUPPORT; + goto out; + } + + default: ret = -EINVAL; goto out; } - daddr = usin->sin_addr.s_addr; - dport = usin->sin_port; } else { /* We only care about consistency with ->connect() */ lock_sock(sk); daddr = rs->rs_conn_addr; dport = rs->rs_conn_port; + scope_id = rs->rs_bound_scope_id; release_sock(sk); } lock_sock(sk); - if (daddr == 0 || rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) { release_sock(sk); - ret = -ENOTCONN; /* XXX not a great errno */ + ret = -ENOTCONN; goto out; + } else if (namelen != 0) { + /* Cannot send to an IPv4 address using an IPv6 source + * address and cannot send to an IPv6 address using an + * IPv4 source address. 
+ */ + if (ipv6_addr_v4mapped(&daddr) ^ + ipv6_addr_v4mapped(&rs->rs_bound_addr)) { + release_sock(sk); + ret = -EOPNOTSUPP; + goto out; + } } release_sock(sk); @@ -1155,13 +1191,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) conn = rs->rs_conn; else { conn = rds_conn_create_outgoing(sock_net(sock->sk), - rs->rs_bound_addr, daddr, - rs->rs_transport, - sock->sk->sk_allocation); + &rs->rs_bound_addr, &daddr, + rs->rs_transport, + sock->sk->sk_allocation, + scope_id); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351a28474667..dadb33790333 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include "rds.h" #include "tcp.h" @@ -262,9 +264,33 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } -static int rds_tcp_laddr_check(struct net *net, __be32 addr) +static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, + __u32 scope_id) { - if (inet_addr_type(net, addr) == RTN_LOCAL) + struct net_device *dev = NULL; + int ret; + + if (ipv6_addr_v4mapped(addr)) { + if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; + } + + /* If the scope_id is specified, check only those addresses + * hosted on the specified interface. + */ + if (scope_id != 0) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, scope_id); + /* scope_id is not valid... 
*/ + if (!dev) { + rcu_read_unlock(); + return -EADDRNOTAVAIL; + } + rcu_read_unlock(); + } + ret = ipv6_chk_addr(net, addr, dev, 0); + if (ret) return 0; return -EADDRNOTAVAIL; } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index d999e7075645..231ae927858e 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk) * RDS connection as RDS_CONN_UP until the reconnect, * to avoid RDS datagram loss. */ - if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && + if (rds_addr_cmp(&cp->cp_conn->c_laddr, + &cp->cp_conn->c_faddr) >= 0 && rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_ERROR)) { rds_conn_path_drop(cp, false); @@ -88,7 +89,9 @@ out: int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; - struct sockaddr_in src, dest; + struct sockaddr_in sin; + struct sockaddr *addr; + int addrlen; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -112,30 +115,33 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) rds_tcp_tune(sock); - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin.sin_port = 0; + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); - ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + ret = sock->ops->bind(sock, addr, addrlen); if (ret) { - rdsdebug("bind failed with %d at address %pI4\n", + rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); goto out; } - dest.sin_family = AF_INET; - 
dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin.sin_port = htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); /* * once we call connect() we can start getting callbacks and they * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), - O_NONBLOCK); + ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); - rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; if (ret == 0) { diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 22571189f21e..4fdf5b3a47df 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -83,13 +83,12 @@ static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; - bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr); int npaths = max_t(int, 1, conn->c_npaths); /* for mprds, all paths MUST be initiated by the peer * with the smaller address. */ - if (!peer_is_smaller) { + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { /* Make sure we initiate at least one path if this * has not already been done; rds_start_mprds() will * take care of additional paths, if necessary. 
@@ -164,13 +163,16 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); - rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", - &inet->inet_saddr, ntohs(inet->inet_sport), - &inet->inet_daddr, ntohs(inet->inet_dport)); + rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", + &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), + &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); conn = rds_conn_create(sock_net(sock->sk), - inet->inet_saddr, inet->inet_daddr, - &rds_tcp_transport, GFP_KERNEL); + &new_sock->sk->sk_v6_rcv_saddr, + &new_sock->sk->sk_v6_daddr, + &rds_tcp_transport, GFP_KERNEL, + new_sock->sk->sk_bound_dev_if); + if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index b9fbd2ee74ef..42c5ff1eda95 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, tc->t_tinc = tinc; rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, - cp->cp_conn->c_faddr); + &cp->cp_conn->c_faddr); tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = local_clock(); @@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) rds_tcp_cong_recv(conn, tinc); else - rds_recv_incoming(conn, conn->c_faddr, - conn->c_laddr, &tinc->ti_inc, + rds_recv_incoming(conn, &conn->c_faddr, + &conn->c_laddr, + &tinc->ti_inc, arg->gfp); tc->t_tinc_hdr_rem = sizeof(struct rds_header); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 7df869d37afd..78a2554a4497 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -153,7 +153,7 @@ out: * an incoming RST. */ if (rds_conn_path_up(cp)) { - pr_warn("RDS/tcp: send to %pI4 on cp [%d]" + pr_warn("RDS/tcp: send to %pI6c on cp [%d]" "returned %d, " "disconnecting and reconnecting\n", &conn->c_faddr, cp->cp_index, ret); diff --git a/net/rds/threads.c b/net/rds/threads.c index c52861d77a59..e64f9e4c3cda 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) return; } - rdsdebug("conn %p for %pI4 to %pI4 complete\n", - cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); + rdsdebug("conn %p for %pI6c to %pI6c complete\n", + cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); @@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp) unsigned long rand; struct rds_connection *conn = cp->cp_conn; - rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", - conn, &conn->c_laddr, &conn->c_faddr, - cp->cp_reconnect_jiffies); + rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + cp->cp_reconnect_jiffies); /* let peer with smaller addr initiate reconnect, to avoid duels */ if (conn->c_trans->t_type == RDS_TRANS_TCP && - !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) return; set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); @@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) } get_random_bytes(&rand, sizeof(rand)); - rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); @@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work) int ret; if (cp->cp_index > 0 && - !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) + rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); if (ret) { ret = conn->c_trans->conn_path_connect(cp); - rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", - conn, &conn->c_laddr, &conn->c_faddr, ret); + 
rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { if (rds_conn_path_transition(cp, @@ -259,3 +259,50 @@ int rds_threads_init(void) return 0; } + +/* Compare two IPv6 addresses. Return 0 if the two addresses are equal. + * Return 1 if the first is greater. Return -1 if the second is greater. + */ +int rds_addr_cmp(const struct in6_addr *addr1, + const struct in6_addr *addr2) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const __be64 *a1, *a2; + u64 x, y; + + a1 = (__be64 *)addr1; + a2 = (__be64 *)addr2; + + if (*a1 != *a2) { + if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) + return -1; + else + return 1; + } else { + x = be64_to_cpu(*++a1); + y = be64_to_cpu(*++a2); + if (x < y) + return -1; + else if (x > y) + return 1; + else + return 0; + } +#else + u32 a, b; + int i; + + for (i = 0; i < 4; i++) { + if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { + a = ntohl(addr1->s6_addr32[i]); + b = ntohl(addr2->s6_addr32[i]); + if (a < b) + return -1; + else if (a > b) + return 1; + } + } + return 0; +#endif +} +EXPORT_SYMBOL_GPL(rds_addr_cmp); diff --git a/net/rds/transport.c b/net/rds/transport.c index 0b188dd0a344..c9788dbce441 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include #include #include +#include #include "rds.h" #include "loop.h" @@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans) module_put(trans->t_owner); } -struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) +struct rds_transport *rds_trans_get_preferred(struct net *net, + const struct in6_addr *addr, + __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; - if (IN_LOOPBACK(ntohl(addr))) + if (ipv6_addr_v4mapped(addr)) { + if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) + return &rds_loop_transport; + } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; + } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; - if (trans && (trans->laddr_check(net, addr) == 0) && + if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; -- cgit v1.2.3 From 1e2b44e78eead7bcadfbf96f70d95773191541c9 Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Mon, 23 Jul 2018 20:51:22 -0700 Subject: rds: Enable RDS IPv6 support This patch enables RDS to use IPv6 addresses. For RDS/TCP, the listener is now an IPv6 endpoint which accepts both IPv4 and IPv6 connection requests. RDS/RDMA/IB uses a private data (struct rds_ib_connect_private) exchange between endpoints at RDS connection establishment time to support RDMA. This private data exchange uses a 32 bit integer to represent an IP address. This needs to be changed in order to support IPv6. A new private data struct rds6_ib_connect_private is introduced to handle this. To ensure backward compatibility, an IPv6 capable RDS stack uses another RDMA listener port (RDS_CM_PORT) to accept IPv6 connection. And it continues to use the original RDS_PORT for IPv4 RDS connections. 
When it needs to communicate with an IPv6 peer, it uses the RDS_CM_PORT to send the connection set up request. v5: Fixed syntax problem (David Miller). v4: Changed port history comments in rds.h (Sowmini Varadhan). v3: Added support to set up IPv4 connection using mapped address (David Miller). Added support to set up connection between link local and non-link addresses. Various review comments from Santosh Shilimkar and Sowmini Varadhan. v2: Fixed bound and peer address scope mismatched issue. Added back rds_connect() IPv6 changes. Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/af_rds.c | 91 ++++++++++++++++++++++++++++++++++++++++-------- net/rds/bind.c | 59 ++++++++++++++++++++++++++----- net/rds/connection.c | 54 ++++++++++++++++++++-------- net/rds/ib.c | 55 ++++++++++++++++++++++++----- net/rds/ib_cm.c | 20 ++++++++--- net/rds/rdma_transport.c | 30 +++++++++++++++- net/rds/rdma_transport.h | 5 +++ net/rds/rds.h | 22 +++++++----- net/rds/recv.c | 2 +- net/rds/send.c | 61 ++++++++++++++++++++++++++++---- net/rds/tcp.c | 54 +++++++++++++++++----------- net/rds/tcp.h | 2 +- net/rds/tcp_connect.c | 54 +++++++++++++++++++++------- net/rds/tcp_listen.c | 64 +++++++++++++++++++++++++++------- 14 files changed, 459 insertions(+), 114 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index fc1a5c63b783..fc5c48b248fe 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -142,15 +142,32 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, uaddr_len = sizeof(*sin6); } } else { - /* If socket is not yet bound, set the return address family - * to be AF_UNSPEC (value 0) and the address size to be that - * of an IPv4 address. + /* If socket is not yet bound and the socket is connected, + * set the return address family to be the same as the + * connected address, but with 0 address value. 
If it is not + * connected, set the family to be AF_UNSPEC (value 0) and + * the address size to be that of an IPv4 address. */ if (ipv6_addr_any(&rs->rs_bound_addr)) { - sin = (struct sockaddr_in *)uaddr; - memset(sin, 0, sizeof(*sin)); - sin->sin_family = AF_UNSPEC; - return sizeof(*sin); + if (ipv6_addr_any(&rs->rs_conn_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_UNSPEC; + return sizeof(*sin); + } + + if (ipv6_addr_type(&rs->rs_conn_addr) & + IPV6_ADDR_MAPPED) { + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + return sizeof(*sin); + } + + sin6 = (struct sockaddr_in6 *)uaddr; + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + return sizeof(*sin6); } if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { sin = (struct sockaddr_in *)uaddr; @@ -484,16 +501,18 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; struct rds_sock *rs = rds_sk_to_rs(sk); + int addr_type; int ret = 0; lock_sock(sk); - switch (addr_len) { - case sizeof(struct sockaddr_in): + switch (uaddr->sa_family) { + case AF_INET: sin = (struct sockaddr_in *)uaddr; - if (sin->sin_family != AF_INET) { - ret = -EAFNOSUPPORT; + if (addr_len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; break; } if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { @@ -509,12 +528,56 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, rs->rs_conn_port = sin->sin_port; break; - case sizeof(struct sockaddr_in6): - ret = -EPROTONOSUPPORT; + case AF_INET6: + sin6 = (struct sockaddr_in6 *)uaddr; + if (addr_len < sizeof(struct sockaddr_in6)) { + ret = -EINVAL; + break; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) { + ret = -EPROTOTYPE; + break; + } + + /* It is a mapped address. 
Need to do some sanity + * checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) { + ret = -EPROTOTYPE; + break; + } + } + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + /* If socket is already bound to a link local address, + * the peer address must be on the same link. + */ + if (sin6->sin6_scope_id == 0 || + (!ipv6_addr_any(&rs->rs_bound_addr) && + rs->rs_bound_scope_id && + sin6->sin6_scope_id != rs->rs_bound_scope_id)) { + ret = -EINVAL; + break; + } + /* Remember the connected address scope ID. It will + * be checked against the binding local address when + * the socket is bound. + */ + rs->rs_bound_scope_id = sin6->sin6_scope_id; + } + rs->rs_conn_addr = sin6->sin6_addr; + rs->rs_conn_port = sin6->sin6_port; break; default: - ret = -EINVAL; + ret = -EAFNOSUPPORT; break; } diff --git a/net/rds/bind.c b/net/rds/bind.c index c401776ad938..ba778760cbc2 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -127,9 +127,10 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; + rs->rs_bound_scope_id = scope_id; ret = 0; - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); + rdsdebug("rs %p binding to %pI6c:%d\n", + rs, addr, (int)ntohs(*port)); break; } else { rs->rs_bound_addr = in6addr_any; @@ -164,23 +165,53 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; + int addr_type; int ret = 0; __be16 port; - /* We only allow an RDS socket to be bound to an IPv4 address. IPv6 - * address support will be added later. + /* We allow an RDS socket to be bound to either IPv4 or IPv6 + * address. 
*/ - if (addr_len == sizeof(struct sockaddr_in)) { + if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; - if (sin->sin_family != AF_INET || - sin->sin_addr.s_addr == htonl(INADDR_ANY)) + if (addr_len < sizeof(struct sockaddr_in) || + sin->sin_addr.s_addr == htonl(INADDR_ANY) || + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; - } else if (addr_len == sizeof(struct sockaddr_in6)) { - return -EPROTONOSUPPORT; + } else if (uaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; + + if (addr_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) + return -EINVAL; + + /* It is a mapped address. Need to do some sanity + * checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) + return -EINVAL; + } + /* The scope ID must be specified for link local address. */ + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) + return -EINVAL; + scope_id = sin6->sin6_scope_id; + } + binding_addr = &sin6->sin6_addr; + port = sin6->sin6_port; } else { return -EINVAL; } @@ -191,6 +222,16 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ret = -EINVAL; goto out; } + /* Socket is connected. The binding address should have the same + * scope ID as the connected address, except the case when one is + * non-link local address (scope_id is 0). 
+ */ + if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && + rs->rs_bound_scope_id && + scope_id != rs->rs_bound_scope_id) { + ret = -EINVAL; + goto out; + } ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) diff --git a/net/rds/connection.c b/net/rds/connection.c index 3176ead0ab4d..5c9ceed55dae 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -36,6 +36,7 @@ #include #include #include +#include #include "rds.h" #include "loop.h" @@ -200,6 +201,15 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; + /* If the local address is link local, set c_bound_if to be the + * index used for this connection. Otherwise, set it to 0 as + * the socket is not bound to an interface. 
c_bound_if is used + * to look up a socket when a packet is received + */ + if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) + conn->c_bound_if = dev_if; + else + conn->c_bound_if = 0; rds_conn_net_set(conn, net); @@ -486,10 +496,18 @@ void rds_conn_destroy(struct rds_connection *conn) } EXPORT_SYMBOL_GPL(rds_conn_destroy); -static void rds_conn_message_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int want_send) +static void __rds_inc_msg_cp(struct rds_incoming *inc, + struct rds_info_iterator *iter, + void *saddr, void *daddr, int flip) +{ + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, + *(__be32 *)daddr, flip); +} + +static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) { struct hlist_head *head; struct list_head *list; @@ -524,18 +542,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, /* XXX too lazy to maintain counts.. 
*/ list_for_each_entry(rm, list, m_conn_item) { - __be32 laddr; - __be32 faddr; - total++; - laddr = conn->c_laddr.s6_addr32[3]; - faddr = conn->c_faddr.s6_addr32[3]; if (total <= len) - rds_inc_info_copy(&rm->m_inc, - iter, - laddr, - faddr, - 0); + __rds_inc_msg_cp(&rm->m_inc, + iter, + &conn->c_laddr, + &conn->c_faddr, + 0); } spin_unlock_irqrestore(&cp->cp_lock, flags); @@ -548,6 +561,14 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, lens->each = sizeof(struct rds_info_message); } +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send); +} + static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -655,6 +676,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; + if (conn->c_isv6) + return 0; + cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; diff --git a/net/rds/ib.c b/net/rds/ib.c index c712a848957d..756225c5540f 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -39,6 +39,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "rds.h" @@ -295,6 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; + if (conn->c_isv6) + return 0; iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; @@ -330,7 +333,6 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } - /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. @@ -346,8 +348,12 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, { int ret; struct rdma_cm_id *cm_id; + struct sockaddr_in6 sin6; struct sockaddr_in sin; + struct sockaddr *sa; + bool isv4; + isv4 = ipv6_addr_v4mapped(addr); /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ @@ -356,20 +362,53 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr->s6_addr32[3]; + if (isv4) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr->s6_addr32[3]; + sa = (struct sockaddr *)&sin; + } else { + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = *addr; + sin6.sin6_scope_id = scope_id; + sa = (struct sockaddr *)&sin6; + + /* XXX Do a special IPv6 link local address check here. The + * reason is that rdma_bind_addr() always succeeds with IPv6 + * link local address regardless it is indeed configured in a + * system. 
+ */ + if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { + struct net_device *dev; + + if (scope_id == 0) + return -EADDRNOTAVAIL; + + /* Use init_net for now as RDS is not network + * name space aware. + */ + dev = dev_get_by_index(&init_net, scope_id); + if (!dev) + return -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + dev_put(dev); + } + } /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); /* due to this, we will claim to support iWARP devices unless we check node_type. */ if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI6c ret %d node type %d\n", - addr, ret, + rdsdebug("addr %pI6c%%%u ret %d node type %d\n", + addr, scope_id, ret, cm_id->device ? cm_id->device->node_type : -1); rdma_destroy_id(cm_id); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index dd8a867e5a9c..a33b82dc0804 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -678,7 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) return version; } -/* Given an IPv6 address, find the IB net_device which hosts that address and +/* Given an IPv6 address, find the net_device which hosts that address and * return its index. This is used by the rds_ib_cm_handle_connect() code to * find the interface index of where an incoming request comes from when * the request is using a link local address. 
@@ -695,8 +695,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_INFINIBAND && - ipv6_chk_addr(net, addr, dev, 0)) { + if (ipv6_chk_addr(net, addr, dev, 1)) { idx = dev->ifindex; break; } @@ -736,7 +735,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, dp_cmn = &dp->ricp_v6.dp_cmn; saddr6 = &dp->ricp_v6.dp_saddr; daddr6 = &dp->ricp_v6.dp_daddr; - /* If the local address is link local, need to find the + /* If either address is link local, need to find the * interface index in order to create a proper RDS * connection. */ @@ -748,6 +747,14 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, err = -EOPNOTSUPP; goto out; } + } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) { + /* Use our address to find the correct index. */ + ifindex = __rds_find_ifindex(&init_net, daddr6); + /* No index found... Need to bail out. */ + if (ifindex == 0) { + err = -EOPNOTSUPP; + goto out; + } } } else { dp_cmn = &dp->ricp_v4.dp_cmn; @@ -886,7 +893,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - handler = rds_rdma_cm_event_handler; + if (conn->c_isv6) + handler = rds6_rdma_cm_event_handler; + else + handler = rds_rdma_cm_event_handler; ic->i_cm_id = rdma_create_id(&init_net, handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index f49abef69550..bd67e55354f4 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -37,7 +37,9 @@ #include "rdma_transport.h" #include "ib.h" +/* Global IPv4 and IPv6 RDS RDMA listener cm_id */ static struct rdma_cm_id *rds_rdma_listen_id; +static struct rdma_cm_id *rds6_rdma_listen_id; static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, @@ -153,6 +155,12 @@ int 
rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, return rds_rdma_cm_event_handler_cmn(cm_id, event, false); } +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, true); +} + static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, struct sockaddr *sa, struct rdma_cm_id **ret_cm_id) @@ -206,6 +214,7 @@ out: static int rds_rdma_listen_init(void) { int ret; + struct sockaddr_in6 sin6; struct sockaddr_in sin; sin.sin_family = PF_INET; @@ -214,7 +223,21 @@ static int rds_rdma_listen_init(void) ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, (struct sockaddr *)&sin, &rds_rdma_listen_id); - return ret; + if (ret != 0) + return ret; + + sin6.sin6_family = PF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(RDS_CM_PORT); + sin6.sin6_scope_id = 0; + sin6.sin6_flowinfo = 0; + ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler, + (struct sockaddr *)&sin6, + &rds6_rdma_listen_id); + /* Keep going even when IPv6 is not enabled in the system. */ + if (ret != 0) + rdsdebug("Cannot set up IPv6 RDMA listener\n"); + return 0; } static void rds_rdma_listen_stop(void) @@ -224,6 +247,11 @@ static void rds_rdma_listen_stop(void) rdma_destroy_id(rds_rdma_listen_id); rds_rdma_listen_id = NULL; } + if (rds6_rdma_listen_id) { + rdsdebug("cm %p\n", rds6_rdma_listen_id); + rdma_destroy_id(rds6_rdma_listen_id); + rds6_rdma_listen_id = NULL; + } } static int rds_rdma_init(void) diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index d309c4430124..200d3134aaae 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -6,11 +6,16 @@ #include #include "rds.h" +/* RDMA_CM also uses 16385 as the listener port. 
*/ +#define RDS_CM_PORT 16385 + #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); /* from ib.c */ extern struct rds_transport rds_ib_transport; diff --git a/net/rds/rds.h b/net/rds/rds.h index 1bff26988a5e..ff537bb11411 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -24,14 +24,15 @@ #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) -/* - * XXX randomly chosen, but at least seems to be unused: - * # 18464-18768 Unassigned - * We should do better. We want a reserved port to discourage unpriv'ed - * userspace from listening. +/* The following ports, 16385, 18634, 18635, are registered with IANA as + * the ports to be used for RDS over TCP and UDP. Currently, only RDS over + * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value + * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After + * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept + * to ensure compatibility with older RDS modules. Those ports are defined + * in each transport's header file. 
*/ #define RDS_PORT 18634 -#define RDS_CM_PORT 16385 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 @@ -140,7 +141,8 @@ struct rds_connection { struct hlist_node c_hash_node; struct in6_addr c_laddr; struct in6_addr c_faddr; - int c_dev_if; /* c_laddrs's interface index */ + int c_dev_if; /* ifindex used for this conn */ + int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, c_isv6:1, c_ping_triggered:1, @@ -736,7 +738,7 @@ void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); -/* conn.c */ +/* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); @@ -874,6 +876,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); diff --git a/net/rds/recv.c b/net/rds/recv.c index 4217961fd130..1402c21210b1 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -364,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, goto out; } - rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; diff --git a/net/rds/send.c b/net/rds/send.c index 6ed2e925c36a..9604e1faa564 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1091,10 +1091,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) ret = -EINVAL; goto out; } - switch (namelen) { - case sizeof(*usin): - if (usin->sin_family != AF_INET || - usin->sin_addr.s_addr == htonl(INADDR_ANY) || + switch (usin->sin_family) { + case 
AF_INET: + if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { ret = -EINVAL; @@ -1104,9 +1103,44 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) dport = usin->sin_port; break; - case sizeof(*sin6): { - ret = -EPROTONOSUPPORT; - goto out; + case AF_INET6: { + int addr_type; + + if (namelen < sizeof(*sin6)) { + ret = -EINVAL; + goto out; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + __be32 addr4; + + if (!(addr_type & IPV6_ADDR_MAPPED)) { + ret = -EINVAL; + goto out; + } + + /* It is a mapped address. Need to do some + * sanity checks. + */ + addr4 = sin6->sin6_addr.s6_addr32[3]; + if (addr4 == htonl(INADDR_ANY) || + addr4 == htonl(INADDR_BROADCAST) || + IN_MULTICAST(ntohl(addr4))) { + return -EINVAL; + goto out; + } + } + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) { + ret = -EINVAL; + goto out; + } + scope_id = sin6->sin6_scope_id; + } + + daddr = sin6->sin6_addr; + dport = sin6->sin6_port; + break; } default: @@ -1138,6 +1172,19 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) ret = -EOPNOTSUPP; goto out; } + /* If the socket is already bound to a link local address, + * it can only send to peers on the same link. But allow + * communicating between link local and non-link local address. + */ + if (scope_id != rs->rs_bound_scope_id) { + if (!scope_id) { + scope_id = rs->rs_bound_scope_id; + } else if (rs->rs_bound_scope_id) { + release_sock(sk); + ret = -EINVAL; + goto out; + } + } } release_sock(sk); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index dadb33790333..890d0e1d8908 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -46,7 +46,12 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); + +/* rds_tcp_tc_count counts only IPv4 connections. + * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. + */ static unsigned int rds_tcp_tc_count; +static unsigned int rds6_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -113,7 +118,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_del_init(&tc->t_list_item); - rds_tcp_tc_count--; + rds6_tcp_tc_count--; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count--; spin_unlock(&rds_tcp_tc_list_lock); tc->t_sock = NULL; @@ -200,7 +207,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); - rds_tcp_tc_count++; + rds6_tcp_tc_count++; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count++; spin_unlock(&rds_tcp_tc_list_lock); /* accepted sockets need our listen data ready undone */ @@ -221,6 +230,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) write_unlock_bh(&sock->sk->sk_callback_lock); } +/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4 + * connections for backward compatibility. 
+ */ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -228,8 +240,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_tcp_socket tsinfo; struct rds_tcp_connection *tc; unsigned long flags; - struct sockaddr_in sin; - struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -237,16 +247,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, goto out; list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct inet_sock *inet = inet_sk(tc->t_sock->sk); - sock = tc->t_sock; - if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, 0); - tsinfo.local_addr = sin.sin_addr.s_addr; - tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, 1); - tsinfo.peer_addr = sin.sin_addr.s_addr; - tsinfo.peer_port = sin.sin_port; - } + if (tc->t_cpath->cp_conn->c_isv6) + continue; + + tsinfo.local_addr = inet->inet_saddr; + tsinfo.local_port = inet->inet_sport; + tsinfo.peer_addr = inet->inet_daddr; + tsinfo.peer_port = inet->inet_dport; tsinfo.hdr_rem = tc->t_tinc_hdr_rem; tsinfo.data_rem = tc->t_tinc_data_rem; @@ -494,13 +503,18 @@ static __net_init int rds_tcp_init_net(struct net *net) err = -ENOMEM; goto fail; } - rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); if (!rtn->rds_tcp_listen_sock) { - pr_warn("could not set up listen sock\n"); - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - rtn->rds_tcp_sysctl = NULL; - err = -EAFNOSUPPORT; - goto fail; + pr_warn("could not set up IPv6 listen sock\n"); + + /* Try IPv4 as some systems disable IPv6 */ + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); + if (!rtn->rds_tcp_listen_sock) { + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; + } } INIT_WORK(&rtn->rds_tcp_accept_w, 
rds_tcp_accept_worker); return 0; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index c6fa080e9b6d..3c69361d21c7 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -struct socket *rds_tcp_listen_init(struct net *); +struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 231ae927858e..008f50fb25dd 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -89,9 +89,11 @@ out: int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; + struct sockaddr_in6 sin6; struct sockaddr_in sin; struct sockaddr *addr; int addrlen; + bool isv6; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -108,18 +110,36 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) mutex_unlock(&tc->t_conn_path_lock); return 0; } - ret = sock_create_kern(rds_conn_net(conn), PF_INET, - SOCK_STREAM, IPPROTO_TCP, &sock); + if (ipv6_addr_v4mapped(&conn->c_laddr)) { + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = false; + } else { + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = true; + } + if (ret < 0) goto out; rds_tcp_tune(sock); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; - sin.sin_port = 0; - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_laddr; + sin6.sin6_port = 0; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + 
sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin.sin_port = 0; + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } ret = sock->ops->bind(sock, addr, addrlen); if (ret) { @@ -128,11 +148,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) goto out; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; - sin.sin_port = htons(RDS_TCP_PORT); - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_faddr; + sin6.sin6_port = htons(RDS_TCP_PORT); + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin.sin_port = htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } /* * once we call connect() we can start getting callbacks and they diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 4fdf5b3a47df..0cf0147117d8 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -131,6 +131,8 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; + struct in6_addr *my_addr, *peer_addr; + int dev_if; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; @@ -163,15 +165,29 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); + my_addr = &new_sock->sk->sk_v6_rcv_saddr; + peer_addr = &new_sock->sk->sk_v6_daddr; rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", - &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), - &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); + my_addr, ntohs(inet->inet_sport), + peer_addr, ntohs(inet->inet_dport)); + /* sk_bound_dev_if is not set if the peer address is not link local + * address. In this case, it happens that mcast_oif is set. So + * just use it. 
+ */ + if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) && + !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) { + struct ipv6_pinfo *inet6; + + inet6 = inet6_sk(new_sock->sk); + dev_if = inet6->mcast_oif; + } else { + dev_if = new_sock->sk->sk_bound_dev_if; + } conn = rds_conn_create(sock_net(sock->sk), &new_sock->sk->sk_v6_rcv_saddr, &new_sock->sk->sk_v6_daddr, - &rds_tcp_transport, GFP_KERNEL, - new_sock->sk->sk_bound_dev_if); + &rds_tcp_transport, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); @@ -256,15 +272,22 @@ out: ready(sk); } -struct socket *rds_tcp_listen_init(struct net *net) +struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { - struct sockaddr_in sin; struct socket *sock = NULL; + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int addr_len; int ret; - ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) + ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret < 0) { + rdsdebug("could not create %s listener socket: %d\n", + isv6 ? 
"IPv6" : "IPv4", ret); goto out; + } sock->sk->sk_reuse = SK_CAN_REUSE; rds_tcp_nonagle(sock); @@ -274,13 +297,28 @@ struct socket *rds_tcp_listen_init(struct net *net) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6 = (struct sockaddr_in6 *)&ss; + sin6->sin6_family = PF_INET6; + sin6->sin6_addr = in6addr_any; + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_scope_id = 0; + sin6->sin6_flowinfo = 0; + addr_len = sizeof(*sin6); + } else { + sin = (struct sockaddr_in *)&ss; + sin->sin_family = PF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + sin->sin_port = (__force u16)htons(RDS_TCP_PORT); + addr_len = sizeof(*sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) + ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + if (ret < 0) { + rdsdebug("could not bind %s listener socket: %d\n", + isv6 ? "IPv6" : "IPv4", ret); goto out; + } ret = sock->ops->listen(sock, 64); if (ret < 0) -- cgit v1.2.3 From b7ff8b1036f0b0df1390ba6b5e9bc7ec458e857a Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Mon, 23 Jul 2018 20:51:23 -0700 Subject: rds: Extend RDS API for IPv6 support There are many data structures (RDS socket options) used by RDS apps which use a 32 bit integer to store IP address. To support IPv6, struct in6_addr needs to be used. To ensure backward compatibility, a new data structure is introduced for each of those data structures which use a 32 bit integer to represent an IP address. And new socket options are introduced to use those new structures. This means that existing apps should work without a problem with the new RDS module. For apps which want to use IPv6, those new data structures and socket options can be used. 
IPv4 mapped address is used to represent IPv4 address in the new data structures. v4: Revert changes to SO_RDS_TRANSPORT Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 69 +++++++++++++++++++++++++++++++- net/rds/connection.c | 101 +++++++++++++++++++++++++++++++++++++++++++---- net/rds/ib.c | 52 ++++++++++++++++++++++++ net/rds/ib_mr.h | 2 + net/rds/ib_rdma.c | 11 +++++- net/rds/recv.c | 25 ++++++++++++ net/rds/tcp.c | 44 +++++++++++++++++++++ 7 files changed, 293 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 20c6bd0b0007..dc520e1a4123 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ /* - * Copyright (c) 2008 Oracle. All rights reserved. + * Copyright (c) 2008, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -118,7 +118,17 @@ #define RDS_INFO_IB_CONNECTIONS 10008 #define RDS_INFO_CONNECTION_STATS 10009 #define RDS_INFO_IWARP_CONNECTIONS 10010 -#define RDS_INFO_LAST 10010 + +/* PF_RDS6 options */ +#define RDS6_INFO_CONNECTIONS 10011 +#define RDS6_INFO_SEND_MESSAGES 10012 +#define RDS6_INFO_RETRANS_MESSAGES 10013 +#define RDS6_INFO_RECV_MESSAGES 10014 +#define RDS6_INFO_SOCKETS 10015 +#define RDS6_INFO_TCP_SOCKETS 10016 +#define RDS6_INFO_IB_CONNECTIONS 10017 + +#define RDS_INFO_LAST 10017 struct rds_info_counter { __u8 name[32]; @@ -140,6 +150,15 @@ struct rds_info_connection { __u8 flags; } __attribute__((packed)); +struct rds6_info_connection { + __u64 next_tx_seq; + __u64 next_rx_seq; + struct in6_addr laddr; + struct in6_addr faddr; + __u8 transport[TRANSNAMSIZ]; /* null term ascii */ + __u8 flags; +} __attribute__((packed)); + #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 @@ -153,6 +172,17 @@ struct rds_info_message { __u8 flags; } __attribute__((packed)); +struct rds6_info_message { + __u64 seq; + __u32 len; + struct in6_addr laddr; + struct in6_addr faddr; + __be16 lport; + __be16 fport; + __u8 flags; + __u8 tos; +} __attribute__((packed)); + struct rds_info_socket { __u32 sndbuf; __be32 bound_addr; @@ -163,6 +193,16 @@ struct rds_info_socket { __u64 inum; } __attribute__((packed)); +struct rds6_info_socket { + __u32 sndbuf; + struct in6_addr bound_addr; + struct in6_addr connected_addr; + __be16 bound_port; + __be16 connected_port; + __u32 rcvbuf; + __u64 inum; +} __attribute__((packed)); + struct rds_info_tcp_socket { __be32 local_addr; __be16 local_port; @@ -175,6 +215,18 @@ struct rds_info_tcp_socket { __u32 last_seen_una; } __attribute__((packed)); +struct rds6_info_tcp_socket { + struct in6_addr local_addr; + __be16 local_port; + struct in6_addr peer_addr; + __be16 peer_port; + __u64 hdr_rem; + __u64 data_rem; + __u32 last_sent_nxt; + __u32 last_expected_una; + 
__u32 last_seen_una; +} __attribute__((packed)); + #define RDS_IB_GID_LEN 16 struct rds_info_rdma_connection { __be32 src_addr; @@ -189,6 +241,19 @@ struct rds_info_rdma_connection { __u32 rdma_mr_size; }; +struct rds6_info_rdma_connection { + struct in6_addr src_addr; + struct in6_addr dst_addr; + __u8 src_gid[RDS_IB_GID_LEN]; + __u8 dst_gid[RDS_IB_GID_LEN]; + + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 rdma_mr_max; + __u32 rdma_mr_size; +}; + /* RDS message Receive Path Latency points */ enum rds_message_rxpath_latency { RDS_MSG_RX_HDR_TO_DGRAM_START = 0, diff --git a/net/rds/connection.c b/net/rds/connection.c index 5c9ceed55dae..051e35c1e7c6 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -498,16 +498,19 @@ EXPORT_SYMBOL_GPL(rds_conn_destroy); static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, - void *saddr, void *daddr, int flip) + void *saddr, void *daddr, int flip, bool isv6) { - rds_inc_info_copy(inc, iter, *(__be32 *)saddr, - *(__be32 *)daddr, flip); + if (isv6) + rds6_inc_info_copy(inc, iter, saddr, daddr, flip); + else + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, + *(__be32 *)daddr, flip); } static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, - int want_send) + int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; @@ -518,7 +521,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, size_t i; int j; - len /= sizeof(struct rds_info_message); + if (isv6) + len /= sizeof(struct rds6_info_message); + else + len /= sizeof(struct rds_info_message); rcu_read_lock(); @@ -528,6 +534,9 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_conn_path *cp; int npaths; + if (!isv6 && conn->c_isv6) + continue; + npaths = (conn->c_trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); @@ -548,7 +557,7 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, iter, &conn->c_laddr, &conn->c_faddr, - 0); + 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); @@ -558,7 +567,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, rcu_read_unlock(); lens->nr = total; - lens->each = sizeof(struct rds_info_message); + if (isv6) + lens->each = sizeof(struct rds6_info_message); + else + lens->each = sizeof(struct rds_info_message); } static void rds_conn_message_info(struct socket *sock, unsigned int len, @@ -566,7 +578,15 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_lengths *lens, int want_send) { - rds_conn_message_info_cmn(sock, len, iter, lens, want_send); + rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); +} + +static void rds6_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } static void rds_conn_message_info_send(struct socket *sock, unsigned int len, @@ -576,6 +596,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len, rds_conn_message_info(sock, len, iter, lens, 1); } +static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds6_conn_message_info(sock, len, iter, lens, 1); +} + static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -584,6 +611,14 @@ static void rds_conn_message_info_retrans(struct socket *sock, rds_conn_message_info(sock, len, iter, lens, 0); } +static void rds6_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds6_conn_message_info(sock, len, iter, lens, 
0); +} + void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -699,6 +734,34 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) return 1; } +static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) +{ + struct rds6_info_connection *cinfo6 = buffer; + struct rds_connection *conn = cp->cp_conn; + + cinfo6->next_tx_seq = cp->cp_next_tx_seq; + cinfo6->next_rx_seq = cp->cp_next_rx_seq; + cinfo6->laddr = conn->c_laddr; + cinfo6->faddr = conn->c_faddr; + strncpy(cinfo6->transport, conn->c_trans->t_name, + sizeof(cinfo6->transport)); + cinfo6->flags = 0; + + rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), + SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo6->flags, + atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo6->flags, + atomic_read(&cp->cp_state) == RDS_CONN_UP, + CONNECTED); + /* Just return 1 as there is no error case. This is a helper function + * for rds_walk_conn_path_info() and it wants a return value. 
+ */ + return 1; +} + static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -711,6 +774,18 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } +static void rds6_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; + + rds_walk_conn_path_info(sock, len, iter, lens, + rds6_conn_info_visitor, + buffer, + sizeof(struct rds6_info_connection)); +} + int rds_conn_init(void) { int ret; @@ -732,6 +807,11 @@ int rds_conn_init(void) rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); + rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); + rds_info_register_func(RDS6_INFO_SEND_MESSAGES, + rds6_conn_message_info_send); + rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, + rds6_conn_message_info_retrans); return 0; } @@ -750,6 +830,11 @@ void rds_conn_exit(void) rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); + rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); + rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, + rds6_conn_message_info_send); + rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, + rds6_conn_message_info_retrans); } /* diff --git a/net/rds/ib.c b/net/rds/ib.c index 756225c5540f..63d95ea7cdff 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -321,6 +321,43 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, return 1; } +/* IPv6 version of rds_ib_conn_info_visitor(). 
*/ +static int rds6_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds6_info_rdma_connection *iinfo6 = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo6->src_addr = conn->c_laddr; + iinfo6->dst_addr = conn->c_faddr; + + memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); + memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); + + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + rdma_addr_get_sgid(dev_addr, + (union ib_gid *)&iinfo6->src_gid); + rdma_addr_get_dgid(dev_addr, + (union ib_gid *)&iinfo6->dst_gid); + + rds_ibdev = ic->rds_ibdev; + iinfo6->max_send_wr = ic->i_send_ring.w_nr; + iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo6->max_send_sge = rds_ibdev->max_sge; + rds6_ib_get_mr_info(rds_ibdev, iinfo6); + } + return 1; +} + static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -333,6 +370,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } +/* IPv6 version of rds_ib_ic_info(). */ +static void rds6_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8]; + + rds_for_each_conn_info(sock, len, iter, lens, + rds6_ib_conn_info_visitor, + buffer, + sizeof(struct rds6_info_rdma_connection)); +} + /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. 
@@ -441,6 +491,7 @@ void rds_ib_exit(void) rds_ib_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); rds_ib_sysctl_exit(); @@ -502,6 +553,7 @@ int rds_ib_init(void) rds_trans_register(&rds_ib_transport); rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); goto out; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 0ea4ab017a8c..f440ace584c8 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds6_info_rdma_connection *iinfo6); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0ec9df043dd0..e3c8bbbdb43f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -180,6 +180,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } +void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds6_info_rdma_connection *iinfo6) +{ + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; + + iinfo6->rdma_mr_max = pool_1m->max_items; + iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages; +} + struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; diff --git a/net/rds/recv.c b/net/rds/recv.c index 1402c21210b1..03cd8df54c26 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -792,3 +792,28 @@ void rds_inc_info_copy(struct rds_incoming *inc, rds_info_copy(iter, &minfo, sizeof(minfo)); } + +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip) +{ + struct rds6_info_message minfo6; + + minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo6.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo6.laddr = *daddr; + minfo6.faddr = *saddr; + minfo6.lport = inc->i_hdr.h_dport; + minfo6.fport = inc->i_hdr.h_sport; + } else { + minfo6.laddr = *saddr; + minfo6.faddr = *daddr; + minfo6.lport = inc->i_hdr.h_sport; + minfo6.fport = inc->i_hdr.h_dport; + } + + rds_info_copy(iter, &minfo6, sizeof(minfo6)); +} diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 890d0e1d8908..7028d6e51947 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -273,6 +273,48 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } +/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and + * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped + * address. 
+ */ +static void rds6_tcp_tc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds6_info_tcp_socket tsinfo6; + struct rds_tcp_connection *tc; + unsigned long flags; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo6) < rds6_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct sock *sk = tc->t_sock->sk; + struct inet_sock *inet = inet_sk(sk); + + tsinfo6.local_addr = sk->sk_v6_rcv_saddr; + tsinfo6.local_port = inet->inet_sport; + tsinfo6.peer_addr = sk->sk_v6_daddr; + tsinfo6.peer_port = inet->inet_dport; + + tsinfo6.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo6.data_rem = tc->t_tinc_data_rem; + tsinfo6.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo6.last_expected_una = tc->t_last_expected_una; + tsinfo6.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6)); + } + +out: + lens->nr = rds6_tcp_tc_count; + lens->each = sizeof(tsinfo6); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, __u32 scope_id) { @@ -628,6 +670,7 @@ static void rds_tcp_exit(void) rds_tcp_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); unregister_pernet_device(&rds_tcp_net_ops); rds_tcp_destroy_conns(); rds_trans_unregister(&rds_tcp_transport); @@ -659,6 +702,7 @@ static int rds_tcp_init(void) rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); goto out; out_recv: -- cgit v1.2.3 From d17504b16ea270ad858ce117447a8f4aa5a2de73 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 15 Jul 2018 20:52:26 -0700 Subject: wireless/lib80211: Convert from ahash to shash In preparing to remove all stack VLA usage 
from the kernel[1], this removes the discouraged use of AHASH_REQUEST_ON_STACK in favor of the smaller SHASH_DESC_ON_STACK by converting from ahash-wrapped-shash to direct shash. The stack allocation will be made a fixed size in a later patch to the crypto subsystem. [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Signed-off-by: Kees Cook Signed-off-by: Johannes Berg --- net/wireless/lib80211_crypt_tkip.c | 55 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index ba0a1f398ce5..e6bce1f130c9 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -65,9 +65,9 @@ struct lib80211_tkip_data { int key_idx; struct crypto_skcipher *rx_tfm_arc4; - struct crypto_ahash *rx_tfm_michael; + struct crypto_shash *rx_tfm_michael; struct crypto_skcipher *tx_tfm_arc4; - struct crypto_ahash *tx_tfm_michael; + struct crypto_shash *tx_tfm_michael; /* scratch buffers for virt_to_page() (crypto API) */ u8 rx_hdr[16], tx_hdr[16]; @@ -106,8 +106,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); if (IS_ERR(priv->tx_tfm_michael)) { priv->tx_tfm_michael = NULL; goto fail; @@ -120,8 +119,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); if (IS_ERR(priv->rx_tfm_michael)) { priv->rx_tfm_michael = NULL; goto fail; @@ -131,9 +129,9 @@ static void *lib80211_tkip_init(int key_idx) fail: if (priv) { - crypto_free_ahash(priv->tx_tfm_michael); + crypto_free_shash(priv->tx_tfm_michael); crypto_free_skcipher(priv->tx_tfm_arc4); - 
crypto_free_ahash(priv->rx_tfm_michael); + crypto_free_shash(priv->rx_tfm_michael); crypto_free_skcipher(priv->rx_tfm_arc4); kfree(priv); } @@ -145,9 +143,9 @@ static void lib80211_tkip_deinit(void *priv) { struct lib80211_tkip_data *_priv = priv; if (_priv) { - crypto_free_ahash(_priv->tx_tfm_michael); + crypto_free_shash(_priv->tx_tfm_michael); crypto_free_skcipher(_priv->tx_tfm_arc4); - crypto_free_ahash(_priv->rx_tfm_michael); + crypto_free_shash(_priv->rx_tfm_michael); crypto_free_skcipher(_priv->rx_tfm_arc4); } kfree(priv); @@ -510,29 +508,36 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) return keyidx; } -static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr, - u8 * data, size_t data_len, u8 * mic) +static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, + u8 *data, size_t data_len, u8 *mic) { - AHASH_REQUEST_ON_STACK(req, tfm_michael); - struct scatterlist sg[2]; + SHASH_DESC_ON_STACK(desc, tfm_michael); int err; if (tfm_michael == NULL) { pr_warn("%s(): tfm_michael == NULL\n", __func__); return -1; } - sg_init_table(sg, 2); - sg_set_buf(&sg[0], hdr, 16); - sg_set_buf(&sg[1], data, data_len); - if (crypto_ahash_setkey(tfm_michael, key, 8)) + desc->tfm = tfm_michael; + desc->flags = 0; + + if (crypto_shash_setkey(tfm_michael, key, 8)) return -1; - ahash_request_set_tfm(req, tfm_michael); - ahash_request_set_callback(req, 0, NULL, NULL); - ahash_request_set_crypt(req, sg, mic, data_len + 16); - err = crypto_ahash_digest(req); - ahash_request_zero(req); + err = crypto_shash_init(desc); + if (err) + goto out; + err = crypto_shash_update(desc, hdr, 16); + if (err) + goto out; + err = crypto_shash_update(desc, data, data_len); + if (err) + goto out; + err = crypto_shash_final(desc, mic); + +out: + shash_desc_zero(desc); return err; } @@ -654,9 +659,9 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) { struct lib80211_tkip_data *tkey = priv; int keyidx; - struct 
crypto_ahash *tfm = tkey->tx_tfm_michael; + struct crypto_shash *tfm = tkey->tx_tfm_michael; struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4; - struct crypto_ahash *tfm3 = tkey->rx_tfm_michael; + struct crypto_shash *tfm3 = tkey->rx_tfm_michael; struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4; keyidx = tkey->key_idx; -- cgit v1.2.3 From 133bf90dbb8b873286f8ec2e81ba26e863114b8c Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Tue, 10 Jul 2018 16:48:27 +0530 Subject: mac80211: restrict delayed tailroom needed decrement As explained in ieee80211_delayed_tailroom_dec(), during roam, keys of the old AP will be destroyed and new keys will be installed. Deletion of the old key causes crypto_tx_tailroom_needed_cnt to go from 1 to 0 and the new key installation causes a transition from 0 to 1. Whenever crypto_tx_tailroom_needed_cnt transitions from 0 to 1, we invoke synchronize_net(); the reason for doing this is to avoid a race in the TX path as explained in increment_tailroom_need_count(). This synchronize_net() operation can be slow and can affect the station roam time. To avoid this, decrementing the crypto_tx_tailroom_needed_cnt is delayed for a while so that upon installation of new key the transition would be from 1 to 2 instead of 0 to 1 and thereby improving the roam time. This is all correct for a STA iftype, but deferring the tailroom_needed decrement for other iftypes may be unnecessary. For example, let's consider the case of a 4-addr client connecting to an AP for which AP_VLAN interface is also created, let the initial value for tailroom_needed on the AP be 1. 
* 4-addr client connects to the AP (AP: tailroom_needed = 1) * AP will clear old keys, delay decrement of tailroom_needed count * AP_VLAN is created, it takes the tailroom count from master (AP_VLAN: tailroom_needed = 1, AP: tailroom_needed = 1) * Install new key for the station, assume key is plumbed in the HW, there won't be any change in tailroom_needed count on AP iface * Delayed decrement of tailroom_needed count on AP (AP: tailroom_needed = 0, AP_VLAN: tailroom_needed = 1) Because of the delayed decrement on AP iface, tailroom_needed count goes out of sync between AP(master iface) and AP_VLAN(slave iface) and there would be unnecessary tailroom created for the packets going through AP_VLAN iface. Also, WARN_ONs were observed while trying to bring down the AP_VLAN interface: (warn_slowpath_common) (warn_slowpath_null+0x18/0x20) (warn_slowpath_null) (ieee80211_free_keys+0x114/0x1e4) (ieee80211_free_keys) (ieee80211_del_virtual_monitor+0x51c/0x850) (ieee80211_del_virtual_monitor) (ieee80211_stop+0x30/0x3c) (ieee80211_stop) (__dev_close_many+0x94/0xb8) (__dev_close_many) (dev_close_many+0x5c/0xc8) Restricting delayed decrement to station interface alone fixes the problem and it makes sense to do so because delayed decrement is done to improve roam time which is applicable only for client devices. 
Signed-off-by: Manikanta Pubbisetty Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- net/mac80211/key.c | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 02f3672e7b5e..d25da0e66da1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -495,7 +495,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, goto out_unlock; } - ieee80211_key_free(key, true); + ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); ret = 0; out_unlock: diff --git a/net/mac80211/key.c b/net/mac80211/key.c index ee0d0cc8dc3b..c054ac85793c 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -656,11 +656,15 @@ int ieee80211_key_link(struct ieee80211_key *key, { struct ieee80211_local *local = sdata->local; struct ieee80211_key *old_key; - int idx, ret; - bool pairwise; - - pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; - idx = key->conf.keyidx; + int idx = key->conf.keyidx; + bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; + /* + * We want to delay tailroom updates only for station - in that + * case it helps roaming speed, but in other cases it hurts and + * can cause warnings to appear. 
+ */ + bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; + int ret; mutex_lock(&sdata->local->key_mtx); @@ -688,14 +692,14 @@ int ieee80211_key_link(struct ieee80211_key *key, increment_tailroom_need_count(sdata); ieee80211_key_replace(sdata, sta, pairwise, old_key, key); - ieee80211_key_destroy(old_key, true); + ieee80211_key_destroy(old_key, delay_tailroom); ieee80211_debugfs_key_add(key); if (!local->wowlan) { ret = ieee80211_key_enable_hw_accel(key); if (ret) - ieee80211_key_free(key, true); + ieee80211_key_free(key, delay_tailroom); } else { ret = 0; } @@ -930,7 +934,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key, true); + __ieee80211_key_destroy(key, key->sdata->vif.type == + NL80211_IFTYPE_STATION); } for (i = 0; i < NUM_DEFAULT_KEYS; i++) { @@ -940,7 +945,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key, true); + __ieee80211_key_destroy(key, key->sdata->vif.type == + NL80211_IFTYPE_STATION); } mutex_unlock(&local->key_mtx); -- cgit v1.2.3 From 3730cf4dd70b6a36e48d58a862120311411b77f5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Jul 2018 12:47:56 +0200 Subject: netlink: do not store start function in netlink_cb ->start() is called once when dump is being initialized, there is no need to store it in netlink_cb. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 1 - net/netlink/af_netlink.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index f3075d6c7e82..71f121b66ca8 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -170,7 +170,6 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; - int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 393573a99a5a..f6ac7693d2cc 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2300,7 +2300,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); - cb->start = control->start; cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; @@ -2309,8 +2308,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; - if (cb->start) { - ret = cb->start(cb); + if (control->start) { + ret = control->start(cb); if (ret) goto error_put; } -- cgit v1.2.3 From 8dd30201ce66f2c81077e06056f4a865e512e854 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 24 Jul 2018 13:53:00 +0300 Subject: net: remove redundant input checks in SIOCSIFTXQLEN case of dev_ifsioc The cited patch added a call to dev_change_tx_queue_len in SIOCSIFTXQLEN case. This obsoletes the new len comparison check done before the function call. Remove it here. For the decision of keep/remove the negative value check, we examine the range check in dev_change_tx_queue_len. On 64-bit we will fail with -ERANGE. The 32-bit int ifr_qlen will be sign extended to 64-bits when it is passed into dev_change_tx_queue_len().
And then for negative values this test triggers: if (new_len != (unsigned int)new_len) return -ERANGE; because: if (0xffffffffWHATEVER != 0x00000000WHATEVER) On 32-bit the signed value will be accepted, changing behavior. Therefore, the negative value check is kept. Fixes: 3f76df198288 ("net: use dev_change_tx_queue_len() for SIOCSIFTXQLEN") Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Cc: Cong Wang Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 50537ff961a7..90e8aa36881e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -284,12 +284,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; - if (dev->tx_queue_len ^ ifr->ifr_qlen) { - err = dev_change_tx_queue_len(dev, ifr->ifr_qlen); - if (err) - return err; - } - return 0; + return dev_change_tx_queue_len(dev, ifr->ifr_qlen); case SIOCSIFNAME: ifr->ifr_newname[IFNAMSIZ-1] = '\0'; -- cgit v1.2.3 From 50f699b1f8462959482251a6cd1b7bc6bbd20796 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:01 -0700 Subject: sched: fix trailing whitespace Remove trailing whitespace and blank lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/Kconfig | 4 ++-- net/sched/Makefile | 2 +- net/sched/act_connmark.c | 1 - net/sched/act_pedit.c | 1 - net/sched/cls_basic.c | 1 - 5 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7af246764a35..bba71225adbd 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -1,6 +1,6 @@ # # Traffic control configuration. 
-# +# menuconfig NET_SCHED bool "QoS and/or fair queueing" @@ -706,7 +706,7 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT + depends on NET_CLS_ACT ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing diff --git a/net/sched/Makefile b/net/sched/Makefile index 673ee7d26ff2..910ec7463a36 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o -obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 1e31f0e448e2..2f9bc833d046 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -252,4 +252,3 @@ module_exit(connmark_cleanup_module); MODULE_AUTHOR("Felix Fietkau "); MODULE_DESCRIPTION("Connection tracking mark restoring"); MODULE_LICENSE("GPL"); - diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index cc8ffcd1ddb5..9ab5d81aff1a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -516,4 +516,3 @@ static void __exit pedit_cleanup_module(void) module_init(pedit_init_module); module_exit(pedit_cleanup_module); - diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 95367f37098d..6a5dce8baf19 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -324,4 +324,3 @@ static void __exit exit_basic(void) module_init(init_basic) module_exit(exit_basic) MODULE_LICENSE("GPL"); - -- cgit v1.2.3 From bf9b5567da548ec56c263e210f958fa923a79503 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:02 -0700 Subject: wimax: remove blank lines at EOF Signed-off-by: Stephen 
Hemminger Signed-off-by: David S. Miller --- net/wimax/Makefile | 2 -- net/wimax/debugfs.c | 2 -- net/wimax/op-msg.c | 1 - net/wimax/stack.c | 1 - 4 files changed, 6 deletions(-) (limited to 'net') diff --git a/net/wimax/Makefile b/net/wimax/Makefile index eb2db0d3b880..c2a71ae487ac 100644 --- a/net/wimax/Makefile +++ b/net/wimax/Makefile @@ -11,5 +11,3 @@ wimax-y := \ stack.o wimax-$(CONFIG_DEBUG_FS) += debugfs.o - - diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c index 6c9bedb7431e..24514840746e 100644 --- a/net/wimax/debugfs.c +++ b/net/wimax/debugfs.c @@ -76,5 +76,3 @@ void wimax_debugfs_rm(struct wimax_dev *wimax_dev) { debugfs_remove_recursive(wimax_dev->debugfs_dentry); } - - diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c index 54aa146930bd..101b2fa3f32e 100644 --- a/net/wimax/op-msg.c +++ b/net/wimax/op-msg.c @@ -404,4 +404,3 @@ error_no_wimax_dev: d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); return result; } - diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 73dba9c077bb..a6307813b6d5 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -630,4 +630,3 @@ module_exit(wimax_subsys_exit); MODULE_AUTHOR("Intel Corporation "); MODULE_DESCRIPTION("Linux WiMAX stack"); MODULE_LICENSE("GPL"); - -- cgit v1.2.3 From 1cb1d977b41ad9fbcbd57ba24b203d6cb2f79952 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:03 -0700 Subject: rds: remove trailing whitespace and blank lines Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/rds/Kconfig | 1 - net/rds/Makefile | 1 - net/rds/ib.c | 1 - net/rds/message.c | 1 - net/rds/rdma_transport.c | 1 - net/rds/tcp.c | 1 - net/rds/transport.c | 1 - 7 files changed, 7 deletions(-) (limited to 'net') diff --git a/net/rds/Kconfig b/net/rds/Kconfig index bffde4b46c5d..41f75563b54b 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -24,4 +24,3 @@ config RDS_DEBUG bool "RDS debugging messages" depends on RDS default n - diff --git a/net/rds/Makefile b/net/rds/Makefile index b5d568bd479c..e647f9de104a 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -15,4 +15,3 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ tcp_send.o tcp_stats.o ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG - diff --git a/net/rds/ib.c b/net/rds/ib.c index 63d95ea7cdff..a4245c42d43b 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -568,4 +568,3 @@ out: } MODULE_LICENSE("GPL"); - diff --git a/net/rds/message.c b/net/rds/message.c index a35f76971984..4b00b1152a5f 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -514,4 +514,3 @@ void rds_message_unmapped(struct rds_message *rm) wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped); - diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index bd67e55354f4..ad78929036ef 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -281,4 +281,3 @@ module_exit(rds_rdma_exit); MODULE_AUTHOR("Oracle Corporation "); MODULE_DESCRIPTION("RDS: IB transport"); MODULE_LICENSE("Dual BSD/GPL"); - diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 7028d6e51947..f23925af0b8d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -717,4 +717,3 @@ module_init(rds_tcp_init); MODULE_AUTHOR("Oracle Corporation "); MODULE_DESCRIPTION("RDS: TCP transport"); MODULE_LICENSE("Dual BSD/GPL"); - diff --git a/net/rds/transport.c b/net/rds/transport.c index c9788dbce441..46f709a4b577 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -159,4 +159,3 @@ unsigned int 
rds_trans_stats_info_copy(struct rds_info_iterator *iter, return total; } - -- cgit v1.2.3 From a87e87dbf3926ab43b862cd90ec122ebf7d5aad3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:04 -0700 Subject: llc: fix whitespace issues Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/Kconfig | 2 +- net/llc/Makefile | 2 +- net/llc/llc_if.c | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/llc/Kconfig b/net/llc/Kconfig index b91c65108162..176a6c1521a5 100644 --- a/net/llc/Kconfig +++ b/net/llc/Kconfig @@ -6,5 +6,5 @@ config LLC2 tristate "ANSI/IEEE 802.2 LLC type 2 Support" select LLC help - This is a Logical Link Layer type 2, connection oriented support. + This is a Logical Link Layer type 2, connection oriented support. Select this if you want to have support for PF_LLC sockets. diff --git a/net/llc/Makefile b/net/llc/Makefile index 4e260cff3c5d..5e0ef436daae 100644 --- a/net/llc/Makefile +++ b/net/llc/Makefile @@ -4,7 +4,7 @@ # Copyright (c) 1997 by Procom Technology,Inc. # 2001-2003 by Arnaldo Carvalho de Melo # -# This program can be redistributed or modified under the terms of the +# This program can be redistributed or modified under the terms of the # GNU General Public License as published by the Free Software Foundation. # This program is distributed without any warranty or implied warranty # of merchantability or fitness for a particular purpose. diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 6daf391b3e84..8db03c2d5440 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -151,4 +151,3 @@ out: sock_put(sk); return rc; } - -- cgit v1.2.3 From 04c6a3a40a22cff4e25d36eeda0ad590717022f0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:05 -0700 Subject: mpls: remove trailing whitepace Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/mpls/mpls_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 6e558a419f60..94f53a9b7d1a 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -224,7 +224,7 @@ static int mpls_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; - + tun_encap_info = mpls_lwtunnel_encap(lwtstate); if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, -- cgit v1.2.3 From 2e13b580691cf1a3c4bafd723453dbbd7236a428 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:06 -0700 Subject: xfrm: remove blank lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/xfrm/Kconfig | 1 - net/xfrm/xfrm_user.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 286ed25c1a69..eab952cca7d0 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -87,4 +87,3 @@ config NET_KEY_MIGRATE . If unsure, say N. - diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 080035f056d9..09cceab450b8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3280,4 +3280,3 @@ module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); - -- cgit v1.2.3 From aa46225235efc687d971351a309b734549ca5718 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:08 -0700 Subject: sctp: whitespace fixes Remove blank line at EOF and trailing whitespace. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/sctp/Kconfig | 4 ++-- net/sctp/sm_sideeffect.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index c740b189d4ba..950ecf6e7439 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -41,8 +41,8 @@ config SCTP_DBG_OBJCNT bool "SCTP: Debug object counts" depends on PROC_FS help - If you say Y, this will enable debugging support for counting the - type of objects that are currently allocated. This is useful for + If you say Y, this will enable debugging support for counting the + type of objects that are currently allocated. This is useful for identifying memory leaks. This debug information can be viewed by 'cat /proc/net/sctp/sctp_dbg_objcnt' diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 298112ca8c06..85d393090238 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1827,4 +1827,3 @@ nomem: error = -ENOMEM; goto out; } - -- cgit v1.2.3 From ed976ea7307876a8557b4c069edf9314ed7459d0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:09 -0700 Subject: ila: remove blank lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv6/ila/ila_common.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 579310466eac..95e9146918cc 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -153,4 +153,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, /* Now change destination address */ iaddr->loc = p->locator; } - diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 51a15ce50a64..17c455ff69ff 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -663,4 +663,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) return 0; } - -- cgit v1.2.3 From 543de8881dfe759b304bbbcac8c360d1af52c6b3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:11 -0700 Subject: atm: remove blank lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/atm/mpoa_proc.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index b93cc0f18292..46d6cd9a36ae 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -307,9 +307,3 @@ void mpc_proc_clean(void) } #endif /* CONFIG_PROC_FS */ - - - - - - -- cgit v1.2.3 From 9d82a1cdd391e84bfe5d70702ad4efe5be7d2236 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:12 -0700 Subject: ax25: remove blank line at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ax25/ax25_addr.c | 1 - net/ax25/ax25_ds_in.c | 1 - net/ax25/ax25_ds_subr.c | 1 - net/ax25/ax25_ip.c | 1 - net/ax25/ax25_out.c | 1 - 5 files changed, 5 deletions(-) (limited to 'net') diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c index ac2542b7be88..a14cfa736b63 100644 --- a/net/ax25/ax25_addr.c +++ b/net/ax25/ax25_addr.c @@ -304,4 +304,3 @@ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out) } } } - diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c index 891596e74278..488fc2d7085a 100644 --- a/net/ax25/ax25_ds_in.c +++ b/net/ax25/ax25_ds_in.c @@ -299,4 +299,3 @@ int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type) return queued; } - diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 28827e81ba2b..bc0329f43013 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -205,4 +205,3 @@ void ax25_dama_off(ax25_cb *ax25) ax25->condition &= ~AX25_COND_DAMA_MODE; ax25_dev_dama_off(ax25->ax25_dev); } - diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 183b1c583d56..70417e9b932d 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -249,4 +249,3 @@ const struct header_ops ax25_header_ops = { EXPORT_SYMBOL(ax25_header_ops); EXPORT_SYMBOL(ax25_ip_xmit); - diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index b11a5f466fcc..3e5afc8dc93e 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -394,4 +394,3 @@ int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr) } return 0; } - -- cgit v1.2.3 From 27782f403fbfe531442b80f59e7e42ccbe00eb9c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:13 -0700 Subject: x25: remove blank lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/x25/Kconfig | 2 -- net/x25/x25_subr.c | 1 - 2 files changed, 3 deletions(-) (limited to 'net') diff --git a/net/x25/Kconfig b/net/x25/Kconfig index e2fa133f9fba..59fcb41fc5e6 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -31,5 +31,3 @@ config X25 To compile this driver as a module, choose M here: the module will be called x25. If unsure, say N. - - diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 9c214ec681ac..743103786652 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -381,4 +381,3 @@ void x25_check_rbuf(struct sock *sk) x25_stop_timer(sk); } } - -- cgit v1.2.3 From 19c198d9c130d9a6f1427a2e50f1ed1779202f73 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:14 -0700 Subject: decnet: whitespace fixes Remove trailing whitespace and extra lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/decnet/Kconfig | 1 - net/decnet/Makefile | 1 - net/decnet/TODO | 5 ++--- net/decnet/dn_fib.c | 2 -- net/decnet/dn_nsp_in.c | 1 - net/decnet/dn_nsp_out.c | 1 - net/decnet/dn_route.c | 1 - net/decnet/dn_rules.c | 2 -- net/decnet/netfilter/Makefile | 1 - net/decnet/netfilter/dn_rtmsg.c | 1 - 10 files changed, 2 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index f3393e154f0f..dcc74956badd 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -40,4 +40,3 @@ config DECNET_ROUTER to work. See for more information. 
- diff --git a/net/decnet/Makefile b/net/decnet/Makefile index 9e38122d942b..07b38e441b2d 100644 --- a/net/decnet/Makefile +++ b/net/decnet/Makefile @@ -8,4 +8,3 @@ decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o decnet-y += sysctl_net_decnet.o obj-$(CONFIG_NETFILTER) += netfilter/ - diff --git a/net/decnet/TODO b/net/decnet/TODO index ebb5ac69d128..358e9eb49016 100644 --- a/net/decnet/TODO +++ b/net/decnet/TODO @@ -16,14 +16,14 @@ Steve's quick list of things that need finishing off: o Verify errors etc. against POSIX 1003.1g (draft) - o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) + o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) [maybe this should be done at socket level... the control data in the send/recvmsg() calls should simply be a vector of set/getsockopt() calls] o check MSG_CTRUNC is set where it should be. - o Find all the commonality between DECnet and IPv4 routing code and extract + o Find all the commonality between DECnet and IPv4 routing code and extract it into a small library of routines. [probably a project for 2.7.xx] o Add perfect socket hashing - an idea suggested by Paul Koning. 
Currently @@ -38,4 +38,3 @@ Steve's quick list of things that need finishing off: o DECnet sendpages() function o AIO for DECnet - diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index fce94cbd4378..f78fe58eafc8 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -797,5 +797,3 @@ void __init dn_fib_init(void) rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0); } - - diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 34aba55ed573..2fb5e055ba25 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -912,4 +912,3 @@ free_out: return NET_RX_SUCCESS; } - diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 56a52a004c56..a1779de6bd9c 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -701,4 +701,3 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) dn_nsp_send(skb); } - diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e74765024d88..3107a2e24e6b 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1925,4 +1925,3 @@ void __exit dn_route_cleanup(void) remove_proc_entry("decnet_cache", init_net.proc_net); dst_entries_destroy(&dn_dst_ops); } - diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 72236695db3d..4a4e3c17740c 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -256,5 +256,3 @@ void __exit dn_fib_rules_cleanup(void) rtnl_unlock(); rcu_barrier(); } - - diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile index 255c1ae9daeb..b579e52130aa 100644 --- a/net/decnet/netfilter/Makefile +++ b/net/decnet/netfilter/Makefile @@ -3,4 +3,3 @@ # obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o - diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index ab395e55cd78..a4faacadd8a8 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -158,4 +158,3 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); 
module_init(dn_rtmsg_init); module_exit(dn_rtmsg_fini); - -- cgit v1.2.3 From a17922def7ca6dba9f40b09a8b36f9cbe3b8bbf3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:16 -0700 Subject: bpfilter: remove trailing newline Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bpfilter/Kconfig | 1 - net/ipv4/bpfilter/Makefile | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index 76deb6615883..e558b46596c4 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -13,4 +13,3 @@ config BPFILTER_UMH help This builds bpfilter kernel module with embedded user mode helper endif - diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile index ce262d76cc48..e9e42f99725e 100644 --- a/net/ipv4/bpfilter/Makefile +++ b/net/ipv4/bpfilter/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_BPFILTER) += sockopt.o - -- cgit v1.2.3 From c2df5603678b5b0d47ca70469934d1c146b29d9b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:17 -0700 Subject: l2tp: remove trailing newline Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1ea285bad84b..c8fc0f7f0b4b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1795,4 +1795,3 @@ MODULE_AUTHOR("James Chapman "); MODULE_DESCRIPTION("L2TP core"); MODULE_LICENSE("GPL"); MODULE_VERSION(L2TP_DRV_VERSION); - -- cgit v1.2.3 From e446a2760f1e265192accd7ddebd3ca5ff1d57bb Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:18 -0700 Subject: net: remove blank lines at end of file Several files have extra line at end of file. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/8021q/Makefile | 1 - net/Kconfig | 2 +- net/core/neighbour.c | 1 - net/dns_resolver/dns_key.c | 1 - net/ieee802154/core.c | 1 - net/ieee802154/nl_policy.c | 1 - net/ipv4/Kconfig | 4 ++-- net/ipv4/Makefile | 2 +- net/ipv6/Kconfig | 2 +- net/iucv/af_iucv.c | 1 - net/kcm/Kconfig | 1 - net/kcm/kcmsock.c | 1 - net/mac80211/rc80211_minstrel.c | 1 - 13 files changed, 5 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/8021q/Makefile b/net/8021q/Makefile index 9b703454b93e..e05d4d7aab35 100644 --- a/net/8021q/Makefile +++ b/net/8021q/Makefile @@ -9,4 +9,3 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q.o 8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o 8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o 8021q-$(CONFIG_PROC_FS) += vlanproc.o - diff --git a/net/Kconfig b/net/Kconfig index f738a6f27665..228dfa382eec 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -12,7 +12,7 @@ menuconfig NET The reason is that some programs need kernel networking support even when running on a stand-alone machine that isn't connected to any other computer. - + If you are upgrading from an older kernel, you should consider updating your networking tools too because changes in the kernel and the tools often go hand in hand. 
The tools are diff --git a/net/core/neighbour.c b/net/core/neighbour.c index cbe85d8d4cc2..aa19d86937af 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3274,4 +3274,3 @@ static int __init neigh_init(void) } subsys_initcall(neigh_init); - diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 0c9478b91fa5..7f4534828f6c 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -320,4 +320,3 @@ static void __exit exit_dns_resolver(void) module_init(init_dns_resolver) module_exit(exit_dns_resolver) MODULE_LICENSE("GPL"); - diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index cb7176cd4cd6..fe225d9a1877 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -400,4 +400,3 @@ module_exit(wpan_phy_class_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); MODULE_AUTHOR("Dmitry Eremin-Solenikov"); - diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c index 35c432668454..78f6f1233194 100644 --- a/net/ieee802154/nl_policy.c +++ b/net/ieee802154/nl_policy.c @@ -75,4 +75,3 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, }, [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, }, }; - diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 80dad301361d..32cae39cdff6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -430,7 +430,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: - + http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 If unsure, say Y. @@ -600,7 +600,7 @@ config TCP_CONG_VENO distinguishing to circumvent the difficult judgment of the packet loss type. TCP Veno cuts down less congestion window in response to random loss packets. 
- See + See config TCP_CONG_YEAH tristate "YeAH TCP" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index eec9569ffa5c..7446b98661d8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index b3885ca22d6f..613282c65a10 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -15,7 +15,7 @@ menuconfig IPV6 Documentation/networking/ipv6.txt and read the HOWTO at - To compile this protocol support as a module, choose M here: the + To compile this protocol support as a module, choose M here: the module will be called ipv6. if IPV6 diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 893a022f9620..8d1c43f8fed4 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -2515,4 +2515,3 @@ MODULE_DESCRIPTION("IUCV Sockets ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_IUCV); - diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index 87fca36e6c47..9ca83f2ade6f 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -8,4 +8,3 @@ config AF_KCM KCM (Kernel Connection Multiplexor) sockets provide a method for multiplexing messages of a message based application protocol over kernel connectons (e.g. TCP connections). 
- diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d3601d421571..571d824e4e24 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -2104,4 +2104,3 @@ module_exit(kcm_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KCM); - diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 76048b53c5b2..07fb219327d6 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -751,4 +751,3 @@ rc80211_minstrel_exit(void) { ieee80211_rate_control_unregister(&mac80211_minstrel); } - -- cgit v1.2.3 From aea5f654e6b78a0c976f7a25950155932c77a53f Mon Sep 17 00:00:00 2001 From: Nishanth Devarajan Date: Mon, 23 Jul 2018 19:37:41 +0530 Subject: net/sched: add skbprio scheduler Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets according to their skb->priority field. Under congestion, already-enqueued lower priority packets will be dropped to make space available for higher priority packets. Skbprio was conceived as a solution for denial-of-service defenses that need to route packets with different priorities as a means to overcome DoS attacks. v5 *Do not reference qdisc_dev(sch)->tx_queue_len for setting limit. Instead set default sch->limit to 64. v4 *Drop Documentation/networking/sch_skbprio.txt doc file to move it to tc man page for Skbprio, in iproute2. v3 *Drop max_limit parameter in struct skbprio_sched_data and instead use sch->limit. *Reference qdisc_dev(sch)->tx_queue_len only once, during initialisation for qdisc (previously being referenced every time qdisc changes). *Move qdisc's detailed description from in-code to Documentation/networking. *When qdisc is saturated, enqueue incoming packet first before dequeueing lowest priority packet in queue - improves usage of call stack registers. *Introduce and use overlimit stat to keep track of number of dropped packets. v2 *Use skb->priority field rather than DS field. 
Rename queueing discipline as SKB Priority Queue (previously Gatekeeper Priority Queue). *Queueing discipline is made classful to expose Skbprio's internal priority queues. Signed-off-by: Nishanth Devarajan Reviewed-by: Sachin Paryani Reviewed-by: Cody Doucette Reviewed-by: Michel Machado Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 15 ++ net/sched/Kconfig | 13 ++ net/sched/Makefile | 1 + net/sched/sch_skbprio.c | 320 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 349 insertions(+) create mode 100644 net/sched/sch_skbprio.c (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d9cc9dc4f547..8975fd1a1421 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -124,6 +124,21 @@ struct tc_fifo_qopt { __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ }; +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + /* PRIO section */ #define TCQ_PRIO_BANDS 16 diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bba71225adbd..e95741388311 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -251,6 +251,19 @@ config NET_SCH_MQPRIO If unsure, say N. +config NET_SCH_SKBPRIO + tristate "SKB priority queue scheduler (SKBPRIO)" + help + Say Y here if you want to use the SKB priority queue + scheduler. This schedules packets according to skb->priority, + which is useful for request packets in DoS mitigation systems such + as Gatekeeper. + + To compile this driver as a module, choose M here: the module will + be called sch_skbprio. + + If unsure, say N. 
+ config NET_SCH_CHOKE tristate "CHOose and Keep responsive flow scheduler (CHOKE)" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 910ec7463a36..f0403f49edcb 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c new file mode 100644 index 000000000000..52c0b6d8f1d7 --- /dev/null +++ b/net/sched/sch_skbprio.c @@ -0,0 +1,320 @@ +/* + * net/sched/sch_skbprio.c SKB Priority Queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Nishanth Devarajan, + * Cody Doucette, + * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu + */ + +#include <linux/string.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/inet_ecn.h> + +/* SKB Priority Queue + * ================================= + * + * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes + * packets according to their skb->priority field. Under congestion, + * Skbprio drops already-enqueued lower priority packets to make space + * available for higher priority packets; it was conceived as a solution + * for denial-of-service defenses that need to route packets with different + * priorities as a mean to overcome DoS attacks. + */ + +struct skbprio_sched_data { + /* Queue state.
*/ + struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY]; + struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY]; + u16 highest_prio; + u16 lowest_prio; +}; + +static u16 calc_new_high_prio(const struct skbprio_sched_data *q) +{ + int prio; + + for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) { + if (!skb_queue_empty(&q->qdiscs[prio])) + return prio; + } + + /* SKB queue is empty, return 0 (default highest priority setting). */ + return 0; +} + +static u16 calc_new_low_prio(const struct skbprio_sched_data *q) +{ + int prio; + + for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) { + if (!skb_queue_empty(&q->qdiscs[prio])) + return prio; + } + + /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1 + * (default lowest priority setting). + */ + return SKBPRIO_MAX_PRIORITY - 1; +} + +static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; + struct skbprio_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *qdisc; + struct sk_buff_head *lp_qdisc; + struct sk_buff *to_drop; + u16 prio, lp; + + /* Obtain the priority of @skb. */ + prio = min(skb->priority, max_priority); + + qdisc = &q->qdiscs[prio]; + if (sch->q.qlen < sch->limit) { + __skb_queue_tail(qdisc, skb); + qdisc_qstats_backlog_inc(sch, skb); + q->qstats[prio].backlog += qdisc_pkt_len(skb); + + /* Check to update highest and lowest priorities. */ + if (prio > q->highest_prio) + q->highest_prio = prio; + + if (prio < q->lowest_prio) + q->lowest_prio = prio; + + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + + /* If this packet has the lowest priority, drop it. 
*/ + lp = q->lowest_prio; + if (prio <= lp) { + q->qstats[prio].drops++; + q->qstats[prio].overlimits++; + return qdisc_drop(skb, sch, to_free); + } + + __skb_queue_tail(qdisc, skb); + qdisc_qstats_backlog_inc(sch, skb); + q->qstats[prio].backlog += qdisc_pkt_len(skb); + + /* Drop the packet at the tail of the lowest priority qdisc. */ + lp_qdisc = &q->qdiscs[lp]; + to_drop = __skb_dequeue_tail(lp_qdisc); + BUG_ON(!to_drop); + qdisc_qstats_backlog_dec(sch, to_drop); + qdisc_drop(to_drop, sch, to_free); + + q->qstats[lp].backlog -= qdisc_pkt_len(to_drop); + q->qstats[lp].drops++; + q->qstats[lp].overlimits++; + + /* Check to update highest and lowest priorities. */ + if (skb_queue_empty(lp_qdisc)) { + if (q->lowest_prio == q->highest_prio) { + /* The incoming packet is the only packet in queue. */ + BUG_ON(sch->q.qlen != 1); + q->lowest_prio = prio; + q->highest_prio = prio; + } else { + q->lowest_prio = calc_new_low_prio(q); + } + } + + if (prio > q->highest_prio) + q->highest_prio = prio; + + return NET_XMIT_CN; +} + +static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio]; + struct sk_buff *skb = __skb_dequeue(hpq); + + if (unlikely(!skb)) + return NULL; + + sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + + q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb); + + /* Update highest priority field. 
*/ + if (skb_queue_empty(hpq)) { + if (q->lowest_prio == q->highest_prio) { + BUG_ON(sch->q.qlen); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; + } else { + q->highest_prio = calc_new_high_prio(q); + } + } + return skb; +} + +static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct tc_skbprio_qopt *ctl = nla_data(opt); + + sch->limit = ctl->limit; + return 0; +} + +static int skbprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + /* Initialise all queues, one for each possible priority. */ + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_head_init(&q->qdiscs[prio]); + + memset(&q->qstats, 0, sizeof(q->qstats)); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; + sch->limit = 64; + if (!opt) + return 0; + + return skbprio_change(sch, opt, extack); +} + +static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tc_skbprio_qopt opt; + + opt.limit = sch->limit; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + return -1; + + return skb->len; +} + +static void skbprio_reset(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_purge(&q->qdiscs[prio]); + + memset(&q->qstats, 0, sizeof(q->qstats)); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; +} + +static void skbprio_destroy(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_purge(&q->qdiscs[prio]); +} + +static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long skbprio_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static int 
skbprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1], + q->qstats[cl - 1].qlen) < 0) + return -1; + return 0; +} + +static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops skbprio_class_ops = { + .leaf = skbprio_leaf, + .find = skbprio_find, + .dump = skbprio_dump_class, + .dump_stats = skbprio_dump_class_stats, + .walk = skbprio_walk, +}; + +static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { + .cl_ops = &skbprio_class_ops, + .id = "skbprio", + .priv_size = sizeof(struct skbprio_sched_data), + .enqueue = skbprio_enqueue, + .dequeue = skbprio_dequeue, + .peek = qdisc_peek_dequeued, + .init = skbprio_init, + .reset = skbprio_reset, + .change = skbprio_change, + .dump = skbprio_dump, + .destroy = skbprio_destroy, + .owner = THIS_MODULE, +}; + +static int __init skbprio_module_init(void) +{ + return register_qdisc(&skbprio_qdisc_ops); +} + +static void __exit skbprio_module_exit(void) +{ + unregister_qdisc(&skbprio_qdisc_ops); +} + +module_init(skbprio_module_init) +module_exit(skbprio_module_exit) + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 158abbf170ecfc6d56abeddd0c66da753b3435df Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Wed, 25 Jul 2018 02:31:25 +0000 Subject: net/sched: cls_flower: Use correct inline function for assignment of vlan tpid This fixes the following sparse warning: net/sched/cls_flower.c:1356:36: warning: incorrect type in argument 3 
(different base types) net/sched/cls_flower.c:1356:36: expected unsigned short [unsigned] [usertype] value net/sched/cls_flower.c:1356:36: got restricted __be16 [usertype] vlan_tpid Signed-off-by: Jianbo Liu Reported-by: Or Gerlitz Reviewed-by: Or Gerlitz Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6ccf60364297..e8bd08ba998a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1445,8 +1445,8 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, TCA_FLOWER_KEY_CVLAN_PRIO, &key->cvlan, &mask->cvlan) || (mask->cvlan.vlan_tpid && - nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, - key->cvlan.vlan_tpid))) + nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->cvlan.vlan_tpid))) goto nla_put_failure; if (mask->basic.n_proto) { -- cgit v1.2.3 From 55477206f15cf725178be23344179bd83f773c7b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 25 Jul 2018 06:06:07 +0000 Subject: tcp: make function tcp_retransmit_stamp() static Fixes the following sparse warnings: net/ipv4/tcp_timer.c:25:5: warning: symbol 'tcp_retransmit_stamp' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: David S. 
Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a242f8874629..7fdf222a0bdf 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,7 +22,7 @@ #include #include -u32 tcp_retransmit_stamp(const struct sock *sk) +static u32 tcp_retransmit_stamp(const struct sock *sk) { u32 start_ts = tcp_sk(sk)->retrans_stamp; -- cgit v1.2.3 From 934ffce1343f22ed5e2d0bd6da4440f4848074de Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 25 Jul 2018 16:54:33 +0800 Subject: xfrm: fix 'passing zero to ERR_PTR()' warning Fix a static code checker warning: net/xfrm/xfrm_policy.c:1836 xfrm_resolve_and_create_bundle() warn: passing zero to 'ERR_PTR' xfrm_tmpl_resolve return 0 just means no xdst found, return NULL instead of passing zero to ERR_PTR. Fixes: d809ec895505 ("xfrm: do not assume that template resolving always returns xfrms") Signed-off-by: YueHaibing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2f70fe68b9b0..69f06f879091 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1752,7 +1752,10 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { - if (err != 0 && err != -EAGAIN) + if (err == 0) + return NULL; + + if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } -- cgit v1.2.3 From 44e2b838c24d883dae8496dc7b6ddac7956ba53c Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Wed, 25 Jul 2018 13:45:29 -0700 Subject: xfrm: Return detailed errors from xfrmi_newlink Currently all failure modes of xfrm interface creation return EEXIST. 
This change improves the granularity of errnos provided by also returning ENODEV or EINVAL if failures happen in looking up the underlying interface, or a required parameter is not provided. This change has been tested against the Android Kernel Networking Tests, with additional xfrmi_newlink tests here: https://android-review.googlesource.com/c/kernel/tests/+/715755 Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index ccfe18d67e98..481d7307ab51 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -149,14 +149,18 @@ static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) char name[IFNAMSIZ]; int err; - if (p->name[0]) + if (p->name[0]) { strlcpy(name, p->name, IFNAMSIZ); - else + } else { + err = -EINVAL; goto failed; + } dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup); - if (!dev) + if (!dev) { + err = -EAGAIN; goto failed; + } dev_net_set(dev, net); @@ -165,8 +169,10 @@ static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) xi->net = net; xi->dev = dev; xi->phydev = dev_get_by_index(net, p->link); - if (!xi->phydev) + if (!xi->phydev) { + err = -ENODEV; goto failed_free; + } err = xfrmi_create2(dev); if (err < 0) @@ -179,7 +185,7 @@ failed_dev_put: failed_free: free_netdev(dev); failed: - return NULL; + return ERR_PTR(err); } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, @@ -194,13 +200,13 @@ static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, xip = &xi->next) { if (xi->p.if_id == p->if_id) { if (create) - return NULL; + return ERR_PTR(-EEXIST); return xi; } } if (!create) - return NULL; + return ERR_PTR(-ENODEV); return xfrmi_create(net, p); } @@ -682,8 +688,9 @@ static int xfrmi_newlink(struct net 
*src_net, struct net_device *dev, nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); - if (!xfrmi_locate(net, p, 1)) - return -EEXIST; + xi = xfrmi_locate(net, p, 1); + if (IS_ERR(xi)) + return PTR_ERR(xi); return 0; } @@ -704,11 +711,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], xi = xfrmi_locate(net, &xi->p, 0); - if (xi) { + if (IS_ERR_OR_NULL(xi)) { + xi = netdev_priv(dev); + } else { if (xi->dev != dev) return -EEXIST; - } else - xi = netdev_priv(dev); + } return xfrmi_update(xi, &xi->p); } -- cgit v1.2.3 From 947541f36c561b5e0ca639ffc450a8c5221de467 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Jul 2018 16:35:30 +0200 Subject: net/smc: fewer parameters for smc_llc_send_confirm_link() Link confirmation will always be sent across the new link being confirmed. This allows to shrink the parameter list. No functional change. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 ++-------- net/smc/smc_llc.c | 9 +++++---- net/smc/smc_llc.h | 2 +- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7fc810ec31c5..7883f70f7c6d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -352,10 +352,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK response over RoCE fabric */ - rc = smc_llc_send_confirm_link(link, - link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_RESP); + rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TCL; @@ -951,10 +948,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK request to client over the RoCE fabric */ - rc = smc_llc_send_confirm_link(link, - link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_REQ); + rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); if (rc < 0) 
return SMC_CLC_DECL_TCL; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b7944aa1ffc3..f2ba99c2e69a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -182,8 +182,7 @@ static int smc_llc_add_pending_send(struct smc_link *link, } /* high-level API to send LLC confirm link */ -int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], - union ib_gid *gid, +int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { struct smc_link_group *lgr = smc_get_lgr(link); @@ -202,8 +201,10 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; - memcpy(confllc->sender_mac, mac, ETH_ALEN); - memcpy(confllc->sender_gid, gid, SMC_GID_SIZE); + memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + memcpy(confllc->sender_gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 65c8645e96a1..9a29fcbbcea8 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -36,7 +36,7 @@ enum smc_llc_msg_type { }; /* transmit */ -int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, +int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); -- cgit v1.2.3 From 7005ada68d1774d7c1109deaba0c2cd8e46f5091 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Jul 2018 16:35:31 +0200 Subject: net/smc: use correct vlan gid of RoCE device SMC code uses the base gid for VLAN traffic. The gids exchanged in the CLC handshake and the gid index used for the QP have to switch from the base gid to the appropriate vlan gid. 
When searching for a matching IB device port for a certain vlan device, it does not make sense to return an IB device port, which is not enabled for the used vlan_id. Add another check whether a vlan gid exists for a certain IB device port. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 24 +++++++++++++----------- net/smc/smc_clc.c | 10 ++++------ net/smc/smc_clc.h | 2 +- net/smc/smc_core.c | 37 ++++--------------------------------- net/smc/smc_core.h | 5 +++-- net/smc/smc_diag.c | 2 +- net/smc/smc_ib.c | 41 ++++++++++++++++++++++++++++++++++++----- net/smc/smc_ib.h | 3 ++- net/smc/smc_llc.c | 15 +++++---------- net/smc/smc_llc.h | 2 +- net/smc/smc_pnet.c | 30 +++++++++++++++++++++--------- net/smc/smc_pnet.h | 3 ++- 12 files changed, 93 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 7883f70f7c6d..b81797103260 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -370,8 +370,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) /* send add link reject message, only one link supported for now */ rc = smc_llc_send_add_link(link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_RESP); + link->gid, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TCL; @@ -469,7 +468,7 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, /* check if there is a rdma device available for this connection. 
*/ /* called for connect and listen */ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport) + u8 *ibport, unsigned short vlan_id, u8 gid[]) { int reason_code = 0; @@ -477,7 +476,8 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport); + smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, + gid); if (!(*ibdev)) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ @@ -523,12 +523,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, struct smc_ib_device *ibdev, u8 ibport, - struct smcd_dev *ismdev) + u8 gid[], struct smcd_dev *ismdev) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, ismdev); + rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -650,6 +650,7 @@ static int __smc_connect(struct smc_sock *smc) struct smc_clc_msg_accept_confirm aclc; struct smc_ib_device *ibdev; struct smcd_dev *ismdev; + u8 gid[SMC_GID_SIZE]; unsigned short vlan; int smc_type; int rc = 0; @@ -681,7 +682,7 @@ static int __smc_connect(struct smc_sock *smc) } /* check if there is a rdma device available */ - if (!smc_check_rdma(smc, &ibdev, &ibport)) { + if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { /* RDMA is supported for this connection */ rdma_supported = true; if (ism_supported) @@ -695,7 +696,7 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, ismdev); + rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, 
ismdev); if (rc) { smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); @@ -970,8 +971,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) /* send ADD LINK request to client over the RoCE fabric */ rc = smc_llc_send_add_link(link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_REQ); + link->gid, SMC_LLC_REQ); if (rc < 0) return SMC_CLC_DECL_TCL; @@ -1193,6 +1193,7 @@ static void smc_listen_work(struct work_struct *work) struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; int local_contact = 0; + unsigned short vlan; int reason_code = 0; int rc = 0; u8 ibport; @@ -1241,7 +1242,8 @@ static void smc_listen_work(struct work_struct *work) /* check if RDMA is available */ if (!ism_supported && ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_check_rdma(new_smc, &ibdev, &ibport) || + smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || + smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || smc_listen_rdma_check(new_smc, pclc) || smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, &local_contact) || diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ad39efdb4f1c..78d74938a9d9 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -378,7 +378,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *ibdev, u8 ibport, + struct smc_ib_device *ibdev, u8 ibport, u8 gid[], struct smcd_dev *ismdev) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; @@ -409,7 +409,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, /* add SMC-R specifics */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, &ibdev->gid[ibport - 1], SMC_GID_SIZE); + memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE); memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], 
ETH_ALEN); pclc.iparea_offset = htons(0); } @@ -492,8 +492,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); + memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE); memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); @@ -566,8 +565,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) link = &conn->lgr->lnk[SMC_SINGLE_LINK]; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); + memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 100e988ad1a8..6bdc63352d6a 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -179,7 +179,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *smcibdev, u8 ibport, + struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], struct smcd_dev *ismdev); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 66741e61a3b0..90c10ae9ae09 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -219,6 +219,10 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto 
free_lgr; rc = smc_llc_link_init(lnk); if (rc) goto free_lgr; @@ -522,37 +526,6 @@ out: return rc; } -/* determine the link gid matching the vlan id of the link group */ -static int smc_link_determine_gid(struct smc_link_group *lgr) -{ - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - struct ib_gid_attr gattr; - union ib_gid gid; - int i; - - if (!lgr->vlan_id) { - lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1]; - return 0; - } - - for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len; - i++) { - if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, - &gattr)) - continue; - if (gattr.ndev) { - if (is_vlan_dev(gattr.ndev) && - vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) { - lnk->gid = gid; - dev_put(gattr.ndev); - return 0; - } - dev_put(gattr.ndev); - } - } - return -ENODEV; -} - static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role) @@ -631,8 +604,6 @@ create: if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (!is_smcd) - rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1e8974c50550..a4f0cc4e0270 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -84,14 +84,15 @@ struct smc_link { wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ - union ib_gid gid; /* gid matching used vlan id */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ + u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu peer_mtu; /* mtu size of peer */ u32 psn_initial; /* QP tx initial packet seqno */ u32 peer_psn; /* QP rx initial packet seqno */ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ - u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/ + u8 peer_gid[SMC_GID_SIZE]; /* gid of 
peer*/ u8 link_id; /* unique # within link group */ enum smc_link_state state; /* state of link */ diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index d772cd10297e..a3cf7313a2d3 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -154,7 +154,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, - smc->conn.lgr->lnk[0].gid.raw); + smc->conn.lgr->lnk[0].gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, smc->conn.lgr->lnk[0].peer_gid); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 4706ab7092a9..2cc64bc8ae20 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -68,7 +68,7 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); - rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0); + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0); rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, sizeof(lnk->peer_mac)); @@ -142,13 +142,13 @@ out: return rc; } -static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) +static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) { struct ib_gid_attr gattr; + union ib_gid gid; int rc; - rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], &gattr); + rc = ib_query_gid(smcibdev->ibdev, ibport, 0, &gid, &gattr); if (rc || !gattr.ndev) return -ENODEV; @@ -175,6 +175,37 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; } +/* determine the gid for an ib-device port and vlan id */ +int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, + unsigned short vlan_id, u8 gid[], u8 *sgid_index) +{ + struct 
ib_gid_attr gattr; + union ib_gid _gid; + int i; + + for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + memset(&_gid, 0, SMC_GID_SIZE); + memset(&gattr, 0, sizeof(gattr)); + if (ib_query_gid(smcibdev->ibdev, ibport, i, &_gid, &gattr)) + continue; + if (!gattr.ndev) + continue; + if (((!vlan_id && !is_vlan_dev(gattr.ndev)) || + (vlan_id && is_vlan_dev(gattr.ndev) && + vlan_dev_vlan_id(gattr.ndev) == vlan_id)) && + gattr.gid_type == IB_GID_TYPE_IB) { + if (gid) + memcpy(gid, &_gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = i; + dev_put(gattr.ndev); + return 0; + } + dev_put(gattr.ndev); + } + return -ENODEV; +} + static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) { int rc; @@ -186,7 +217,7 @@ static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) if (rc) goto out; /* the SMC protocol requires specification of the RoCE MAC address */ - rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); + rc = smc_ib_fill_mac(smcibdev, ibport); if (rc) goto out; if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 7c1223c91229..bac7fd65a4c0 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -40,7 +40,6 @@ struct smc_ib_device { /* ib-device infos for smc */ struct tasklet_struct recv_tasklet; /* called by recv cq handler */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ - union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; /* pnetid per port */ u8 initialized : 1; /* ib dev CQ, evthdl done */ @@ -77,4 +76,6 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); +int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, + unsigned short vlan_id, u8 gid[], u8 *sgid_index); #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 
f2ba99c2e69a..a88c01029fa6 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -203,8 +203,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, confllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - memcpy(confllc->sender_gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); + memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); @@ -241,8 +240,7 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, /* prepare an add link message */ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], - union ib_gid *gid, + struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp) { memset(addllc, 0, sizeof(*addllc)); @@ -259,8 +257,7 @@ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, } /* send ADD LINK request or response */ -int smc_llc_send_add_link(struct smc_link *link, u8 mac[], - union ib_gid *gid, +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp) { struct smc_llc_msg_add_link *addllc; @@ -423,14 +420,12 @@ static void smc_llc_rx_add_link(struct smc_link *link, if (lgr->role == SMC_SERV) { smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_REQ); + link->gid, SMC_LLC_REQ); } else { smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], - &link->smcibdev->gid[link->ibport - 1], - SMC_LLC_RESP); + link->gid, SMC_LLC_RESP); } smc_llc_send_message(link, llc, sizeof(*llc)); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9a29fcbbcea8..95a7f3662e59 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -38,7 +38,7 @@ enum smc_llc_msg_type { /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, 
enum smc_llc_reqresp reqresp); -int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 1b6c066d3495..01c6ce042a1c 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -535,11 +535,13 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) } /* Determine the corresponding IB device port based on the hardware PNETID. - * Searching stops at the first matching active IB device port. + * Searching stops at the first matching active IB device port with vlan_id + * configured. */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_ib_device **smcibdev, - u8 *ibport) + u8 *ibport, unsigned short vlan_id, + u8 gid[]) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smc_ib_device *ibdev; @@ -553,15 +555,20 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, spin_lock(&smc_ib_devices.lock); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid, SMC_MAX_PNETID_LEN) && - smc_ib_port_active(ibdev, i)) { + smc_ib_port_active(ibdev, i) && + !smc_ib_determine_gid(ibdev, i, vlan_id, gid, + NULL)) { *smcibdev = ibdev; *ibport = i; - break; + goto out; } } } +out: spin_unlock(&smc_ib_devices.lock); } @@ -589,7 +596,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, /* Lookup of coupled ib_device via SMC pnet table */ static void smc_pnet_find_roce_by_table(struct net_device *netdev, struct smc_ib_device **smcibdev, - u8 *ibport) + u8 *ibport, unsigned short vlan_id, + u8 gid[]) { struct smc_pnetentry *pnetelem; @@ -597,7 +605,10 @@ static void smc_pnet_find_roce_by_table(struct net_device *netdev, 
list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { if (netdev == pnetelem->ndev) { if (smc_ib_port_active(pnetelem->smcibdev, - pnetelem->ib_port)) { + pnetelem->ib_port) && + !smc_ib_determine_gid(pnetelem->smcibdev, + pnetelem->ib_port, vlan_id, + gid, NULL)) { *smcibdev = pnetelem->smcibdev; *ibport = pnetelem->ib_port; } @@ -612,7 +623,8 @@ static void smc_pnet_find_roce_by_table(struct net_device *netdev, * ethernet interface. */ void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport) + struct smc_ib_device **smcibdev, u8 *ibport, + unsigned short vlan_id, u8 gid[]) { struct dst_entry *dst = sk_dst_get(sk); @@ -625,12 +637,12 @@ void smc_pnet_find_roce_resource(struct sock *sk, goto out_rel; /* if possible, lookup via hardware-defined pnetid */ - smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport); + smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid); if (*smcibdev) goto out_rel; /* lookup via SMC PNET table */ - smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport); + smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport, vlan_id, gid); out_rel: dst_release(dst); diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 1e94fd4df7bc..8ff777636e32 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -33,7 +33,8 @@ int smc_pnet_init(void) __init; void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport); + struct smc_ib_device **smcibdev, u8 *ibport, + unsigned short vlan_id, u8 gid[]); void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); #endif -- cgit v1.2.3 From 603cc1498455cf57f5ca4483b600efb37ea2c56c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 25 Jul 2018 16:35:32 +0200 Subject: net/smc: provide fallback reason code Remember the fallback reason code and the peer diagnosis code for smc sockets, and provide them 
in smc_diag.c to the netlink interface. And add more detailed reason codes. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/uapi/linux/smc_diag.h | 6 +++++ net/smc/af_smc.c | 52 +++++++++++++++++++++++++------------------ net/smc/smc.h | 2 ++ net/smc/smc_clc.c | 6 ++++- net/smc/smc_clc.h | 18 ++++++++++----- net/smc/smc_diag.c | 6 +++++ 6 files changed, 61 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 48ae3ee22b2d..ac9e8c96d9bd 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -43,6 +43,7 @@ enum { SMC_DIAG_LGRINFO, SMC_DIAG_SHUTDOWN, SMC_DIAG_DMBINFO, + SMC_DIAG_FALLBACK, __SMC_DIAG_MAX, }; @@ -92,6 +93,11 @@ struct smc_diag_lgrinfo { __u8 role; }; +struct smc_diag_fallback { + __u32 reason; + __u32 peer_diagnosis; +}; + struct smcd_diag_dmbinfo { /* SMC-D Socket internals */ __u32 linkid; /* Link identifier */ __u64 peer_gid; /* Peer GID */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b81797103260..fce7e4751151 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -344,17 +344,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) rc = smc_ib_modify_qp_rts(link); if (rc) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_RDYLNK; smc_wr_remember_qp_attr(link); if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_CL; /* receive ADD LINK request from server over RoCE fabric */ rest = wait_for_completion_interruptible_timeout(&link->llc_add, @@ -372,7 +372,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) link->smcibdev->mac[link->ibport - 1], link->gid, SMC_LLC_RESP); if (rc < 0) - return SMC_CLC_DECL_TCL; + return 
SMC_CLC_DECL_TIMEOUT_AL; smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); @@ -424,9 +424,10 @@ static void smc_link_save_peer_info(struct smc_link *link, } /* fall back during connect */ -static int smc_connect_fallback(struct smc_sock *smc) +static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { smc->use_fallback = true; + smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -443,7 +444,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) sock_put(&smc->sk); /* passive closing */ return reason_code; } - if (reason_code != SMC_CLC_DECL_REPLY) { + if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code); if (rc < 0) { if (smc->sk.sk_state == SMC_INIT) @@ -451,7 +452,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) return rc; } } - return smc_connect_fallback(smc); + return smc_connect_fallback(smc, reason_code); } /* abort connecting */ @@ -568,7 +569,7 @@ static int smc_connect_rdma(struct smc_sock *smc, smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, aclc)) - return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, local_contact); smc_close_init(smc); @@ -576,12 +577,12 @@ static int smc_connect_rdma(struct smc_sock *smc, if (local_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) - return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, local_contact); } else { if (!smc->conn.rmb_desc->reused && smc_reg_rmb(link, smc->conn.rmb_desc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, local_contact); } smc_rmb_sync_sg_for_device(&smc->conn); @@ -659,11 +660,11 @@ static int __smc_connect(struct smc_sock *smc) sock_hold(&smc->sk); /* sock put in passive 
closing */ if (smc->use_fallback) - return smc_connect_fallback(smc); + return smc_connect_fallback(smc, smc->fallback_rsn); /* if peer has not signalled SMC-capability, fall back */ if (!tcp_sk(smc->clcsock->sk)->syn_smc) - return smc_connect_fallback(smc); + return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(smc)) @@ -693,7 +694,7 @@ static int __smc_connect(struct smc_sock *smc) /* if neither ISM nor RDMA are supported, fallback */ if (!rdma_supported && !ism_supported) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); /* perform CLC handshake */ rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); @@ -708,7 +709,7 @@ static int __smc_connect(struct smc_sock *smc) else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) rc = smc_connect_ism(smc, &aclc, ismdev); else - rc = SMC_CLC_DECL_CNFERR; + rc = SMC_CLC_DECL_MODEUNSUPP; if (rc) { smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); @@ -946,12 +947,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) link = &lgr->lnk[SMC_SINGLE_LINK]; if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ rest = wait_for_completion_interruptible_timeout( @@ -973,7 +974,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) link->smcibdev->mac[link->ibport - 1], link->gid, SMC_LLC_REQ); if (rc < 0) - return SMC_CLC_DECL_TCL; + return SMC_CLC_DECL_TIMEOUT_AL; /* receive ADD LINK response from client over the RoCE fabric */ rest = 
wait_for_completion_interruptible_timeout(&link->llc_add_resp, @@ -1048,7 +1049,8 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, } smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; - if (reason_code && reason_code != SMC_CLC_DECL_REPLY) { + new_smc->fallback_rsn = reason_code; + if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code) < 0) { smc_listen_out_err(new_smc); return; @@ -1139,7 +1141,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) if (local_contact != SMC_FIRST_CONTACT) { if (!new_smc->conn.rmb_desc->reused) { if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_INTERR; + return SMC_CLC_DECL_ERR_REGRMB; } } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1159,13 +1161,13 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc, smc_link_save_peer_info(link, cclc); if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { - reason_code = SMC_CLC_DECL_INTERR; + reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } if (local_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) { - reason_code = SMC_CLC_DECL_INTERR; + reason_code = SMC_CLC_DECL_ERR_RDYLNK; goto decline; } /* QP confirmation over RoCE fabric */ @@ -1206,6 +1208,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { new_smc->use_fallback = true; + new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; smc_listen_out_connected(new_smc); return; } @@ -1250,7 +1253,8 @@ static void smc_listen_work(struct work_struct *work) smc_listen_rdma_reg(new_smc, local_contact))) { /* SMC not supported, decline */ mutex_unlock(&smc_create_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); + smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, + local_contact); return; } @@ -1297,6 +1301,7 @@ static void smc_tcp_listen_work(struct work_struct *work) 
new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; + new_smc->fallback_rsn = lsmc->fallback_rsn; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); @@ -1451,6 +1456,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT) { smc->use_fallback = true; + smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; goto out; @@ -1648,6 +1654,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* option not supported by SMC */ if (sk->sk_state == SMC_INIT) { smc->use_fallback = true; + smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { if (!smc->use_fallback) rc = -EINVAL; @@ -1885,6 +1892,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ + smc->fallback_rsn = 0; rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index be20acd7b5ab..08786ace6010 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -208,6 +208,8 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ + int fallback_rsn; /* reason for fallback */ + u32 peer_diagnosis; /* decline reason from peer */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 78d74938a9d9..83aba9ade060 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -334,7 +334,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, goto out; } if (clcm->type == SMC_CLC_DECLINE) { - reason_code = SMC_CLC_DECL_REPLY; + struct smc_clc_msg_decline 
*dclc; + + dclc = (struct smc_clc_msg_decline *)clcm; + reason_code = SMC_CLC_DECL_PEERDECL; + smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = 1; smc_lgr_terminate(smc->conn.lgr); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 6bdc63352d6a..18da89b681c2 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -28,15 +28,21 @@ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ -#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ +#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ +#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ -#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */ +#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ +#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */ +#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ +#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ +#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ -#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */ +#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ -#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */ -#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */ -#define SMC_CLC_DECL_RMBE_EC 0x08000000 /* peer has eyecatcher in RMBE */ +#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */ +#define 
SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index a3cf7313a2d3..dbf64a93d68a 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -79,6 +79,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct nlattr *bc) { struct smc_sock *smc = smc_sk(sk); + struct smc_diag_fallback fallback; struct user_namespace *user_ns; struct smc_diag_msg *r; struct nlmsghdr *nlh; @@ -101,6 +102,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; + fallback.reason = smc->fallback_rsn; + fallback.peer_diagnosis = smc->peer_diagnosis; + if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) + goto errout; + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; -- cgit v1.2.3 From 0d18a0cb4b1585d9e5a3b300d5df9ed866561ffb Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 25 Jul 2018 16:35:33 +0200 Subject: net/smc: improve delete link processing Send an orderly DELETE LINK request before termination of a link group, add support for client triggered DELETE LINK processing. And send a disorderly DELETE LINK before module is unloaded. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- net/smc/smc_core.h | 4 +++- net/smc/smc_llc.c | 30 +++++++++++++++++++----------- net/smc/smc_llc.h | 3 ++- net/smc/smc_wr.c | 7 ++----- 5 files changed, 68 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 90c10ae9ae09..a46418f45ecd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -30,6 +30,7 @@ #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) +#define SMC_LGR_FREE_DELAY_FAST (8 * HZ) static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), @@ -51,6 +52,11 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } +void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) +{ + mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); +} + /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock @@ -133,6 +139,20 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } +/* Send delete link, either as client to request the initiation + * of the DELETE LINK sequence from server; or as server to + * initiate the delete processing. See smc_llc_rx_delete_link(). 
+ */ +static int smc_link_send_delete(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_ACTIVE && + !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { + smc_llc_link_deleting(lnk); + return 0; + } + return -ENOTCONN; +} + static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), @@ -153,10 +173,21 @@ static void smc_lgr_free_work(struct work_struct *work) list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); + + if (!lgr->is_smcd && !lgr->terminating) { + /* try to send del link msg, on error free lgr immediately */ + if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) { + /* reschedule in case we never receive a response */ + smc_lgr_schedule_free_work(lgr); + return; + } + } + if (!delayed_work_pending(&lgr->free_work)) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); smc_lgr_free(lgr); } } @@ -984,8 +1015,14 @@ void smc_core_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); - if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + if (lnk->state == SMC_LNK_ACTIVE) + smc_llc_send_delete_link(lnk, SMC_LLC_REQ, + false); + smc_llc_link_inactive(lnk); + } cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index a4f0cc4e0270..c156674733c9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -34,7 +34,8 @@ enum smc_lgr_role { /* possible roles of a link group */ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ 
SMC_LNK_ACTIVATING, /* link is being activated */ - SMC_LNK_ACTIVE /* link is active */ + SMC_LNK_ACTIVE, /* link is active */ + SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ @@ -265,6 +266,7 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, u64 peer_gid); void smcd_conn_free(struct smc_connection *conn); +void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a88c01029fa6..9c916c709ca7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -278,7 +278,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], /* prepare a delete link message */ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, struct smc_link *link, - enum smc_llc_reqresp reqresp) + enum smc_llc_reqresp reqresp, bool orderly) { memset(delllc, 0, sizeof(*delllc)); delllc->hd.common.type = SMC_LLC_DELETE_LINK; @@ -287,13 +287,14 @@ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, delllc->hd.flags |= SMC_LLC_FLAG_RESP; /* DEL_LINK_ALL because only 1 link supported */ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; delllc->link_num = link->link_id; } /* send DELETE LINK request or response */ int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp) + enum smc_llc_reqresp reqresp, bool orderly) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; @@ -304,7 +305,7 @@ int smc_llc_send_delete_link(struct smc_link *link, if (rc) return rc; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp); + 
smc_llc_prep_delete_link(delllc, link, reqresp, orderly); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -438,17 +439,19 @@ static void smc_llc_rx_delete_link(struct smc_link *link, if (llc->hd.flags & SMC_LLC_FLAG_RESP) { if (lgr->role == SMC_SERV) - smc_lgr_terminate(lgr); + smc_lgr_schedule_free_work_fast(lgr); } else { + smc_lgr_forget(lgr); + smc_llc_link_deleting(link); if (lgr->role == SMC_SERV) { - smc_lgr_forget(lgr); - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ); - smc_llc_send_message(link, llc, sizeof(*llc)); + /* client asks to delete this link, send request */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); } else { - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP); - smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_terminate(lgr); + /* server requests to delete this link, send response */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } + smc_llc_send_message(link, llc, sizeof(*llc)); + smc_lgr_schedule_free_work_fast(lgr); } } @@ -622,6 +625,11 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) } } +void smc_llc_link_deleting(struct smc_link *link) +{ + link->state = SMC_LNK_DELETING; +} + /* called in tasklet context */ void smc_llc_link_inactive(struct smc_link *link) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 95a7f3662e59..9e2ff088e301 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -41,9 +41,10 @@ int smc_llc_send_confirm_link(struct smc_link *lnk, int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp); + enum smc_llc_reqresp reqresp, bool orderly); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int 
smc_llc_do_confirm_rkey(struct smc_link *link, diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index b6df69756bef..f856b8402b3f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -182,17 +182,14 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, if (rc) return rc; } else { - struct smc_link_group *lgr; - - lgr = smc_get_lgr(link); rc = wait_event_timeout( link->wr_tx_wait, - list_empty(&lgr->list) || /* lgr terminated */ + link->state == SMC_LNK_INACTIVE || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(lgr); + smc_lgr_terminate(smc_get_lgr(link)); return -EPIPE; } if (idx == link->wr_tx_cnt) -- cgit v1.2.3 From ba7d7e2677c0953b251c36588b15f5f442e59c84 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 26 Jul 2018 00:20:08 +0200 Subject: net/rds/Kconfig: RDS should depend on IPV6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build error, implicit declaration of function __inet6_ehashfn shows up When RDS is enabled but not IPV6. net/rds/connection.c: In function ‘rds_conn_bucket’: net/rds/connection.c:67:9: error: implicit declaration of function ‘__inet6_ehashfn’; did you mean ‘__inet_ehashfn’? [-Werror=implicit-function-declaration] hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); ^~~~~~~~~~~~~~~ __inet_ehashfn Current code adds IPV6 as a depends on in config RDS. Fixes: eee2fa6ab322 ("rds: Changing IP address internal representation to struct in6_addr") Signed-off-by: Anders Roxell Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 41f75563b54b..607128f10bcd 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,7 +1,7 @@ config RDS tristate "The RDS Protocol" - depends on INET + depends on INET && CONFIG_IPV6 ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband or TCP. -- cgit v1.2.3 From dc66fe43b7ebdb53628dcbc1f8f15de3e000aacf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 25 Jul 2018 10:22:27 -0500 Subject: rds: send: Fix dead code in rds_sendmsg Currently, code at label *out* is unreachable. Fix this by updating variable *ret* with -EINVAL, so the jump to *out* can be properly executed instead of directly returning from function. Addresses-Coverity-ID: 1472059 ("Structurally dead code") Fixes: 1e2b44e78eea ("rds: Enable RDS IPv6 support") Signed-off-by: Gustavo A. R. Silva Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 9604e1faa564..18e2b4d3931f 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1126,7 +1126,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || IN_MULTICAST(ntohl(addr4))) { - return -EINVAL; + ret = -EINVAL; goto out; } } -- cgit v1.2.3 From 990e35ecba1cb8ebee4ad4a028735e24f4615417 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 23 Jul 2018 17:08:00 -0700 Subject: cbs: Add support for the graft function This will allow to install a child qdisc under cbs. The main use case is to install ETF (Earliest TxTime First) qdisc under cbs, so there's another level of control for time-sensitive traffic. 
Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 125 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index cdd96b9a27bc..e26a24017faa 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -78,18 +78,42 @@ struct cbs_sched_data { s64 sendslope; /* in bytes/s */ s64 idleslope; /* in bytes/s */ struct qdisc_watchdog watchdog; - int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); + struct Qdisc *qdisc; }; -static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, + struct sk_buff **to_free) { - return qdisc_enqueue_tail(skb, sch); + int err; + + err = child->ops->enqueue(skb, child, to_free); + if (err != NET_XMIT_SUCCESS) + return err; + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + + return NET_XMIT_SUCCESS; } -static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; + + return cbs_child_enqueue(skb, sch, qdisc, to_free); +} + +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; if (sch->q.qlen == 0 && q->credits > 0) { /* We need to stop accumulating credits when there's @@ -99,7 +123,7 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) q->last = ktime_get_ns(); } - return qdisc_enqueue_tail(skb, sch); + return cbs_child_enqueue(skb, sch, qdisc, to_free); } static int cbs_enqueue(struct sk_buff *skb, struct 
Qdisc *sch, @@ -107,7 +131,7 @@ static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct cbs_sched_data *q = qdisc_priv(sch); - return q->enqueue(skb, sch); + return q->enqueue(skb, sch, to_free); } /* timediff is in ns, slope is in bytes/s */ @@ -132,9 +156,25 @@ static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) return div64_s64(len * slope, port_rate); } +static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child) +{ + struct sk_buff *skb; + + skb = child->ops->dequeue(child); + if (!skb) + return NULL; + + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + return skb; +} + static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; s64 now = ktime_get_ns(); struct sk_buff *skb; s64 credits; @@ -157,8 +197,7 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) return NULL; } } - - skb = qdisc_dequeue_head(sch); + skb = cbs_child_dequeue(sch, qdisc); if (!skb) return NULL; @@ -178,7 +217,10 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) { - return qdisc_dequeue_head(sch); + struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; + + return cbs_child_dequeue(sch, qdisc); } static struct sk_buff *cbs_dequeue(struct Qdisc *sch) @@ -310,6 +352,13 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } + q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle, extack); + if (!q->qdisc) + return -ENOMEM; + + qdisc_hash_add(q->qdisc, false); + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); q->enqueue = cbs_enqueue_soft; @@ -328,6 +377,9 @@ static void cbs_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); + + if (q->qdisc) + qdisc_destroy(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct 
sk_buff *skb) @@ -356,8 +408,72 @@ nla_put_failure: return -1; } +static int cbs_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (cl != 1 || !q->qdisc) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old, struct netlink_ext_ack *extack) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle, NULL); + if (!new) + new = &noop_qdisc; + } + + *old = qdisc_replace(sch, new, &q->qdisc); + return 0; +} + +static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long cbs_find(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) { + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } +} + +static const struct Qdisc_class_ops cbs_class_ops = { + .graft = cbs_graft, + .leaf = cbs_leaf, + .find = cbs_find, + .walk = cbs_walk, + .dump = cbs_dump_class, +}; + static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .id = "cbs", + .cl_ops = &cbs_class_ops, .priv_size = sizeof(struct cbs_sched_data), .enqueue = cbs_enqueue, .dequeue = cbs_dequeue, -- cgit v1.2.3 From 201876b33c09edcb6c2914f0ced798437a102648 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Tue, 24 Jul 2018 16:54:27 +0530 Subject: net/tls: Removed redundant checks for non-NULL Removed checks against non-NULL before calling kfree_skb() and crypto_free_aead(). These functions are safe to be called with NULL as an argument. 
Signed-off-by: Vakul Garg Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 03f1370f5db1..0687a7a4689f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1047,8 +1047,7 @@ void tls_sw_free_resources_tx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - if (ctx->aead_send) - crypto_free_aead(ctx->aead_send); + crypto_free_aead(ctx->aead_send); tls_free_both_sg(sk); kfree(ctx); @@ -1060,10 +1059,8 @@ void tls_sw_release_resources_rx(struct sock *sk) struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); if (ctx->aead_recv) { - if (ctx->recv_pkt) { - kfree_skb(ctx->recv_pkt); - ctx->recv_pkt = NULL; - } + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; crypto_free_aead(ctx->aead_recv); strp_stop(&ctx->strp); write_lock_bh(&sk->sk_callback_lock); -- cgit v1.2.3 From 63135ee0a6e5f5a5ad1345e48099e62d3d617a81 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 25 Jul 2018 18:00:49 +0800 Subject: tipc: add missing dev_put() on error in tipc_enable_l2_media When tipc_own_id() fails to obtain the node identity, dev_put() should be called before returning -EINVAL. Fixes: 682cd3cf946b ("tipc: confgiure and apply UDP bearer MTU on running links") Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index fd6d8f18955c..418f03d0be90 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -395,6 +395,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, tipc_net_init(net, node_id, 0); } if (!tipc_own_id(net)) { + dev_put(dev); pr_warn("Failed to obtain node identity\n"); return -EINVAL; } -- cgit v1.2.3 From 2b139e6b1ec86e1d3646039965dd79ad05d8dca4 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 25 Jul 2018 14:53:33 +0200 Subject: l2tp: remove ->recv_payload_hook The tunnel reception hook is only used by l2tp_ppp for skipping PPP framing bytes. This is a session specific operation, but once a PPP session sets ->recv_payload_hook on its tunnel, all frames received by the tunnel will enter pppol2tp_recv_payload_hook(), including those targeted at Ethernet sessions (an L2TPv3 tunnel can multiplex PPP and Ethernet sessions). So this mechanism is wrong, and uselessly complex. Let's just move this functionality to the pppol2tp rx handler and drop ->recv_payload_hook. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.c | 16 ++++------------ net/l2tp/l2tp_core.h | 3 +-- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 3 +-- net/l2tp/l2tp_ppp.c | 33 +++++++++++---------------------- 5 files changed, 18 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index c8fc0f7f0b4b..d10f4ed52d92 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -619,7 +619,7 @@ discard: */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, - int length, int (*payload_hook)(struct sk_buff *skb)) + int length) { struct l2tp_tunnel *tunnel = session->tunnel; int offset; @@ -740,13 +740,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, __skb_pull(skb, offset); - /* If caller wants to process the payload before we queue the - * packet, do so now. - */ - if (payload_hook) - if ((*payload_hook)(skb)) - goto discard; - /* Prepare skb for adding to the session's reorder_q. Hold * packets for max reorder_timeout or 1 second if not * reordering. @@ -800,8 +793,7 @@ static int l2tp_session_queue_purge(struct l2tp_session *session) * Returns 1 if the packet was not a good data packet and could not be * forwarded. All such packets are passed up to userspace to deal with. 
*/ -static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, - int (*payload_hook)(struct sk_buff *skb)) +static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) { struct l2tp_session *session = NULL; unsigned char *ptr, *optr; @@ -892,7 +884,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, goto error; } - l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); + l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); return 0; @@ -921,7 +913,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n", tunnel->name, skb->len); - if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook)) + if (l2tp_udp_recv_core(tunnel, skb)) goto pass_up; return 0; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a5c09d3a5698..d85fde793a8c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -180,7 +180,6 @@ struct l2tp_tunnel { struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; - int (*recv_payload_hook)(struct sk_buff *skb); void (*old_sk_destruct)(struct sock *); struct sock *sock; /* Parent socket */ int fd; /* Parent fd, if tunnel socket @@ -232,7 +231,7 @@ int l2tp_session_delete(struct l2tp_session *session); void l2tp_session_free(struct l2tp_session *session); void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, - int length, int (*payload_hook)(struct sk_buff *skb)); + int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); void l2tp_session_set_header_len(struct l2tp_session *session, int version); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index a9c05b2bc1b0..0bc39cc20a3f 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -165,7 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) print_hex_dump_bytes("", 
DUMP_PREFIX_OFFSET, ptr, length); } - l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 672e5b753738..42f828cf62fb 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -178,8 +178,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); } - l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, - tunnel->recv_payload_hook); + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 9ac02c93df98..000c9829304c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -183,25 +183,6 @@ out: * Receive data handling *****************************************************************************/ -static int pppol2tp_recv_payload_hook(struct sk_buff *skb) -{ - /* Skip PPP header, if present. In testing, Microsoft L2TP clients - * don't send the PPP header (PPP header compression enabled), but - * other clients can include the header. So we cope with both cases - * here. The PPP header is always FF03 when using L2TP. - * - * Note that skb->data[] isn't dereferenced from a u16 ptr here since - * the field may be unaligned. - */ - if (!pskb_may_pull(skb, 2)) - return 1; - - if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI)) - skb_pull(skb, 2); - - return 0; -} - /* Receive message. This is the recvmsg for the PPPoL2TP socket. */ static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, @@ -248,6 +229,17 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int if (sk == NULL) goto no_sock; + /* If the first two bytes are 0xFF03, consider that it is the PPP's + * Address and Control fields and skip them. 
The L2TP module has always + * worked this way, although, in theory, the use of these fields should + * be negotiated and handled at the PPP layer. These fields are + * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered + * Information command with Poll/Final bit set to zero (RFC 1662). + */ + if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS && + skb->data[1] == PPP_UI) + skb_pull(skb, 2); + if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; @@ -763,9 +755,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } - if (tunnel->recv_payload_hook == NULL) - tunnel->recv_payload_hook = pppol2tp_recv_payload_hook; - if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = info.peer_tunnel_id; -- cgit v1.2.3 From 2ed9db3074fcd8d12709fe40ff0e691d74229818 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 25 Jul 2018 09:07:24 -0500 Subject: net: sched: cls_api: fix dead code in switch Code at line 1850 is unreachable. Fix this by removing the break statement above it, so the code for case RTM_GETCHAIN can be properly executed. Addresses-Coverity-ID: 1472050 ("Structurally dead code") Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Signed-off-by: Gustavo A. R. Silva Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5f7098b5405e..f3d78c23338e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1846,7 +1846,6 @@ replay: tcf_chain_put_explicitly_created(chain); break; case RTM_GETCHAIN: - break; err = tc_chain_notify(chain, skb, n->nlmsg_seq, n->nlmsg_seq, n->nlmsg_type, true); if (err < 0) -- cgit v1.2.3 From 0a26cf3ff47d9e70fbed2fa79b0678ee70e25113 Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Wed, 25 Jul 2018 14:48:21 -0700 Subject: tls: Skip zerocopy path for ITER_KVEC The zerocopy path ultimately calls iov_iter_get_pages, which defines the step function for ITER_KVECs as simply, return -EFAULT. Taking the non-zerocopy path for ITER_KVECs avoids the unnecessary fallback. See https://lore.kernel.org/lkml/20150401023311.GL29656@ZenIV.linux.org.uk/T/#u for a discussion of why zerocopy for vmalloc data is not a good idea. Discovered while testing NBD traffic encrypted with ktls. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0687a7a4689f..f9971717f7e0 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -362,6 +362,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int record_room; bool full_record; int orig_size; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; @@ -410,8 +411,7 @@ alloc_encrypted: try_to_copy -= required_size - ctx->sg_encrypted_size; full_record = true; } - - if (full_record || eor) { + if (!is_kvec && (full_record || eor)) { ret = zerocopy_from_iter(sk, &msg->msg_iter, try_to_copy, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, @@ -779,6 +779,7 @@ int tls_sw_recvmsg(struct sock *sk, bool cmsg = false; int target, err = 0; long timeo; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; flags |= nonblock; @@ -822,7 +823,7 @@ int tls_sw_recvmsg(struct sock *sk, page_count = iov_iter_npages(&msg->msg_iter, MAX_SKB_FRAGS); to_copy = rxm->full_len - tls_ctx->rx.overhead_size; - if (to_copy <= len && page_count < MAX_SKB_FRAGS && + if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS && likely(!(flags & MSG_PEEK))) { struct scatterlist sgin[MAX_SKB_FRAGS + 1]; int pages = 0; -- cgit v1.2.3 From c921d7db3d1248c9091af070a7fdce2e55baa86a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 26 Jul 2018 18:27:58 +0200 Subject: net: sched: unmark chain as explicitly created on delete Once user manually deletes the chain using "chain del", the chain cannot be marked as explicitly created anymore. Signed-off-by: Jiri Pirko Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f3d78c23338e..75cce2819de9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1844,6 +1844,7 @@ replay: * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); + chain->explicitly_created = false; break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, -- cgit v1.2.3 From c6f5e017df9dfa9f6cbe70da008e7d716d726f1b Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 26 Jul 2018 15:09:52 +0800 Subject: xfrm: fix ptr_ret.cocci warnings net/xfrm/xfrm_interface.c:692:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Fixes: 44e2b838c24d ("xfrm: Return detailed errors from xfrmi_newlink") CC: Benedict Wong Signed-off-by: kbuild test robot Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 481d7307ab51..31acc6f33d98 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -689,10 +689,7 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); xi = xfrmi_locate(net, p, 1); - if (IS_ERR(xi)) - return PTR_ERR(xi); - - return 0; + return PTR_ERR_OR_ZERO(xi); } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) -- cgit v1.2.3 From 1f3ed383fb9a073ae2e408cd7a0717b04c7c3a21 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 27 Jul 2018 09:45:05 +0200 Subject: net: sched: don't dump chains only held by actions In case a chain is empty and not explicitly created by a user, such chain should not exist. The only exception is if there is an action "goto chain" pointing to it. 
In that case, don't show the chain in the dump. Track the chain references held by actions and use them to find out if a chain should or should not be shown in chain dump. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++ include/net/sch_generic.h | 1 + net/sched/act_api.c | 4 +-- net/sched/cls_api.c | 70 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 64 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3101582f642..6d02f31abba8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, + u32 chain_index); void tcf_chain_put(struct tcf_chain *chain); +void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 085c509c8674..c5432362dc26 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + unsigned int action_refcnt; bool explicitly_created; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 148a89ab789b..b43df1e25c6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) if (!tp) return -EINVAL; - a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); + a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!a->goto_chain) return -ENOMEM; 
return 0; @@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) static void tcf_action_goto_chain_fini(struct tc_action *a) { - tcf_chain_put(a->goto_chain); + tcf_chain_put_by_act(a->goto_chain); } static void tcf_action_goto_chain_exec(const struct tc_action *a, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 75cce2819de9..e20aad1987b8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } +static void tcf_chain_hold_by_act(struct tcf_chain *chain) +{ + ++chain->action_refcnt; +} + +static void tcf_chain_release_by_act(struct tcf_chain *chain) +{ + --chain->action_refcnt; +} + +static bool tcf_chain_is_zombie(struct tcf_chain *chain) +{ + /* In case all the references are action references, this + * chain is a zombie and should not be listed in the chain + * dump list. + */ + return chain->refcnt == chain->action_refcnt; +} + static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); + + tcf_chain_hold_by_act(chain); + return chain; +} +EXPORT_SYMBOL(tcf_chain_get_by_act); + static void tc_chain_tmplt_del(struct tcf_chain *chain); void tcf_chain_put(struct tcf_chain *chain) @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); +void tcf_chain_put_by_act(struct tcf_chain *chain) +{ + tcf_chain_release_by_act(chain); + tcf_chain_put(chain); +} +EXPORT_SYMBOL(tcf_chain_put_by_act); + static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { if (chain->explicitly_created) @@ -1803,20 +1838,29 @@ replay: chain = tcf_chain_lookup(block, chain_index); if 
(n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; - } - if (!(n->nlmsg_flags & NLM_F_CREATE)) { - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; - } - chain = tcf_chain_create(block, chain_index); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + if (tcf_chain_is_zombie(chain)) { + /* The chain exists only because there is + * some action referencing it, meaning it + * is a zombie. + */ + tcf_chain_hold(chain); + } else { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + } else { + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } } } else { - if (!chain) { + if (!chain || tcf_chain_is_zombie(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); return -EINVAL; } @@ -1944,6 +1988,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; continue; } + if (tcf_chain_is_zombie(chain)) + continue; err = tc_chain_fill_node(chain, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, -- cgit v1.2.3 From 08193d1a893c802c4b807e4d522865061f4e9f4f Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 27 Jul 2018 15:26:55 +0300 Subject: net: dcb: For wild-card lookups, use priority -1, not 0 The function dcb_app_lookup walks the list of specified DCB APP entries, looking for one that matches a given criteria: ifindex, selector, protocol ID and optionally also priority. The "don't care" value for priority is set to 0, because that priority has not been allowed under CEE regime, which predates the IEEE standardization. 
Under IEEE, 0 is a valid priority number. But because dcb_app_lookup considers zero a wild card, attempts to add an APP entry with priority 0 fail when other entries exist for a given ifindex / selector / PID triplet. Fix by changing the wild-card value to -1. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 2589a6b78aa1..013fdb6fa07a 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1786,7 +1786,7 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && - (!prio || itr->app.priority == prio)) + ((prio == -1) || itr->app.priority == prio)) return itr; } @@ -1821,7 +1821,8 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) u8 prio = 0; spin_lock_bh(&dcb_lock); - if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) + itr = dcb_app_lookup(app, dev->ifindex, -1); + if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); @@ -1849,7 +1850,8 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ - if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) { + itr = dcb_app_lookup(new, dev->ifindex, -1); + if (itr) { if (new->priority) itr->app.priority = new->priority; else { @@ -1882,7 +1884,8 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) u8 prio = 0; spin_lock_bh(&dcb_lock); - if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) + itr = dcb_app_lookup(app, dev->ifindex, -1); + if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); -- cgit v1.2.3 From b67c540b8a987e365dc548e5b2ddf023946e3d63 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 27 Jul 2018 15:26:56 +0300 Subject: net: dcb: Add priority-to-DSCP map getters On ingress, a 
network device such as a switch assigns to packets priority based on various criteria. Common options include interpreting PCP and DSCP fields according to user configuration. When a packet egresses the switch, a reverse process may rewrite PCP and/or DSCP values according to packet priority. The following three functions support a) obtaining a DSCP-to-priority map or vice versa, and b) finding default-priority entries in APP database. The DCB subsystem supports for APP entries a very generous M:N mapping between priorities and protocol identifiers. Understandably, several (say) DSCP values can map to the same priority. But this asymmetry holds the other way around as well--one priority can map to several DSCP values. For this reason, the following functions operate in terms of bitmaps, with ones in positions that match some APP entry. - dcb_ieee_getapp_dscp_prio_mask_map() to compute for a given netdevice a map of DSCP-to-priority-mask, which gives for each DSCP value a bitmap of priorities related to that DSCP value by APP, along the lines of dcb_ieee_getapp_mask(). - dcb_ieee_getapp_prio_dscp_mask_map() similarly to compute for a given netdevice a map from priorities to a bitmap of DSCPs. - dcb_ieee_getapp_default_prio_mask() which finds all default-priority rules for a given port in APP database, and returns a mask of priorities allowed by these default-priority rules. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/dcbnl.h | 13 ++++++++ net/dcb/dcbnl.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'net') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 0e5e91be2d30..e22a8a3c089b 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -34,6 +34,19 @@ int dcb_ieee_setapp(struct net_device *, struct dcb_app *); int dcb_ieee_delapp(struct net_device *, struct dcb_app *); u8 dcb_ieee_getapp_mask(struct net_device *, struct dcb_app *); +struct dcb_ieee_app_prio_map { + u64 map[IEEE_8021QAZ_MAX_TCS]; +}; +void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map); + +struct dcb_ieee_app_dscp_map { + u8 map[64]; +}; +void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, + struct dcb_ieee_app_dscp_map *p_map); +u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev); + int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 pid); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 013fdb6fa07a..a556cd708885 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1958,6 +1958,92 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); +/** + * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from + * priorities to the DSCP values assigned to that priority. Initialize p_map + * such that each map element holds a bit mask of DSCP values configured for + * that priority by APP entries. 
+ */ +void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1ULL << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); + +/** + * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from + * DSCP values to the priorities assigned to that DSCP value. Initialize p_map + * such that each map element holds a bit mask of priorities configured for a + * given DSCP value by APP entries. + */ +void +dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, + struct dcb_ieee_app_dscp_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) + p_map->map[itr->app.protocol] |= 1 << itr->app.priority; + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); + +/** + * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet + * type, with valid PID values >= 1536. A special meaning is then assigned to + * protocol value of 0: "default priority. For use when priority is not + * otherwise specified". + * + * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries + * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default + * priorities set by these entries. 
+ */ +u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 mask = 0; + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && + itr->app.protocol == 0 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) + mask |= 1 << itr->app.priority; + } + spin_unlock_bh(&dcb_lock); + + return mask; +} +EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); + static int __init dcbnl_init(void) { INIT_LIST_HEAD(&dcb_app_list); -- cgit v1.2.3 From 3e4e36436047155f67eafffe3062a09db1dff8df Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 27 Jul 2018 15:18:49 +0200 Subject: net/rds/Kconfig: Correct the RDS depends Remove prefix 'CONFIG_' from CONFIG_IPV6 Fixes: ba7d7e2677c0 ("net/rds/Kconfig: RDS should depend on IPV6") Reported-by: Eric Dumazet Signed-off-by: Anders Roxell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/rds/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 607128f10bcd..4c7f2595d919 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,7 +1,7 @@ config RDS tristate "The RDS Protocol" - depends on INET && CONFIG_IPV6 + depends on INET && IPV6 ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband or TCP. -- cgit v1.2.3 From 3ae5536b808dced0af5b2e6768a41862620c779d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Jul 2018 10:59:57 +0200 Subject: l2tp: ignore L2TP_ATTR_DATA_SEQ netlink attribute The value of this attribute is never used. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/uapi/linux/l2tp.h | 7 ++++--- net/l2tp/l2tp_core.h | 8 -------- net/l2tp/l2tp_debugfs.c | 4 +--- net/l2tp/l2tp_netlink.c | 6 ------ 4 files changed, 5 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 7d570c7bd117..ae888606b3ec 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -65,9 +65,9 @@ struct sockaddr_l2tpip6 { * TUNNEL_MODIFY - CONN_ID, udpcsum * TUNNEL_GETSTATS - CONN_ID, (stats) * TUNNEL_GET - CONN_ID, (...) - * SESSION_CREATE - SESSION_ID, PW_TYPE, data_seq, cookie, peer_cookie, l2spec + * SESSION_CREATE - SESSION_ID, PW_TYPE, cookie, peer_cookie, l2spec * SESSION_DELETE - SESSION_ID - * SESSION_MODIFY - SESSION_ID, data_seq + * SESSION_MODIFY - SESSION_ID * SESSION_GET - SESSION_ID, (...) * SESSION_GETSTATS - SESSION_ID, (stats) * @@ -95,7 +95,7 @@ enum { L2TP_ATTR_PW_TYPE, /* u16, enum l2tp_pwtype */ L2TP_ATTR_ENCAP_TYPE, /* u16, enum l2tp_encap_type */ L2TP_ATTR_OFFSET, /* u16 (not used) */ - L2TP_ATTR_DATA_SEQ, /* u16 */ + L2TP_ATTR_DATA_SEQ, /* u16 (not used) */ L2TP_ATTR_L2SPEC_TYPE, /* u8, enum l2tp_l2spec_type */ L2TP_ATTR_L2SPEC_LEN, /* u8 (not used) */ L2TP_ATTR_PROTO_VERSION, /* u8 */ @@ -169,6 +169,7 @@ enum l2tp_encap_type { L2TP_ENCAPTYPE_IP, }; +/* For L2TP_ATTR_DATA_SEQ. Unused. */ enum l2tp_seqmode { L2TP_SEQ_NONE = 0, L2TP_SEQ_IP = 1, diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index d85fde793a8c..7dbfb55ab3b5 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -45,10 +45,6 @@ struct l2tp_tunnel; */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; - unsigned int data_seq:2; /* data sequencing level - * 0 => none, 1 => IP only, - * 2 => all - */ unsigned int recv_seq:1; /* expect receive packets with * sequence numbers? 
*/ unsigned int send_seq:1; /* send packets with sequence @@ -99,10 +95,6 @@ struct l2tp_session { char name[32]; /* for logging */ char ifname[IFNAMSIZ]; - unsigned int data_seq:2; /* data sequencing level - * 0 => none, 1 => IP only, - * 2 => all - */ unsigned int recv_seq:1; /* expect receive packets with * sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index b5d7dde003ef..91b9248610f0 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -191,12 +191,10 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) if (session->send_seq || session->recv_seq) seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns); seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count)); - seq_printf(m, " config %d/%d/%c/%c/%s/%s %08x %u\n", + seq_printf(m, " config %d/%d/%c/%c/-/%s %08x %u\n", session->mtu, session->mru, session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', - session->data_seq == 1 ? "IPSEQ" : - session->data_seq == 2 ? "DATASEQ" : "-", session->lns_mode ? 
"LNS" : "LAC", session->debug, jiffies_to_msecs(session->reorder_timeout)); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 5b9900889e31..e4785f6966f6 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -560,9 +560,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (tunnel->version > 2) { - if (info->attrs[L2TP_ATTR_DATA_SEQ]) - cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); - if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) { cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT && @@ -693,9 +690,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_DEBUG]) session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_DATA_SEQ]) - session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); -- cgit v1.2.3 From ae51a7c6d54876c47ae53c455434023df2c19801 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Jul 2018 10:59:58 +0200 Subject: l2tp: ignore L2TP_ATTR_VLAN_ID netlink attribute The value of this attribute is never used. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 4 ++-- net/l2tp/l2tp_core.h | 1 - net/l2tp/l2tp_netlink.c | 3 --- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index ae888606b3ec..41bf79a4b165 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -60,7 +60,7 @@ struct sockaddr_l2tpip6 { /* * Commands. 
* Valid TLVs of each command are:- - * TUNNEL_CREATE - CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, udpcsum, vlanid + * TUNNEL_CREATE - CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, udpcsum * TUNNEL_DELETE - CONN_ID * TUNNEL_MODIFY - CONN_ID, udpcsum * TUNNEL_GETSTATS - CONN_ID, (stats) @@ -105,7 +105,7 @@ enum { L2TP_ATTR_SESSION_ID, /* u32 */ L2TP_ATTR_PEER_SESSION_ID, /* u32 */ L2TP_ATTR_UDP_CSUM, /* u8 */ - L2TP_ATTR_VLAN_ID, /* u16 */ + L2TP_ATTR_VLAN_ID, /* u16 (not used) */ L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags */ diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 7dbfb55ab3b5..49fd5e05538c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -54,7 +54,6 @@ struct l2tp_session_cfg { * control of LNS. */ int debug; /* bitmask of debug message * categories */ - u16 vlan_id; /* VLAN pseudowire only */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index e4785f6966f6..8ea1deefbc37 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -591,9 +591,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (info->attrs[L2TP_ATTR_IFNAME]) cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); - - if (info->attrs[L2TP_ATTR_VLAN_ID]) - cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]); } if (info->attrs[L2TP_ATTR_DEBUG]) -- cgit v1.2.3 From 1998b5ed9c9bba5369e7c3659fc8a2e468e62bea Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Jul 2018 10:59:59 +0200 Subject: l2tp: drop ->flags from struct pppol2tp_session This field is not used. Keep validating user input in PPPIOCSFLAGS. Even though we discard the value, it would look wrong to succeed if an invalid address was passed from userspace. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_ppp.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 000c9829304c..759ce8421269 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -127,8 +127,6 @@ struct pppol2tp_session { * PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ struct rcu_head rcu; /* For asynchronous release */ - int flags; /* accessed by PPPIOCGFLAGS. - * Unused. */ }; static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); @@ -1057,7 +1055,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, int err = 0; struct sock *sk; int val = (int) arg; - struct pppol2tp_session *ps = l2tp_session_priv(session); struct l2tp_tunnel *tunnel = session->tunnel; struct pppol2tp_ioc_stats stats; @@ -1134,21 +1131,15 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, case PPPIOCGFLAGS: err = -EFAULT; - if (put_user(ps->flags, (int __user *) arg)) + if (put_user(0, (int __user *)arg)) break; - - l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n", - session->name, ps->flags); err = 0; break; case PPPIOCSFLAGS: err = -EFAULT; - if (get_user(val, (int __user *) arg)) + if (get_user(val, (int __user *)arg)) break; - ps->flags = val; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n", - session->name, ps->flags); err = 0; break; -- cgit v1.2.3 From 92ea4a7eec7289468ac8de5386f4b13d9c210cb5 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 27 Jul 2018 11:00:00 +0200 Subject: l2tp: drop ->mru from struct l2tp_session This field is not used. Treat PPPIOC*MRU the same way as PPPIOC*FLAGS: "get" requests return 0, while "set" requests validate the user supplied pointer but discard its value. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/uapi/linux/l2tp.h | 2 +- net/l2tp/l2tp_core.c | 1 - net/l2tp/l2tp_core.h | 2 -- net/l2tp/l2tp_debugfs.c | 4 ++-- net/l2tp/l2tp_netlink.c | 10 +--------- net/l2tp/l2tp_ppp.c | 41 +++++------------------------------------ 6 files changed, 9 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 41bf79a4b165..8bb8c7cfabe5 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -120,7 +120,7 @@ enum { L2TP_ATTR_UDP_SPORT, /* u16 */ L2TP_ATTR_UDP_DPORT, /* u16 */ L2TP_ATTR_MTU, /* u16 */ - L2TP_ATTR_MRU, /* u16 */ + L2TP_ATTR_MRU, /* u16 (not used) */ L2TP_ATTR_STATS, /* nested */ L2TP_ATTR_IP6_SADDR, /* struct in6_addr */ L2TP_ATTR_IP6_DADDR, /* struct in6_addr */ diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d10f4ed52d92..c61a467fd9b8 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1675,7 +1675,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn session->pwtype = cfg->pw_type; session->debug = cfg->debug; session->mtu = cfg->mtu; - session->mru = cfg->mru; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 49fd5e05538c..fa5ae9432d38 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -62,7 +62,6 @@ struct l2tp_session_cfg { int reorder_timeout; /* configured reorder timeout * (in jiffies) */ int mtu; - int mru; char *ifname; }; @@ -107,7 +106,6 @@ struct l2tp_session { * (in jiffies) */ int reorder_skip; /* set if skip to next nr */ int mtu; - int mru; enum l2tp_pwtype pwtype; struct l2tp_stats stats; struct hlist_node global_hlist; /* Global hash list node */ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 91b9248610f0..aee271741f5b 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -191,8 +191,8 @@ static void 
l2tp_dfs_seq_session_show(struct seq_file *m, void *v) if (session->send_seq || session->recv_seq) seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns); seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count)); - seq_printf(m, " config %d/%d/%c/%c/-/%s %08x %u\n", - session->mtu, session->mru, + seq_printf(m, " config %d/0/%c/%c/-/%s %08x %u\n", + session->mtu, session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 8ea1deefbc37..a7c409215336 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -611,9 +611,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_MTU]) cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]); - if (info->attrs[L2TP_ATTR_MRU]) - cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]); - #ifdef CONFIG_MODULES if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { genl_unlock(); @@ -704,9 +701,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_MTU]) session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]); - if (info->attrs[L2TP_ATTR_MRU]) - session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]); - ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); @@ -737,9 +731,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) || - nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) || - (session->mru && - nla_put_u16(skb, L2TP_ATTR_MRU, session->mru))) + nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu)) goto nla_put_failure; if ((session->ifname[0] && diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 759ce8421269..44cac66284a5 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -570,10 +570,9 @@ static void 
pppol2tp_session_init(struct l2tp_session *session) if (dst) { u32 pmtu = dst_mtu(dst); - if (pmtu) { + if (pmtu) session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD; - session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; - } + dst_release(dst); } } @@ -781,7 +780,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, } else { /* Default MTU must allow space for UDP/L2TP/PPP headers */ cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; - cfg.mru = cfg.mtu; cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), @@ -885,8 +883,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, /* Default MTU values. */ if (cfg->mtu == 0) cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; - if (cfg->mru == 0) - cfg->mru = cfg->mtu; /* Allocate and initialize a new session context. */ session = l2tp_session_create(sizeof(struct pppol2tp_session), @@ -1101,34 +1097,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, break; case PPPIOCGMRU: - err = -ENXIO; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - err = -EFAULT; - if (put_user(session->mru, (int __user *) arg)) - break; - - l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n", - session->name, session->mru); - err = 0; - break; - - case PPPIOCSMRU: - err = -ENXIO; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - err = -EFAULT; - if (get_user(val, (int __user *) arg)) - break; - - session->mru = val; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n", - session->name, session->mru); - err = 0; - break; - case PPPIOCGFLAGS: err = -EFAULT; if (put_user(0, (int __user *)arg)) @@ -1136,6 +1104,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, err = 0; break; + case PPPIOCSMRU: case PPPIOCSFLAGS: err = -EFAULT; if (get_user(val, (int __user *)arg)) @@ -1723,8 +1692,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) tunnel->peer_tunnel_id, session->peer_session_id, state, user_data_ok); - 
seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n", - session->mtu, session->mru, + seq_printf(m, " %d/0/%c/%c/%s %08x %u\n", + session->mtu, session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", -- cgit v1.2.3 From 2db6dc2662bab14e59517ab4b86a164cc4d2db42 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Thu, 26 Jul 2018 19:45:10 -0700 Subject: sch_cake: Make gso-splitting configurable from userspace This patch restores cake's deployed behavior at line rate to always split gso, and makes gso splitting configurable from userspace. running cake unlimited (unshaped) at 1gigE, local traffic: no-split-gso bql limit: 131966 split-gso bql limit: ~42392-45420 On this 4 stream test splitting gso apart results in halving the observed interpacket latency at no loss in throughput. Summary of tcp_nup test run 'gso-split' (at 2018-07-26 16:03:51.824728): Ping (ms) ICMP : 0.83 0.81 ms 341 TCP upload avg : 235.43 235.39 Mbits/s 301 TCP upload sum : 941.71 941.56 Mbits/s 301 TCP upload::1 : 235.45 235.43 Mbits/s 271 TCP upload::2 : 235.45 235.41 Mbits/s 289 TCP upload::3 : 235.40 235.40 Mbits/s 288 TCP upload::4 : 235.41 235.40 Mbits/s 291 versus Summary of tcp_nup test run 'no-split-gso' (at 2018-07-26 16:37:23.563960): avg median # data pts Ping (ms) ICMP : 1.67 1.73 ms 348 TCP upload avg : 234.56 235.37 Mbits/s 301 TCP upload sum : 938.24 941.49 Mbits/s 301 TCP upload::1 : 234.55 235.38 Mbits/s 285 TCP upload::2 : 234.57 235.37 Mbits/s 286 TCP upload::3 : 234.58 235.37 Mbits/s 274 TCP upload::4 : 234.54 235.42 Mbits/s 288 Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 539c9490c308..35fc7252187c 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -80,7 +80,6 @@ #define CAKE_QUEUES (1024) #define CAKE_FLOW_MASK 63 #define CAKE_FLOW_NAT_FLAG 64 -#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */ /* struct cobalt_params - contains codel and blue parameters * @interval: codel initial drop rate @@ -2569,10 +2568,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); - if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) - q->rate_flags |= CAKE_FLAG_SPLIT_GSO; - else - q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + if (tb[TCA_CAKE_SPLIT_GSO]) { + if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + else + q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + } if (q->tins) { sch_tree_lock(sch); @@ -2608,7 +2609,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, q->target = 5000; /* 5ms: codel RFC argues * for 5 to 10% of interval */ - + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; q->cur_tin = 0; q->cur_flow = 0; -- cgit v1.2.3 From 04b9ce48ef19e09d8c65eb506b7982e99db212d7 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 27 Jul 2018 17:28:25 +0800 Subject: net: tipc: name_table: Replace GFP_ATOMIC with GFP_KERNEL in tipc_nametbl_init() tipc_nametbl_init() is never called in atomic context. It calls kzalloc() with GFP_ATOMIC, which is not necessary. GFP_ATOMIC can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. 
Miller --- net/tipc/name_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bebe88cae07b..88f027b502f6 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -735,7 +735,7 @@ int tipc_nametbl_init(struct net *net) struct name_table *nt; int i; - nt = kzalloc(sizeof(*nt), GFP_ATOMIC); + nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; -- cgit v1.2.3 From a0732548ba03c27fb42da4cf8e1eecc205760f12 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 27 Jul 2018 17:31:35 +0800 Subject: net: tipc: bcast: Replace GFP_ATOMIC with GFP_KERNEL in tipc_bcast_init() tipc_bcast_init() is never called in atomic context. It calls kzalloc() with GFP_ATOMIC, which is not necessary. GFP_ATOMIC can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- net/tipc/bcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index f3711176be45..9ee6cfea56dd 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -512,7 +512,7 @@ int tipc_bcast_init(struct net *net) struct tipc_bc_base *bb = NULL; struct tipc_link *l = NULL; - bb = kzalloc(sizeof(*bb), GFP_ATOMIC); + bb = kzalloc(sizeof(*bb), GFP_KERNEL); if (!bb) goto enomem; tn->bcbase = bb; -- cgit v1.2.3 From 5a3611efe5b3095f348c892d040202b2ae969f4e Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Thu, 26 Jul 2018 07:59:35 -0700 Subject: tls: Remove dead code in tls_sw_sendmsg tls_push_record either returns 0 on success or a negative value on failure. This patch removes code that would only be executed if tls_push_record were to return a positive value. Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f9971717f7e0..e80d70a1e138 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -423,12 +423,10 @@ alloc_encrypted: copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); - if (!ret) - continue; - if (ret < 0) + if (ret) goto send_end; + continue; - copied -= try_to_copy; fallback_to_reg_send: iov_iter_revert(&msg->msg_iter, ctx->sg_plaintext_size - orig_size); -- cgit v1.2.3 From 2da19ed3e4a87db16c0f69039da9f17a9596c350 Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Thu, 26 Jul 2018 07:59:36 -0700 Subject: tls: Fix improper revert in zerocopy_from_iter The current code is problematic because the iov_iter is reverted and never advanced in the non-error case. This patch skips the revert in the non-error case. This patch also fixes the amount by which the iov_iter is reverted. Currently, iov_iter is reverted by size, which can be greater than the amount by which the iter was actually advanced. Instead, only revert by the amount that the iter was advanced. Fixes: 4718799817c5 ("tls: Fix zerocopy_from_iter iov handling") Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e80d70a1e138..6deceb7c56ba 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -263,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge, bool revert) + bool charge) { struct page *pages[MAX_SKB_FRAGS]; @@ -312,10 +312,10 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: + if (rc) + iov_iter_revert(from, size - *size_used); *size_used = size; *pages_used = num_elem; - if (revert) - iov_iter_revert(from, size); return rc; } @@ -417,7 +417,7 @@ alloc_encrypted: &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true, false); + true); if (ret) goto fallback_to_reg_send; @@ -428,8 +428,6 @@ alloc_encrypted: continue; fallback_to_reg_send: - iov_iter_revert(&msg->msg_iter, - ctx->sg_plaintext_size - orig_size); trim_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, @@ -834,7 +832,7 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], - MAX_SKB_FRAGS, false, true); + MAX_SKB_FRAGS, false); if (err < 0) goto fallback_to_reg_recv; -- cgit v1.2.3 From d0c1f01138c4b7e532889474e3f2a485546d7270 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Wed, 25 Jul 2018 13:19:13 +0200 Subject: net/ipv6: allow any source address for sendmsg pktinfo with ip_nonlocal_bind When freebind feature is set on an IPv6 socket, any source address can be used when sending UDP datagrams using IPv6 PKTINFO ancillary message. Global non-local bind feature was added in commit 35a256fee52c ("ipv6: Nonlocal bind") for IPv6. This commit also allows IPv6 source address spoofing when non-local bind feature is enabled. 
Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- net/ipv6/datagram.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5a094f58fe8a..f0264dfd38de 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -803,7 +803,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + if (!(net->ipv6.sysctl.ip_nonlocal_bind || + inet_sk(sk)->freebind || inet_sk(sk)->transparent) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && -- cgit v1.2.3 From 5cbf777cfdf6e5a7b7149006e4881a255da78fdd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 27 Jul 2018 16:37:28 +0800 Subject: route: add support for directed broadcast forwarding This patch implements the feature described in rfc1812#section-5.3.5.2 and rfc2644. It allows the router to forward directed broadcast when sysctl bc_forwarding is enabled. Note that this feature could be done by iptables -j TEE, but it would cause some problems: - target TEE's gateway param has to be set with a specific address, and it's not flexible especially when the route wants forward all directed broadcasts. - this duplicates the directed broadcasts so this may cause side effects to applications. Besides, to keep consistent with other os router like BSD, it's also necessary to implement it in the route rx path. Note that route cache needs to be flushed when bc_forwarding is changed. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 1 + include/uapi/linux/ip.h | 1 + include/uapi/linux/netconf.h | 1 + net/ipv4/devinet.c | 11 +++++++++++ net/ipv4/route.c | 6 +++++- 5 files changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 27650f1bff3d..c759d1cbcedd 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) +#define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index b24a742beae5..e42d13b55cf3 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -168,6 +168,7 @@ enum IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, IPV4_DEVCONF_DROP_GRATUITOUS_ARP, + IPV4_DEVCONF_BC_FORWARDING, __IPV4_DEVCONF_MAX }; diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h index c84fcdfca862..fac4edd55379 100644 --- a/include/uapi/linux/netconf.h +++ b/include/uapi/linux/netconf.h @@ -18,6 +18,7 @@ enum { NETCONFA_PROXY_NEIGH, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_INPUT, + NETCONFA_BC_FORWARDING, __NETCONFA_MAX }; #define NETCONFA_MAX (__NETCONFA_MAX - 1) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7585ab1a77a..ea4bd8a52422 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (all || type == NETCONFA_BC_FORWARDING) + 
size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((all || type == NETCONFA_BC_FORWARDING) && + nla_put_s32(skb, NETCONFA_BC_FORWARDING, + IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0) + goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) @@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && + new_value != old_value) + rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); @@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), + DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1df6e97106d7..b678466da451 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto no_route; } - if (res->type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) { + if (IN_DEV_BFORWARD(in_dev)) + goto make_route; goto brd_input; + } if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto 
martian_destination; +make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; -- cgit v1.2.3 From 3e7a50ceb11ea75c27e944f1a01e478fd62a2d8d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 27 Jul 2018 13:43:22 -0700 Subject: net: report min and max mtu network device settings Report the minimum and maximum MTU allowed on a device via netlink so that it can be displayed by tools like ip link. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 553c438cabe3..43391e2d1153 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -164,6 +164,8 @@ enum { IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 92b6fa5d5f6e..510d4f765a13 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1015,6 +1015,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_IF_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + + nla_total_size(4) /* IFLA_MIN_MTU */ + + nla_total_size(4) /* IFLA_MAX_MTU */ + 0; } @@ -1601,6 +1603,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || + nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || + nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || @@ -1732,6 +1736,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IF_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, + [IFLA_MIN_MTU] = { .type = NLA_U32 }, + [IFLA_MAX_MTU] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From 7a4c53bee3324ac00bf964aa2f82d15d279e86e4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 27 Jul 2018 13:43:23 -0700 Subject: net: report invalid mtu value via netlink extack If an invalid MTU value is set through rtnetlink return extra error information instead of putting message in kernel log. For other cases where there is no visible API, keep the error report in the log. Example: # ip li set dev enp12s0 mtu 10000 Error: mtu greater than device maximum. # ifconfig enp12s0 mtu 10000 SIOCSIFMTU: Invalid argument # dmesg | tail -1 [ 2047.795467] enp12s0: mtu greater than device maximum Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 23 +++++++++++++++++------ net/core/rtnetlink.c | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1295c7a452e..9c917467a2c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3546,6 +3546,8 @@ int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); int dev_change_tx_queue_len(struct net_device *, unsigned long); void dev_set_group(struct net_device *, int); diff --git a/net/core/dev.c b/net/core/dev.c index 87c42c8249ae..89031b5fef9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7523,13 +7523,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(__dev_set_mtu); /** - * dev_set_mtu - Change maximum transfer unit + * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit + * @extack: netlink extended ack * * Change the maximum transfer size of the network device. 
*/ -int dev_set_mtu(struct net_device *dev, int new_mtu) +int dev_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { int err, orig_mtu; @@ -7538,14 +7540,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { - net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", - dev->name, new_mtu, dev->min_mtu); + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { - net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", - dev->name, new_mtu, dev->max_mtu); + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } @@ -7573,6 +7573,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) } return err; } + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + struct netlink_ext_ack extack; + int err; + + err = dev_set_mtu_ext(dev, new_mtu, &extack); + if (err) + net_err_ratelimited("%s: %s\n", dev->name, extack._msg); + return err; +} EXPORT_SYMBOL(dev_set_mtu); /** diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 510d4f765a13..24431e578310 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2382,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MTU]) { - err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; -- cgit v1.2.3 From 0a80848ec5cc1294984e648b9a71aecf69c4bb73 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Jul 2018 18:29:01 +0800 Subject: act_pedit: remove unnecessary semicolon net/sched/act_pedit.c:289:2-3: Unneeded semicolon Remove unneeded semicolon. Generated by: scripts/coccinelle/misc/semicolon.cocci Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 9ab5d81aff1a..43ba999b2d23 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -286,7 +286,7 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb, default: ret = -EINVAL; break; - }; + } return ret; } -- cgit v1.2.3 From f9562fa4a5750d097f4468c0a7fc9a4e0d2dfdc3 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Jul 2018 18:35:15 +0800 Subject: cls_bpf: Use kmemdup instead of duplicating it in cls_bpf_prog_from_ops Replace calls to kmalloc followed by a memcpy with a direct call to kmemdup. Signed-off-by: YueHaibing Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 66e0ac9811f9..fa6fe2fe0f32 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -349,12 +349,10 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) if (bpf_size != nla_len(tb[TCA_BPF_OPS])) return -EINVAL; - bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; - memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; -- cgit v1.2.3 From 3f6bcc5162a1ba4e99e867364919168c1d821308 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Jul 2018 18:38:06 +0800 Subject: act_bpf: Use kmemdup instead of duplicating it in tcf_bpf_init_from_ops Replace calls to kmalloc followed by a memcpy with a direct call to kmemdup. Signed-off-by: YueHaibing Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- net/sched/act_bpf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 06f743d8ed41..6203eb075c9a 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -196,12 +196,10 @@ static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS])) return -EINVAL; - bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; - memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size); - fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; -- cgit v1.2.3 From 222440b4e832059c0ddf18d1e409f0552ab53a7d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Jul 2018 12:48:04 +0200 Subject: netfilter: nf_tables: handle meta/lookup with direct call Currently nft uses inlined variants for common operations such as 'ip saddr 1.2.3.4' instead of an indirect call. Also handle meta get operations and lookups without indirect call, both are builtin. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 7 +++++++ net/netfilter/nf_tables_core.c | 16 +++++++++++++++- net/netfilter/nft_lookup.c | 6 +++--- net/netfilter/nft_meta.c | 6 +++--- 4 files changed, 28 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index a05134507e7b..8da837d2aaf9 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -71,4 +71,11 @@ extern struct nft_set_type nft_set_hash_fast_type; extern struct nft_set_type nft_set_rbtree_type; extern struct nft_set_type nft_set_bitmap_type; +struct nft_expr; +struct nft_regs; +struct nft_pktinfo; +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); +void nft_lookup_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 8de912ca53d3..ffd5c0f9412b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -120,6 +120,20 @@ struct nft_jumpstack { struct nft_rule *const *rules; }; +static void expr_call_ops_eval(const struct nft_expr *expr, + struct nft_regs *regs, + struct nft_pktinfo *pkt) +{ + unsigned long e = (unsigned long)expr->ops->eval; + + if (e == (unsigned long)nft_meta_get_eval) + nft_meta_get_eval(expr, regs, pkt); + else if (e == (unsigned long)nft_lookup_eval) + nft_lookup_eval(expr, regs, pkt); + else + expr->ops->eval(expr, regs, pkt); +} + unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv) { @@ -153,7 +167,7 @@ next_rule: nft_cmp_fast_eval(expr, &regs); else if (expr->ops != &nft_payload_fast_ops || !nft_payload_fast_eval(expr, &regs, pkt)) - expr->ops->eval(expr, &regs, pkt); + expr_call_ops_eval(expr, &regs, pkt); if (regs.verdict.code != NFT_CONTINUE) break; 
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c2a1d84cdfc4..ad13e8643599 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -26,9 +26,9 @@ struct nft_lookup { struct nft_set_binding binding; }; -static void nft_lookup_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_lookup_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2b94dcc43456..297fe7d97c18 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -41,9 +41,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); #include "../bridge/br_private.h" #endif -static void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; -- cgit v1.2.3 From 6decb5b45e70d6ffff6488cc8e8bad6b9ac7f99b Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:32 +0530 Subject: Bluetooth: Define PHY flags in hdev and set 1M as default 1M is mandatory to be supported by LE controllers and the same would be set in power on. This patch defines hdev flags for LE PHYs and set 1M to default. 
Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 ++++ include/net/bluetooth/hci_core.h | 3 +++ net/bluetooth/hci_core.c | 9 +++++---- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 73e48be5bbb3..664fe1ebf2c7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1514,6 +1514,10 @@ struct hci_cp_le_set_default_phy { __u8 rx_phys; } __packed; +#define HCI_LE_SET_PHY_1M 0x01 +#define HCI_LE_SET_PHY_2M 0x02 +#define HCI_LE_SET_PHY_CODED 0x04 + #define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 struct hci_cp_le_set_ext_scan_params { __u8 own_addr_type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a74453571264..71f79df9ee05 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -315,6 +315,9 @@ struct hci_dev { unsigned long sco_last_tx; unsigned long le_last_tx; + __u8 le_tx_def_phys; + __u8 le_rx_def_phys; + struct workqueue_struct *workqueue; struct workqueue_struct *req_workqueue; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f5c21004186c..432f89f390c0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -830,10 +830,9 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[35] & 0x20) { struct hci_cp_le_set_default_phy cp; - /* No transmitter PHY or receiver PHY preferences */ - cp.all_phys = 0x03; - cp.tx_phys = 0; - cp.rx_phys = 0; + cp.all_phys = 0x00; + cp.tx_phys = hdev->le_tx_def_phys; + cp.rx_phys = hdev->le_rx_def_phys; hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp); } @@ -3027,6 +3026,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; + hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; + hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->rpa_timeout = 
HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; -- cgit v1.2.3 From 6244691fec4dd0adebca255e60e0ed7ac8155b2e Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:34 +0530 Subject: Bluetooth: Implement Get PHY Configuration mgmt command This command basically retrieves the supported packet types of BREDR and supported PHYs of the controller. BR_1M_1SLOT, LE_1M_TX and LE_1M_RX would be supported by default. Other PHYs are supported based on the local features. Also this sets PHY_CONFIGURATION bit in supported settings. @ MGMT Command: Get PHY Configuration (0x0044) plen 0 @ MGMT Event: Command Complete (0x0001) plen 15 Get PHY Configuration (0x0044) plen 12 Status: Success (0x00) Supported PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Configurable PHYs: 0x79fe BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 2M TX LE 2M RX LE CODED TX LE CODED RX Selected PHYs: 0x07ff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 25 ++++++++ net/bluetooth/mgmt.c | 145 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index e7303eee65cd..1c93d6e83a6c 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -101,6 +101,7 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_PRIVACY 0x00002000 #define MGMT_SETTING_CONFIGURATION 0x00004000 #define MGMT_SETTING_STATIC_ADDRESS 0x00008000 +#define MGMT_SETTING_PHY_CONFIGURATION 0x00010000 #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 
@@ -604,6 +605,30 @@ struct mgmt_cp_set_appearance { } __packed; #define MGMT_SET_APPEARANCE_SIZE 2 +#define MGMT_OP_GET_PHY_CONFIGURATION 0x0044 +struct mgmt_rp_get_phy_confguration { + __le32 supported_phys; + __le32 configurable_phys; + __le32 selected_phys; +} __packed; +#define MGMT_GET_PHY_CONFIGURATION_SIZE 0 + +#define MGMT_PHY_BR_1M_1SLOT 0x00000001 +#define MGMT_PHY_BR_1M_3SLOT 0x00000002 +#define MGMT_PHY_BR_1M_5SLOT 0x00000004 +#define MGMT_PHY_EDR_2M_1SLOT 0x00000008 +#define MGMT_PHY_EDR_2M_3SLOT 0x00000010 +#define MGMT_PHY_EDR_2M_5SLOT 0x00000020 +#define MGMT_PHY_EDR_3M_1SLOT 0x00000040 +#define MGMT_PHY_EDR_3M_3SLOT 0x00000080 +#define MGMT_PHY_EDR_3M_5SLOT 0x00000100 +#define MGMT_PHY_LE_1M_TX 0x00000200 +#define MGMT_PHY_LE_1M_RX 0x00000400 +#define MGMT_PHY_LE_2M_TX 0x00000800 +#define MGMT_PHY_LE_2M_RX 0x00001000 +#define MGMT_PHY_LE_CODED_TX 0x00002000 +#define MGMT_PHY_LE_CODED_RX 0x00004000 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 8a80d48d89c4..c8c3b39fa9f2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -617,6 +617,127 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +static u32 get_supported_phys(struct hci_dev *hdev) +{ + u32 supported_phys = 0; + + if (lmp_bredr_capable(hdev)) { + supported_phys |= MGMT_PHY_BR_1M_1SLOT; + + if (hdev->features[0][0] & LMP_3SLOT) + supported_phys |= MGMT_PHY_BR_1M_3SLOT; + + if (hdev->features[0][0] & LMP_5SLOT) + supported_phys |= MGMT_PHY_BR_1M_5SLOT; + + if (lmp_edr_2m_capable(hdev)) { + supported_phys |= MGMT_PHY_EDR_2M_1SLOT; + + if (lmp_edr_3slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_2M_3SLOT; + + if (lmp_edr_5slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_2M_5SLOT; + + if (lmp_edr_3m_capable(hdev)) { + supported_phys |= MGMT_PHY_EDR_3M_1SLOT; + + if (lmp_edr_3slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_3M_3SLOT; + 
+ if (lmp_edr_5slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_3M_5SLOT; + } + } + } + + if (lmp_le_capable(hdev)) { + supported_phys |= MGMT_PHY_LE_1M_TX; + supported_phys |= MGMT_PHY_LE_1M_RX; + + if (hdev->le_features[1] & HCI_LE_PHY_2M) { + supported_phys |= MGMT_PHY_LE_2M_TX; + supported_phys |= MGMT_PHY_LE_2M_RX; + } + + if (hdev->le_features[1] & HCI_LE_PHY_CODED) { + supported_phys |= MGMT_PHY_LE_CODED_TX; + supported_phys |= MGMT_PHY_LE_CODED_RX; + } + } + + return supported_phys; +} + +static u32 get_selected_phys(struct hci_dev *hdev) +{ + u32 selected_phys = 0; + + if (lmp_bredr_capable(hdev)) { + selected_phys |= MGMT_PHY_BR_1M_1SLOT; + + if (hdev->pkt_type & (HCI_DM3 | HCI_DH3)) + selected_phys |= MGMT_PHY_BR_1M_3SLOT; + + if (hdev->pkt_type & (HCI_DM5 | HCI_DH5)) + selected_phys |= MGMT_PHY_BR_1M_5SLOT; + + if (lmp_edr_2m_capable(hdev)) { + if (!(hdev->pkt_type & HCI_2DH1)) + selected_phys |= MGMT_PHY_EDR_2M_1SLOT; + + if (lmp_edr_3slot_capable(hdev) && + !(hdev->pkt_type & HCI_2DH3)) + selected_phys |= MGMT_PHY_EDR_2M_3SLOT; + + if (lmp_edr_5slot_capable(hdev) && + !(hdev->pkt_type & HCI_2DH5)) + selected_phys |= MGMT_PHY_EDR_2M_5SLOT; + + if (lmp_edr_3m_capable(hdev)) { + if (!(hdev->pkt_type & HCI_3DH1)) + selected_phys |= MGMT_PHY_EDR_3M_1SLOT; + + if (lmp_edr_3slot_capable(hdev) && + !(hdev->pkt_type & HCI_3DH3)) + selected_phys |= MGMT_PHY_EDR_3M_3SLOT; + + if (lmp_edr_5slot_capable(hdev) && + !(hdev->pkt_type & HCI_3DH5)) + selected_phys |= MGMT_PHY_EDR_3M_5SLOT; + } + } + } + + if (lmp_le_capable(hdev)) { + if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M) + selected_phys |= MGMT_PHY_LE_1M_TX; + + if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M) + selected_phys |= MGMT_PHY_LE_1M_RX; + + if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M) + selected_phys |= MGMT_PHY_LE_2M_TX; + + if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M) + selected_phys |= MGMT_PHY_LE_2M_RX; + + if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED) + selected_phys |= 
MGMT_PHY_LE_CODED_TX; + + if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED) + selected_phys |= MGMT_PHY_LE_CODED_RX; + } + + return selected_phys; +} + +static u32 get_configurable_phys(struct hci_dev *hdev) +{ + return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT & + ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX); +} + static u32 get_supported_settings(struct hci_dev *hdev) { u32 settings = 0; @@ -654,6 +775,8 @@ static u32 get_supported_settings(struct hci_dev *hdev) hdev->set_bdaddr) settings |= MGMT_SETTING_CONFIGURATION; + settings |= MGMT_SETTING_PHY_CONFIGURATION; + return settings; } @@ -3184,6 +3307,27 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, return err; } +static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_rp_get_phy_confguration rp; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + + rp.supported_phys = cpu_to_le32(get_supported_phys(hdev)); + rp.selected_phys = cpu_to_le32(get_selected_phys(hdev)); + rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev)); + + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0, + &rp, sizeof(rp)); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -6544,6 +6688,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, { set_appearance, MGMT_SET_APPEARANCE_SIZE }, + { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From 0314f2867fa0c46d0fc1c23c80e7fab9435079df Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:35 +0530 Subject: Bluetooth: Implement Set PHY Confguration command This enables user to set phys which will be used in all subsequent connections. 
Also host will use the same in LE scanning as well. @ MGMT Command: Set PHY Configuration (0x0045) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX < HCI Command: LE Set Default PHY (0x08|0x0031) plen 3 All PHYs preference: 0x00 TX PHYs preference: 0x07 LE 1M LE 2M LE Coded RX PHYs preference: 0x07 LE 1M LE 2M LE Coded > HCI Event: Command Complete (0x0e) plen 4 LE Set Default PHY (0x08|0x0031) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 3 Set PHY Configuration (0x0045) plen 0 Status: Success (0x00) @ MGMT Event: PHY Configuration Changed (0x0026) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 19 +++++ net/bluetooth/hci_event.c | 26 +++++++ net/bluetooth/mgmt.c | 182 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 1c93d6e83a6c..0916e203e5d9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -629,6 +629,25 @@ struct mgmt_rp_get_phy_confguration { #define MGMT_PHY_LE_CODED_TX 0x00002000 #define MGMT_PHY_LE_CODED_RX 0x00004000 +#define MGMT_PHY_BREDR_MASK (MGMT_PHY_BR_1M_1SLOT | MGMT_PHY_BR_1M_3SLOT | \ + MGMT_PHY_BR_1M_5SLOT | MGMT_PHY_EDR_2M_1SLOT | \ + MGMT_PHY_EDR_2M_3SLOT | MGMT_PHY_EDR_2M_5SLOT | \ + MGMT_PHY_EDR_3M_1SLOT | MGMT_PHY_EDR_3M_3SLOT | \ + MGMT_PHY_EDR_3M_5SLOT) +#define MGMT_PHY_LE_MASK (MGMT_PHY_LE_1M_TX | MGMT_PHY_LE_1M_RX | \ + MGMT_PHY_LE_2M_TX | MGMT_PHY_LE_2M_RX | \ + MGMT_PHY_LE_CODED_TX | MGMT_PHY_LE_CODED_RX) +#define MGMT_PHY_LE_TX_MASK 
(MGMT_PHY_LE_1M_TX | MGMT_PHY_LE_2M_TX | \ + MGMT_PHY_LE_CODED_TX) +#define MGMT_PHY_LE_RX_MASK (MGMT_PHY_LE_1M_RX | MGMT_PHY_LE_2M_RX | \ + MGMT_PHY_LE_CODED_RX) + +#define MGMT_OP_SET_PHY_CONFIGURATION 0x0045 +struct mgmt_cp_set_phy_confguration { + __le32 selected_phys; +} __packed; +#define MGMT_SET_PHY_CONFIGURATION_SIZE 4 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 68192152c23b..694231541a4c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1042,6 +1042,28 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + struct hci_cp_le_set_default_phy *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY); + if (!cp) + return; + + hci_dev_lock(hdev); + + hdev->le_tx_def_phys = cp->tx_phys; + hdev->le_rx_def_phys = cp->rx_phys; + + hci_dev_unlock(hdev); +} + static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { __u8 *sent, status = *((__u8 *) skb->data); @@ -3163,6 +3185,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_ext_scan_enable(hdev, skb); break; + case HCI_OP_LE_SET_DEFAULT_PHY: + hci_cc_le_set_default_phy(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c8c3b39fa9f2..7cd6a37a63ee 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3328,6 +3328,187 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +static void set_default_phy_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct 
mgmt_cp_set_phy_confguration *cp; + struct mgmt_pending_cmd *cmd; + + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + cmd = pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev); + if (!cmd) + goto unlock; + + cp = cmd->param; + + if (status) { + mgmt_cmd_status(cmd->sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + mgmt_status(status)); + } else { + mgmt_cmd_complete(cmd->sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, 0, + NULL, 0); + } + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + +static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_set_phy_confguration *cp = data; + struct hci_cp_le_set_default_phy cp_phy; + struct mgmt_pending_cmd *cmd; + struct hci_request req; + u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys; + u16 pkt_type = (HCI_DH1 | HCI_DM1); + int err; + + BT_DBG("sock %p %s", sk, hdev->name); + + configurable_phys = get_configurable_phys(hdev); + supported_phys = get_supported_phys(hdev); + selected_phys = __le32_to_cpu(cp->selected_phys); + + if (selected_phys & ~supported_phys) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_INVALID_PARAMS); + + unconfigure_phys = supported_phys & ~configurable_phys; + + if ((selected_phys & unconfigure_phys) != unconfigure_phys) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_INVALID_PARAMS); + + if (selected_phys == get_selected_phys(hdev)) + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + 0, NULL, 0); + + hci_dev_lock(hdev); + + if (!hdev_is_powered(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_REJECTED); + goto unlock; + } + + if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_BUSY); + goto unlock; + } + + if (selected_phys & MGMT_PHY_BR_1M_3SLOT) + pkt_type |= 
(HCI_DH3 | HCI_DM3); + else + pkt_type &= ~(HCI_DH3 | HCI_DM3); + + if (selected_phys & MGMT_PHY_BR_1M_5SLOT) + pkt_type |= (HCI_DH5 | HCI_DM5); + else + pkt_type &= ~(HCI_DH5 | HCI_DM5); + + if (selected_phys & MGMT_PHY_EDR_2M_1SLOT) + pkt_type &= ~HCI_2DH1; + else + pkt_type |= HCI_2DH1; + + if (selected_phys & MGMT_PHY_EDR_2M_3SLOT) + pkt_type &= ~HCI_2DH3; + else + pkt_type |= HCI_2DH3; + + if (selected_phys & MGMT_PHY_EDR_2M_5SLOT) + pkt_type &= ~HCI_2DH5; + else + pkt_type |= HCI_2DH5; + + if (selected_phys & MGMT_PHY_EDR_3M_1SLOT) + pkt_type &= ~HCI_3DH1; + else + pkt_type |= HCI_3DH1; + + if (selected_phys & MGMT_PHY_EDR_3M_3SLOT) + pkt_type &= ~HCI_3DH3; + else + pkt_type |= HCI_3DH3; + + if (selected_phys & MGMT_PHY_EDR_3M_5SLOT) + pkt_type &= ~HCI_3DH5; + else + pkt_type |= HCI_3DH5; + + if (pkt_type != hdev->pkt_type) + hdev->pkt_type = pkt_type; + + if ((selected_phys & MGMT_PHY_LE_MASK) == + (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) { + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + 0, NULL, 0); + + goto unlock; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, + len); + if (!cmd) { + err = -ENOMEM; + goto unlock; + } + + hci_req_init(&req, hdev); + + memset(&cp_phy, 0, sizeof(cp_phy)); + + if (!(selected_phys & MGMT_PHY_LE_TX_MASK)) + cp_phy.all_phys |= 0x01; + + if (!(selected_phys & MGMT_PHY_LE_RX_MASK)) + cp_phy.all_phys |= 0x02; + + if (selected_phys & MGMT_PHY_LE_1M_TX) + cp_phy.tx_phys |= HCI_LE_SET_PHY_1M; + + if (selected_phys & MGMT_PHY_LE_2M_TX) + cp_phy.tx_phys |= HCI_LE_SET_PHY_2M; + + if (selected_phys & MGMT_PHY_LE_CODED_TX) + cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED; + + if (selected_phys & MGMT_PHY_LE_1M_RX) + cp_phy.rx_phys |= HCI_LE_SET_PHY_1M; + + if (selected_phys & MGMT_PHY_LE_2M_RX) + cp_phy.rx_phys |= HCI_LE_SET_PHY_2M; + + if (selected_phys & MGMT_PHY_LE_CODED_RX) + cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED; + + hci_req_add(&req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), 
&cp_phy); + + err = hci_req_run_skb(&req, set_default_phy_complete); + if (err < 0) + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -6689,6 +6870,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { HCI_MGMT_UNTRUSTED }, { set_appearance, MGMT_SET_APPEARANCE_SIZE }, { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, + { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From b7c23df85b6a1c3bcfb591cfa938d341fc3a556e Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:36 +0530 Subject: Bluetooth: Implement PHY changed event This defines and implements the PHY changed event and sends it to the user whenever the selected PHYs change using SET_PHY_CONFIGURATION. This will also be triggered when the BREDR pkt_type is changed using the legacy ioctl HCISETPTYPE.
@ MGMT Command: Set PHY Configuration (0x0045) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX < HCI Command: LE Set Default PHY (0x08|0x0031) plen 3 All PHYs preference: 0x00 TX PHYs preference: 0x07 LE 1M LE 2M LE Coded RX PHYs preference: 0x07 LE 1M LE 2M LE Coded > HCI Event: Command Complete (0x0e) plen 4 LE Set Default PHY (0x08|0x0031) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 3 Set PHY Configuration (0x0045) plen 0 Status: Success (0x00) @ MGMT Event: PHY Configuration Changed (0x0026) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 5 +++++ net/bluetooth/hci_core.c | 4 ++++ net/bluetooth/mgmt.c | 22 +++++++++++++++++++++- 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a64d13f91d09..ab5d494a545a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1544,6 +1544,7 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); +int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 0916e203e5d9..7f372e9067c9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -868,3 +868,8 @@ struct mgmt_ev_ext_info_changed { __le16 
eir_len; __u8 eir[0]; } __packed; + +#define MGMT_EV_PHY_CONFIGURATION_CHANGED 0x0026 +struct mgmt_ev_phy_configuration_changed { + __le32 selected_phys; +} __packed; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 432f89f390c0..523e91ad64d0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1924,7 +1924,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) break; case HCISETPTYPE: + if (hdev->pkt_type == (__u16) dr.dev_opt) + break; + hdev->pkt_type = (__u16) dr.dev_opt; + mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7cd6a37a63ee..1867aadc5061 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3328,6 +3328,18 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) +{ + struct mgmt_ev_phy_configuration_changed ev; + + memset(&ev, 0, sizeof(ev)); + + ev.selected_phys = cpu_to_le32(get_selected_phys(hdev)); + + return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev, + sizeof(ev), skip); +} + static void set_default_phy_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -3352,6 +3364,8 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status, mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); + + mgmt_phy_configuration_changed(hdev, cmd->sk); } mgmt_pending_remove(cmd); @@ -3369,6 +3383,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, struct hci_request req; u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys; u16 pkt_type = (HCI_DH1 | HCI_DM1); + bool changed = false; int err; BT_DBG("sock %p %s", sk, hdev->name); @@ -3450,11 +3465,16 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, else pkt_type |= HCI_3DH5; - if (pkt_type != hdev->pkt_type) + if 
(pkt_type != hdev->pkt_type) { hdev->pkt_type = pkt_type; + changed = true; + } if ((selected_phys & MGMT_PHY_LE_MASK) == (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) { + if (changed) + mgmt_phy_configuration_changed(hdev, sk); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); -- cgit v1.2.3 From 45bdd86eafc7d29e0b4b6681bec9c6ab8eddc6bf Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:37 +0530 Subject: Bluetooth: Set Scan PHYs based on selected PHYs by user Use the PHYs selected in Set Phy Configuration management command while scanning. < HCI Command: LE Set Extended Scan Parameters (0x08|0x0041) plen 13 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x05 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) Entry 1: LE Coded Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 +++- include/net/bluetooth/hci_core.h | 9 +++++++++ net/bluetooth/hci_request.c | 37 ++++++++++++++++++++++++++++--------- 3 files changed, 40 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 89bf800f6eb1..04211457367a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1540,7 +1540,9 @@ struct hci_cp_le_set_ext_scan_params { __u8 data[0]; } __packed; -#define LE_SCAN_PHY_1M 0x01 +#define LE_SCAN_PHY_1M 
0x01 +#define LE_SCAN_PHY_2M 0x02 +#define LE_SCAN_PHY_CODED 0x04 struct hci_cp_le_scan_phy_params { __u8 type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ab5d494a545a..113c9bb609c7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1165,6 +1165,15 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +#define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) + +#define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) + +#define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) + /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40)) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index faf7c711234c..215059a7646e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -790,8 +790,8 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, struct hci_cp_le_set_ext_scan_params *ext_param_cp; struct hci_cp_le_set_ext_scan_enable ext_enable_cp; struct hci_cp_le_scan_phy_params *phy_params; - /* Ony single PHY (1M) is supported as of now */ - u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 1]; + u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 2]; + u32 plen; ext_param_cp = (void *)data; phy_params = (void *)ext_param_cp->data; @@ -799,16 +799,35 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, memset(ext_param_cp, 0, sizeof(*ext_param_cp)); ext_param_cp->own_addr_type = own_addr_type; ext_param_cp->filter_policy = filter_policy; - ext_param_cp->scanning_phys = LE_SCAN_PHY_1M; - 
memset(phy_params, 0, sizeof(*phy_params)); - phy_params->type = type; - phy_params->interval = cpu_to_le16(interval); - phy_params->window = cpu_to_le16(window); + plen = sizeof(*ext_param_cp); + + if (scan_1m(hdev) || scan_2m(hdev)) { + ext_param_cp->scanning_phys |= LE_SCAN_PHY_1M; + + memset(phy_params, 0, sizeof(*phy_params)); + phy_params->type = type; + phy_params->interval = cpu_to_le16(interval); + phy_params->window = cpu_to_le16(window); + + plen += sizeof(*phy_params); + phy_params++; + } + + if (scan_coded(hdev)) { + ext_param_cp->scanning_phys |= LE_SCAN_PHY_CODED; + + memset(phy_params, 0, sizeof(*phy_params)); + phy_params->type = type; + phy_params->interval = cpu_to_le16(interval); + phy_params->window = cpu_to_le16(window); + + plen += sizeof(*phy_params); + phy_params++; + } hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS, - sizeof(*ext_param_cp) + sizeof(*phy_params), - ext_param_cp); + plen, ext_param_cp); memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); ext_enable_cp.enable = LE_SCAN_ENABLE; -- cgit v1.2.3 From b2cc9761f144e8ef714be8c590603073b80ddc13 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:38 +0530 Subject: Bluetooth: Handle extended ADV PDU types This patch defines the extended ADV types and handles them in the ADV report.
Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ net/bluetooth/hci_event.c | 50 +++++++++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 04211457367a..83a1593a128e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1976,6 +1976,14 @@ struct hci_ev_le_conn_complete { #define LE_LEGACY_SCAN_RSP_ADV 0x001b #define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a +/* Extended Advertising event types */ +#define LE_EXT_ADV_NON_CONN_IND 0x0000 +#define LE_EXT_ADV_CONN_IND 0x0001 +#define LE_EXT_ADV_SCAN_IND 0x0002 +#define LE_EXT_ADV_DIRECT_IND 0x0004 +#define LE_EXT_ADV_SCAN_RSP 0x0008 +#define LE_EXT_ADV_LEGACY_PDU 0x0010 + #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 694231541a4c..5fa00f488cfc 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5137,21 +5137,45 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } -static u8 convert_legacy_evt_type(u16 evt_type) -{ - switch (evt_type) { - case LE_LEGACY_ADV_IND: +static u8 ext_evt_type_to_legacy(u16 evt_type) +{ + if (evt_type & LE_EXT_ADV_LEGACY_PDU) { + switch (evt_type) { + case LE_LEGACY_ADV_IND: + return LE_ADV_IND; + case LE_LEGACY_ADV_DIRECT_IND: + return LE_ADV_DIRECT_IND; + case LE_LEGACY_ADV_SCAN_IND: + return LE_ADV_SCAN_IND; + case LE_LEGACY_NONCONN_IND: + return LE_ADV_NONCONN_IND; + case LE_LEGACY_SCAN_RSP_ADV: + case LE_LEGACY_SCAN_RSP_ADV_SCAN: + return LE_ADV_SCAN_RSP; + } + + BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", + evt_type); + + return LE_ADV_INVALID; + } + + if (evt_type & LE_EXT_ADV_CONN_IND) { + if (evt_type & LE_EXT_ADV_DIRECT_IND) + return LE_ADV_DIRECT_IND; + return LE_ADV_IND; - case 
LE_LEGACY_ADV_DIRECT_IND: - return LE_ADV_DIRECT_IND; - case LE_LEGACY_ADV_SCAN_IND: + } + + if (evt_type & LE_EXT_ADV_SCAN_RSP) + return LE_ADV_SCAN_RSP; + + if (evt_type & LE_EXT_ADV_SCAN_IND) return LE_ADV_SCAN_IND; - case LE_LEGACY_NONCONN_IND: + + if (evt_type == LE_EXT_ADV_NON_CONN_IND || + evt_type & LE_EXT_ADV_DIRECT_IND) return LE_ADV_NONCONN_IND; - case LE_LEGACY_SCAN_RSP_ADV: - case LE_LEGACY_SCAN_RSP_ADV_SCAN: - return LE_ADV_SCAN_RSP; - } BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", evt_type); @@ -5172,7 +5196,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) u16 evt_type; evt_type = __le16_to_cpu(ev->evt_type); - legacy_evt_type = convert_legacy_evt_type(evt_type); + legacy_evt_type = ext_evt_type_to_legacy(evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &ev->bdaddr, ev->bdaddr_type, NULL, 0, ev->rssi, -- cgit v1.2.3 From 4e6e99e9336ce863449c2570dc1d1d6c2c886ac0 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:39 +0530 Subject: Bluetooth: Use selected PHYs in extended connect Use the selected PHYs by Set PHY Configuration management command in extended create connection. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 61 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index cc967ca67962..64e828ad3951 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -748,6 +748,26 @@ static bool conn_use_rpa(struct hci_conn *conn) return hci_dev_test_flag(hdev, HCI_PRIVACY); } +static void set_ext_conn_params(struct hci_conn *conn, + struct hci_cp_le_ext_conn_param *p) +{ + struct hci_dev *hdev = conn->hdev; + + memset(p, 0, sizeof(*p)); + + /* Set window to be the same value as the interval to + * enable continuous scanning. 
+ */ + p->scan_interval = cpu_to_le16(hdev->le_scan_interval); + p->scan_window = p->scan_interval; + p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); + p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); + p->conn_latency = cpu_to_le16(conn->le_conn_latency); + p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); + p->min_ce_len = cpu_to_le16(0x0000); + p->max_ce_len = cpu_to_le16(0x0000); +} + static void hci_req_add_le_create_conn(struct hci_request *req, struct hci_conn *conn, bdaddr_t *direct_rpa) @@ -777,8 +797,8 @@ static void hci_req_add_le_create_conn(struct hci_request *req, if (use_ext_conn(hdev)) { struct hci_cp_le_ext_create_conn *cp; struct hci_cp_le_ext_conn_param *p; - /* As of now only LE 1M is supported */ - u8 data[sizeof(*cp) + sizeof(*p) * 1]; + u8 data[sizeof(*cp) + sizeof(*p) * 3]; + u32 plen; cp = (void *) data; p = (void *) cp->data; @@ -788,24 +808,33 @@ static void hci_req_add_le_create_conn(struct hci_request *req, bacpy(&cp->peer_addr, &conn->dst); cp->peer_addr_type = conn->dst_type; cp->own_addr_type = own_addr_type; - cp->phys = LE_SCAN_PHY_1M; - memset(p, 0, sizeof(*p)); + plen = sizeof(*cp); - /* Set window to be the same value as the interval to enable - * continuous scanning. 
- */ + if (scan_1m(hdev)) { + cp->phys |= LE_SCAN_PHY_1M; + set_ext_conn_params(conn, p); + + p++; + plen += sizeof(*p); + } + + if (scan_2m(hdev)) { + cp->phys |= LE_SCAN_PHY_2M; + set_ext_conn_params(conn, p); - p->scan_interval = cpu_to_le16(hdev->le_scan_interval); - p->scan_window = p->scan_interval; - p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); - p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); - p->conn_latency = cpu_to_le16(conn->le_conn_latency); - p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); - p->min_ce_len = cpu_to_le16(0x0000); - p->max_ce_len = cpu_to_le16(0x0000); + p++; + plen += sizeof(*p); + } + + if (scan_coded(hdev)) { + cp->phys |= LE_SCAN_PHY_CODED; + set_ext_conn_params(conn, p); + + plen += sizeof(*p); + } - hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, sizeof(data), data); + hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, plen, data); } else { struct hci_cp_le_create_conn cp; -- cgit v1.2.3 From 6b49bcb4bce2ed0f0aefe8e304a8b9cbaeeaa3f0 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:40 +0530 Subject: Bluetooth: Read no of adv sets during init This patch reads the number of advertising sets in the controller during init and save it in hdev. 
Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 7 +++++++ include/net/bluetooth/hci_core.h | 4 ++++ net/bluetooth/hci_core.c | 16 ++++++++++++++-- net/bluetooth/hci_event.c | 18 ++++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 83a1593a128e..3f93ae9765a4 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -410,6 +410,7 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 @@ -1579,6 +1580,12 @@ struct hci_cp_le_ext_conn_param { __le16 max_ce_len; } __packed; +#define HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS 0x203b +struct hci_rp_le_read_num_supported_adv_sets { + __u8 status; + __u8 num_of_sets; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 113c9bb609c7..2aad4a863176 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -222,6 +222,7 @@ struct hci_dev { __u8 le_features[8]; __u8 le_white_list_size; __u8 le_resolv_list_size; + __u8 le_num_of_adv_sets; __u8 le_states[8]; __u8 commands[64]; __u8 hci_ver; @@ -1180,6 +1181,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext create connection if command is supported */ #define use_ext_conn(dev) ((dev)->commands[37] & 0x80) +/* Extended advertising support */ +#define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 523e91ad64d0..7b08b7f57418 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -715,8 +715,14 @@ static int 
hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); - if (hdev->commands[25] & 0x40) { - /* Read LE Advertising Channel TX Power */ + /* Read LE Advertising Channel TX Power */ + if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) { + /* HCI TS spec forbids mixing of legacy and extended + * advertising commands wherein READ_ADV_TX_POWER is + * also included. So do not call it if extended adv + * is supported otherwise controller will return + * COMMAND_DISALLOWED for extended commands. + */ hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); } @@ -750,6 +756,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL); } + if (ext_adv_capable(hdev)) { + /* Read LE Number of Supported Advertising Sets */ + hci_req_add(req, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS, + 0, NULL); + } + hci_set_le_support(req); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5fa00f488cfc..0ceb52edc142 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1267,6 +1267,20 @@ static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev, le_set_scan_enable_complete(hdev, cp->enable); } +static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_le_read_num_supported_adv_sets *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x No of Adv sets %u", hdev->name, rp->status, + rp->num_of_sets); + + if (rp->status) + return; + + hdev->le_num_of_adv_sets = rp->num_of_sets; +} + static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3189,6 +3203,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_default_phy(hdev, skb); break; + case HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS: + hci_cc_le_read_num_adv_sets(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, 
*opcode); break; -- cgit v1.2.3 From de181e887ac27dadda127c7d4c3e89c6da8fb6d2 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:41 +0530 Subject: Bluetooth: Implement extended adv enable This patch basically replaces legacy adv with extended adv based on the controller support. Currently there is no design change, i.e. only one adv set will be enabled at a time. This also adds tx_power to the instance and stores whatever is returned from Set_ext_parameter, and uses the same in adv data as well. For instance 0 tx_power is stored in hdev only. < HCI Command: LE Set Extended Advertising Parameters (0x08|0x0036) plen 25 Handle: 0x00 Properties: 0x0010 Use legacy advertising PDUs: ADV_NONCONN_IND Min advertising interval: 1280.000 msec (0x0800) Max advertising interval: 1280.000 msec (0x0800) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: 127 dbm (0x7f) Primary PHY: LE 1M (0x01) Secondary max skip: 0x00 Secondary PHY: LE 1M (0x01) SID: 0x00 Scan request notifications: Disabled (0x00) > HCI Event: Command Complete (0x0e) plen 5 LE Set Extended Advertising Parameters (0x08|0x0036) ncmd 1 Status: Success (0x00) TX power (selected): 7 dbm (0x07) < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 6 Extended advertising: Enabled (0x01) Number of sets: 1 (0x01) Entry 0 Handle: 0x00 Duration: 0 ms (0x00) Max ext adv events: 0 > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 39 +++++++++ include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 2 + net/bluetooth/hci_event.c | 72 +++++++++++++++++ net/bluetooth/hci_request.c | 171 ++++++++++++++++++++++++++++++++++----- 
net/bluetooth/hci_request.h | 3 + net/bluetooth/mgmt.c | 22 +++-- 7 files changed, 285 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3f93ae9765a4..b447b127879e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1586,6 +1586,45 @@ struct hci_rp_le_read_num_supported_adv_sets { __u8 num_of_sets; } __packed; +#define HCI_OP_LE_SET_EXT_ADV_PARAMS 0x2036 +struct hci_cp_le_set_ext_adv_params { + __u8 handle; + __le16 evt_properties; + __u8 min_interval[3]; + __u8 max_interval[3]; + __u8 channel_map; + __u8 own_addr_type; + __u8 peer_addr_type; + bdaddr_t peer_addr; + __u8 filter_policy; + __u8 tx_power; + __u8 primary_phy; + __u8 secondary_max_skip; + __u8 secondary_phy; + __u8 sid; + __u8 notif_enable; +} __packed; + +#define HCI_ADV_PHY_1M 0X01 + +struct hci_rp_le_set_ext_adv_params { + __u8 status; + __u8 tx_power; +} __packed; + +#define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 +struct hci_cp_le_set_ext_adv_enable { + __u8 enable; + __u8 num_of_sets; + __u8 data[0]; +} __packed; + +struct hci_cp_ext_adv_set { + __u8 handle; + __le16 duration; + __u8 max_events; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2aad4a863176..ad3518303a0c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -171,6 +171,7 @@ struct adv_info { __u8 adv_data[HCI_MAX_AD_LENGTH]; __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __s8 tx_power; }; #define HCI_MAX_ADV_INSTANCES 5 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 7b08b7f57418..944d4fedc317 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2779,6 +2779,8 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, else adv_instance->duration = duration; + adv_instance->tx_power = HCI_TX_POWER_INVALID; + 
BT_DBG("%s for %dMR", hdev->name, instance); return 0; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0ceb52edc142..0418a5514819 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1099,6 +1099,41 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *adv_set; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE); + if (!cp) + return; + + adv_set = (void *) cp->data; + + hci_dev_lock(hdev); + + if (cp->enable) { + struct hci_conn *conn; + + hci_dev_set_flag(hdev, HCI_LE_ADV); + + conn = hci_lookup_le_connect(hdev); + if (conn) + queue_delayed_work(hdev->workqueue, + &conn->le_conn_timeout, + conn->conn_timeout); + } + + hci_dev_unlock(hdev); +} + static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_cp_le_set_scan_param *cp; @@ -1486,6 +1521,35 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_le_set_ext_adv_params *rp = (void *) skb->data; + struct hci_cp_le_set_ext_adv_params *cp; + struct adv_info *adv_instance; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS); + if (!cp) + return; + + hci_dev_lock(hdev); + hdev->adv_addr_type = cp->own_addr_type; + if (!hdev->cur_adv_instance) { + /* Store in hdev for instance 0 */ + hdev->adv_tx_power = rp->tx_power; + } else { + adv_instance = hci_find_adv_instance(hdev, + hdev->cur_adv_instance); + if (adv_instance) + adv_instance->tx_power = 
rp->tx_power; + } + hci_dev_unlock(hdev); +} + static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_rssi *rp = (void *) skb->data; @@ -3207,6 +3271,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_read_num_adv_sets(hdev, skb); break; + case HCI_OP_LE_SET_EXT_ADV_PARAMS: + hci_cc_set_ext_adv_param(hdev, skb); + break; + + case HCI_OP_LE_SET_EXT_ADV_ENABLE: + hci_cc_le_set_ext_adv_enable(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 215059a7646e..2ac9fd67440a 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -895,6 +895,24 @@ void hci_req_add_le_passive_scan(struct hci_request *req) hdev->le_scan_window, own_addr_type, filter_policy); } +static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + /* Ignore instance 0 */ + if (instance == 0x00) + return 0; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + + /* TODO: Take into account the "appearance" and "local-name" flags here. + * These are currently being ignored as they are not supported. 
+ */ + return adv_instance->scan_rsp_len; +} + static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) { u8 instance = hdev->cur_adv_instance; @@ -1235,15 +1253,27 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) ptr += adv_instance->adv_data_len; } - /* Provide Tx Power only if we can provide a valid value for it */ - if (hdev->adv_tx_power != HCI_TX_POWER_INVALID && - (instance_flags & MGMT_ADV_FLAG_TX_POWER)) { - ptr[0] = 0x02; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8)hdev->adv_tx_power; + if (instance_flags & MGMT_ADV_FLAG_TX_POWER) { + s8 adv_tx_power; - ad_len += 3; - ptr += 3; + if (ext_adv_capable(hdev)) { + if (adv_instance) + adv_tx_power = adv_instance->tx_power; + else + adv_tx_power = hdev->adv_tx_power; + } else { + adv_tx_power = hdev->adv_tx_power; + } + + /* Provide Tx Power only if we can provide a valid value for it */ + if (adv_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 0x02; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)adv_tx_power; + + ad_len += 3; + ptr += 3; + } } return ad_len; @@ -1304,9 +1334,13 @@ void hci_req_reenable_advertising(struct hci_dev *hdev) __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance, true); } else { - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); - __hci_req_enable_advertising(&req); + if (ext_adv_capable(hdev)) { + __hci_req_start_ext_adv(&req, 0x00); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + __hci_req_enable_advertising(&req); + } } hci_req_run(&req, adv_enable_complete); @@ -1343,6 +1377,87 @@ unlock: hci_dev_unlock(hdev); } +static int __hci_req_setup_ext_adv_instance(struct hci_request *req, + u8 instance) +{ + struct hci_cp_le_set_ext_adv_params cp; + struct hci_dev *hdev = req->hdev; + bool connectable; + u32 flags; + /* In ext adv set param interval is 3 octets */ + const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; + + flags = get_adv_instance_flags(hdev, instance); 
+ + /* If the "connectable" instance flag was not set, then choose between + * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. + */ + connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || + mgmt_get_connectable(hdev); + + if (!is_advertising_allowed(hdev, connectable)) + return -EPERM; + + memset(&cp, 0, sizeof(cp)); + + memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); + memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval)); + + if (connectable) + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); + else if (get_adv_instance_scan_rsp_len(hdev, instance)) + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); + + cp.own_addr_type = BDADDR_LE_PUBLIC; + cp.channel_map = hdev->le_adv_channel_map; + cp.tx_power = 127; + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_1M; + cp.handle = 0; + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); + + return 0; +} + +void __hci_req_enable_ext_advertising(struct hci_request *req) +{ + struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *adv_set; + u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; + + cp = (void *) data; + adv_set = (void *) cp->data; + + memset(cp, 0, sizeof(*cp)); + + cp->enable = 0x01; + cp->num_of_sets = 0x01; + + memset(adv_set, 0, sizeof(*adv_set)); + + adv_set->handle = 0; + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, + sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets, + data); +} + +int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) +{ + int err; + + err = __hci_req_setup_ext_adv_instance(req, instance); + if (err < 0) + return err; + + __hci_req_enable_ext_advertising(req); + + return 0; +} + int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, bool force) { @@ -1396,9 +1511,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, return 0; hdev->cur_adv_instance = instance; - 
__hci_req_update_adv_data(req, instance); - __hci_req_update_scan_rsp_data(req, instance); - __hci_req_enable_advertising(req); + if (ext_adv_capable(hdev)) { + __hci_req_start_ext_adv(req, instance); + } else { + __hci_req_update_adv_data(req, instance); + __hci_req_update_scan_rsp_data(req, instance); + __hci_req_enable_advertising(req); + } return 0; } @@ -1669,8 +1788,12 @@ static int connectable_update(struct hci_request *req, unsigned long opt) /* Update the advertising parameters if necessary */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !list_empty(&hdev->adv_instances)) - __hci_req_enable_advertising(req); + !list_empty(&hdev->adv_instances)) { + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(req, hdev->cur_adv_instance); + else + __hci_req_enable_advertising(req); + } __hci_update_background_scan(req); @@ -1779,8 +1902,12 @@ static int discoverable_update(struct hci_request *req, unsigned long opt) /* Discoverable mode affects the local advertising * address in limited privacy mode. 
*/ - if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) - __hci_req_enable_advertising(req); + if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) { + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(req, 0x00); + else + __hci_req_enable_advertising(req); + } } hci_dev_unlock(hdev); @@ -2376,8 +2503,12 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt) __hci_req_update_adv_data(req, 0x00); __hci_req_update_scan_rsp_data(req, 0x00); - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - __hci_req_enable_advertising(req); + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(req, 0x00); + else + __hci_req_enable_advertising(req); + } } else if (!list_empty(&hdev->adv_instances)) { struct adv_info *adv_instance; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 702beb140d9f..9b8c74df6b2b 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -80,6 +80,9 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, struct hci_request *req, u8 instance, bool force); +int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); +void __hci_req_enable_ext_advertising(struct hci_request *req); + void __hci_req_update_class(struct hci_request *req); /* Returns true if HCI commands were queued */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1867aadc5061..761a9aeaa824 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -940,7 +940,10 @@ static void rpa_expired(struct work_struct *work) * function. */ hci_req_init(&req, hdev); - __hci_req_enable_advertising(&req); + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(&req, hdev->cur_adv_instance); + else + __hci_req_enable_advertising(&req); hci_req_run(&req, NULL); } @@ -4382,9 +4385,14 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, * HCI_ADVERTISING flag is not yet set. 
*/ hdev->cur_adv_instance = 0x00; - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); - __hci_req_enable_advertising(&req); + + if (ext_adv_capable(hdev)) { + __hci_req_start_ext_adv(&req, 0x00); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + __hci_req_enable_advertising(&req); + } } else { __hci_req_disable_advertising(&req); } @@ -6312,7 +6320,11 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; - if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) + /* In extended adv TX_POWER returned from Set Adv Param + * will be always valid. + */ + if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) || + ext_adv_capable(hdev)) flags |= MGMT_ADV_FLAG_TX_POWER; return flags; -- cgit v1.2.3 From a0fb3726ba55138ef6fdd5dc67da6d9a70360696 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:42 +0530 Subject: Bluetooth: Use Set ext adv/scan rsp data if controller supports This patch implements Set Ext Adv data and Set Ext Scan rsp data if controller support extended advertising. 
Currently the operation is set as Complete data and fragment preference is set as no fragment < HCI Command: LE Set Extended Advertising Data (0x08|0x0037) plen 35 Handle: 0x00 Operation: Complete extended advertising data (0x03) Fragment preference: Minimize fragmentation (0x01) Data length: 0x15 16-bit Service UUIDs (complete): 2 entries Heart Rate (0x180d) Battery Service (0x180f) Name (complete): Test LE Company: Google (224) Data: 0102 > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Data (0x08|0x0037) ncmd 1 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 22 ++++++++ net/bluetooth/hci_event.c | 2 + net/bluetooth/hci_request.c | 126 +++++++++++++++++++++++++++++++++----------- net/bluetooth/hci_request.h | 1 + net/bluetooth/mgmt.c | 13 +++-- 5 files changed, 130 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index b447b127879e..aace97099ead 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1625,6 +1625,28 @@ struct hci_cp_ext_adv_set { __u8 max_events; } __packed; +#define HCI_OP_LE_SET_EXT_ADV_DATA 0x2037 +struct hci_cp_le_set_ext_adv_data { + __u8 handle; + __u8 operation; + __u8 frag_pref; + __u8 length; + __u8 data[HCI_MAX_AD_LENGTH]; +} __packed; + +#define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 +struct hci_cp_le_set_ext_scan_rsp_data { + __u8 handle; + __u8 operation; + __u8 frag_pref; + __u8 length; + __u8 data[HCI_MAX_AD_LENGTH]; +} __packed; + +#define LE_SET_ADV_DATA_OP_COMPLETE 0x03 + +#define LE_SET_ADV_DATA_NO_FRAG 0x01 + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0418a5514819..0a92bf7e3d80 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1547,6 +1547,8 @@ static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff 
*skb) if (adv_instance) adv_instance->tx_power = rp->tx_power; } + /* Update adv data as tx power is known now */ + hci_req_update_adv_data(hdev, hdev->cur_adv_instance); hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 2ac9fd67440a..c41e9bb7818b 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1174,29 +1174,58 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_rsp_data cp; u8 len; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; - memset(&cp, 0, sizeof(cp)); + if (ext_adv_capable(hdev)) { + struct hci_cp_le_set_ext_scan_rsp_data cp; - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, cp.data); - else - len = create_default_scan_rsp_data(hdev, cp.data); + memset(&cp, 0, sizeof(cp)); - if (hdev->scan_rsp_data_len == len && - !memcmp(cp.data, hdev->scan_rsp_data, len)) - return; + if (instance) + len = create_instance_scan_rsp_data(hdev, instance, + cp.data); + else + len = create_default_scan_rsp_data(hdev, cp.data); + + if (hdev->scan_rsp_data_len == len && + !memcmp(cp.data, hdev->scan_rsp_data, len)) + return; + + memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + hdev->scan_rsp_data_len = len; + + cp.handle = 0; + cp.length = len; + cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp), + &cp); + } else { + struct hci_cp_le_set_scan_rsp_data cp; + + memset(&cp, 0, sizeof(cp)); + + if (instance) + len = create_instance_scan_rsp_data(hdev, instance, + cp.data); + else + len = create_default_scan_rsp_data(hdev, cp.data); + + if (hdev->scan_rsp_data_len == len && + !memcmp(cp.data, hdev->scan_rsp_data, len)) + return; - memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); - hdev->scan_rsp_data_len 
= len; + memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + hdev->scan_rsp_data_len = len; - cp.length = len; + cp.length = len; - hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); + } } static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) @@ -1282,27 +1311,51 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) void __hci_req_update_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_adv_data cp; u8 len; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; - memset(&cp, 0, sizeof(cp)); + if (ext_adv_capable(hdev)) { + struct hci_cp_le_set_ext_adv_data cp; - len = create_instance_adv_data(hdev, instance, cp.data); + memset(&cp, 0, sizeof(cp)); - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) - return; + len = create_instance_adv_data(hdev, instance, cp.data); + + /* There's nothing to do if the data hasn't changed */ + if (hdev->adv_data_len == len && + memcmp(cp.data, hdev->adv_data, len) == 0) + return; + + memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + hdev->adv_data_len = len; + + cp.length = len; + cp.handle = 0; + cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); - hdev->adv_data_len = len; + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp); + } else { + struct hci_cp_le_set_adv_data cp; + + memset(&cp, 0, sizeof(cp)); - cp.length = len; + len = create_instance_adv_data(hdev, instance, cp.data); + + /* There's nothing to do if the data hasn't changed */ + if (hdev->adv_data_len == len && + memcmp(cp.data, hdev->adv_data, len) == 0) + return; - hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); + memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + hdev->adv_data_len = 
len; + + cp.length = len; + + hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); + } } int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) @@ -1377,8 +1430,7 @@ unlock: hci_dev_unlock(hdev); } -static int __hci_req_setup_ext_adv_instance(struct hci_request *req, - u8 instance) +int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; struct hci_dev *hdev = req->hdev; @@ -1453,6 +1505,7 @@ int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) if (err < 0) return err; + __hci_req_update_scan_rsp_data(req, instance); __hci_req_enable_ext_advertising(req); return 0; @@ -2500,14 +2553,25 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt) */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || list_empty(&hdev->adv_instances)) { - __hci_req_update_adv_data(req, 0x00); - __hci_req_update_scan_rsp_data(req, 0x00); + int err; + + if (ext_adv_capable(hdev)) { + err = __hci_req_setup_ext_adv_instance(req, + 0x00); + if (!err) + __hci_req_update_scan_rsp_data(req, + 0x00); + } else { + err = 0; + __hci_req_update_adv_data(req, 0x00); + __hci_req_update_scan_rsp_data(req, 0x00); + } if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { - if (ext_adv_capable(hdev)) - __hci_req_start_ext_adv(req, 0x00); - else + if (!ext_adv_capable(hdev)) __hci_req_enable_advertising(req); + else if (!err) + __hci_req_enable_ext_advertising(req); } } else if (!list_empty(&hdev->adv_instances)) { struct adv_info *adv_instance; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 9b8c74df6b2b..6afc624605af 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -80,6 +80,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, struct hci_request *req, u8 instance, bool force); +int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); void 
__hci_req_enable_ext_advertising(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 761a9aeaa824..142f7e72a9a2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1847,10 +1847,17 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) */ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { struct hci_request req; - hci_req_init(&req, hdev); - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); + if (ext_adv_capable(hdev)) { + int err; + + err = __hci_req_setup_ext_adv_instance(&req, 0x00); + if (!err) + __hci_req_update_scan_rsp_data(&req, 0x00); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + } hci_req_run(&req, NULL); hci_update_background_scan(hdev); } -- cgit v1.2.3 From 45b7749f16aacd9ffab8e958caa77e2aa2358c0b Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:43 +0530 Subject: Bluetooth: Implement disable and removal of adv instance If ext adv is enabled then use ext adv to disable as well. Also remove the adv set during LE disable. 
< HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 2 Extended advertising: Disabled (0x00) Number of sets: Disable all sets (0x00) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 ++ net/bluetooth/hci_event.c | 2 ++ net/bluetooth/hci_request.c | 23 +++++++++++++++++++++-- net/bluetooth/hci_request.h | 1 + net/bluetooth/mgmt.c | 3 +++ 5 files changed, 29 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index aace97099ead..faa2922a69fd 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1647,6 +1647,8 @@ struct hci_cp_le_set_ext_scan_rsp_data { #define LE_SET_ADV_DATA_NO_FRAG 0x01 +#define HCI_OP_LE_CLEAR_ADV_SETS 0x203d + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0a92bf7e3d80..a78d1dd2f57b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1129,6 +1129,8 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, conn->conn_timeout); + } else { + hci_dev_clear_flag(hdev, HCI_LE_ADV); } hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c41e9bb7818b..96e1e05a92c3 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -934,9 +934,19 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) void __hci_req_disable_advertising(struct hci_request *req) { - u8 enable = 0x00; + if (ext_adv_capable(req->hdev)) { + struct hci_cp_le_set_ext_adv_enable cp; - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + cp.enable = 0x00; + /* Disable all sets since we only support one set at the moment */ + 
cp.num_of_sets = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp); + } else { + u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + } } static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) @@ -1430,6 +1440,11 @@ unlock: hci_dev_unlock(hdev); } +void __hci_req_clear_ext_adv_sets(struct hci_request *req) +{ + hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); +} + int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; @@ -1499,8 +1514,12 @@ void __hci_req_enable_ext_advertising(struct hci_request *req) int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) { + struct hci_dev *hdev = req->hdev; int err; + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + __hci_req_disable_advertising(req); + err = __hci_req_setup_ext_adv_instance(req, instance); if (err < 0) return err; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 6afc624605af..2451861bb4f8 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -83,6 +83,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); void __hci_req_enable_ext_advertising(struct hci_request *req); +void __hci_req_clear_ext_adv_sets(struct hci_request *req); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 142f7e72a9a2..c283f0364c0f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1956,6 +1956,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) } else { if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_disable_advertising(&req); + + if (ext_adv_capable(hdev)) + __hci_req_clear_ext_adv_sets(&req); } hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp), -- 
cgit v1.2.3 From 075e40b79f6d0aa1479701d2dd6dea3b78478d60 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:44 +0530 Subject: Bluetooth: Use ext adv for directed adv This patch does extended advertising for directed advertising if the controller supports it. Instance 0 is used for directed advertising. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 67 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 64e828ad3951..5c37d383caa3 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -868,35 +868,58 @@ static void hci_req_directed_advertising(struct hci_request *req, struct hci_conn *conn) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_adv_param cp; u8 own_addr_type; u8 enable; - /* Clear the HCI_LE_ADV bit temporarily so that the - * hci_update_random_address knows that it's safe to go ahead - * and write a new random address. The flag will be set back on - * as soon as the SET_ADV_ENABLE HCI command completes. - */ - hci_dev_clear_flag(hdev, HCI_LE_ADV); + if (ext_adv_capable(hdev)) { + struct hci_cp_le_set_ext_adv_params cp; - /* Set require_privacy to false so that the remote device has a - * chance of identifying us. 
- */ - if (hci_update_random_address(req, false, conn_use_rpa(conn), - &own_addr_type) < 0) - return; + memset(&cp, 0, sizeof(cp)); - memset(&cp, 0, sizeof(cp)); - cp.type = LE_ADV_DIRECT_IND; - cp.own_address_type = own_addr_type; - cp.direct_addr_type = conn->dst_type; - bacpy(&cp.direct_addr, &conn->dst); - cp.channel_map = hdev->le_adv_channel_map; + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND); + cp.own_addr_type = own_addr_type; + cp.channel_map = hdev->le_adv_channel_map; + cp.tx_power = HCI_TX_POWER_INVALID; + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_1M; + cp.handle = 0; /* Use instance 0 for directed adv */ + cp.own_addr_type = own_addr_type; + cp.peer_addr_type = conn->dst_type; + bacpy(&cp.peer_addr, &conn->dst); + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); - hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); + __hci_req_enable_ext_advertising(req); + } else { + struct hci_cp_le_set_adv_param cp; + + /* Clear the HCI_LE_ADV bit temporarily so that the + * hci_update_random_address knows that it's safe to go ahead + * and write a new random address. The flag will be set back on + * as soon as the SET_ADV_ENABLE HCI command completes. + */ + hci_dev_clear_flag(hdev, HCI_LE_ADV); - enable = 0x01; - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + /* Set require_privacy to false so that the remote device has a + * chance of identifying us. 
+ */ + if (hci_update_random_address(req, false, conn_use_rpa(conn), + &own_addr_type) < 0) + return; + + memset(&cp, 0, sizeof(cp)); + cp.type = LE_ADV_DIRECT_IND; + cp.own_address_type = own_addr_type; + cp.direct_addr_type = conn->dst_type; + bacpy(&cp.direct_addr, &conn->dst); + cp.channel_map = hdev->le_adv_channel_map; + + hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); + + enable = 0x01; + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), + &enable); + } conn->state = BT_CONNECT; } -- cgit v1.2.3 From a73c046a2869048430c332a871a5b169f192c6c3 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:45 +0530 Subject: Bluetooth: Implement Set ADV set random address This basically sets the random address for the adv instance Random address can be set only if the instance is created which is done in Set ext adv param. Random address and rpa expire timer and flags have been added to adv instance which will be used when the respective instance is scheduled. This introduces a hci_get_random_address() which returns the own address type and random address (rpa or nrpa) based on the instance flags and hdev flags. New function is required since own address type should be known before setting adv params but address can be set only after setting params. 
< HCI Command: LE Set Advertising Set Random Address (0x08|0x0035) plen 7 Advertising handle: 0x00 Advertising random address: 3C:8E:56:9B:77:84 (OUI 3C-8E-56) > HCI Event: Command Complete (0x0e) plen 4 LE Set Advertising Set Random Address (0x08|0x0035) ncmd 1 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 ++ include/net/bluetooth/hci_core.h | 4 ++ net/bluetooth/hci_conn.c | 23 +++++++ net/bluetooth/hci_core.c | 33 +++++++++- net/bluetooth/hci_event.c | 37 ++++++++++- net/bluetooth/hci_request.c | 128 ++++++++++++++++++++++++++++++++++++++- net/bluetooth/hci_request.h | 3 + net/bluetooth/mgmt.c | 2 + 8 files changed, 233 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index faa2922a69fd..8d348d0d3eea 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1649,6 +1649,12 @@ struct hci_cp_le_set_ext_scan_rsp_data { #define HCI_OP_LE_CLEAR_ADV_SETS 0x203d +#define HCI_OP_LE_SET_ADV_SET_RAND_ADDR 0x2035 +struct hci_cp_le_set_adv_set_rand_addr { + __u8 handle; + bdaddr_t bdaddr; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ad3518303a0c..0db1b9b428b7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -172,6 +172,9 @@ struct adv_info { __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; __s8 tx_power; + bdaddr_t random_addr; + bool rpa_expired; + struct delayed_work rpa_expired_cb; }; #define HCI_MAX_ADV_INSTANCES 5 @@ -1113,6 +1116,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration); int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); +void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); void 
hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5c37d383caa3..bd4978ce8c45 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -873,6 +873,14 @@ static void hci_req_directed_advertising(struct hci_request *req, if (ext_adv_capable(hdev)) { struct hci_cp_le_set_ext_adv_params cp; + bdaddr_t random_addr; + + /* Set require_privacy to false so that the remote device has a + * chance of identifying us. + */ + if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL, + &own_addr_type, &random_addr) < 0) + return; memset(&cp, 0, sizeof(cp)); @@ -889,6 +897,21 @@ static void hci_req_directed_advertising(struct hci_request *req, hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); + if (own_addr_type == ADDR_LE_DEV_RANDOM && + bacmp(&random_addr, BDADDR_ANY) && + bacmp(&random_addr, &hdev->random_addr)) { + struct hci_cp_le_set_adv_set_rand_addr cp; + + memset(&cp, 0, sizeof(cp)); + + cp.handle = 0; + bacpy(&cp.bdaddr, &random_addr); + + hci_req_add(req, + HCI_OP_LE_SET_ADV_SET_RAND_ADDR, + sizeof(cp), &cp); + } + __hci_req_enable_ext_advertising(req); } else { struct hci_cp_le_set_adv_param cp; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 944d4fedc317..840e8fd89fa5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1471,6 +1471,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (!ret) { hci_dev_hold(hdev); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, true); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); hci_leds_update_powered(hdev, true); @@ -1626,9 +1627,15 @@ int hci_dev_do_close(struct hci_dev *hdev) if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); - if (hci_dev_test_flag(hdev, HCI_MGMT)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) { + struct adv_info *adv_instance; + 
cancel_delayed_work_sync(&hdev->rpa_expired); + list_for_each_entry(adv_instance, &hdev->adv_instances, list) + cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); + } + /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ @@ -2704,6 +2711,8 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) hdev->cur_adv_instance = 0x00; } + cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); + list_del(&adv_instance->list); kfree(adv_instance); @@ -2712,6 +2721,14 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) return 0; } +void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) +{ + struct adv_info *adv_instance, *n; + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) + adv_instance->rpa_expired = rpa_expired; +} + /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { @@ -2723,6 +2740,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev) } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } @@ -2731,6 +2749,16 @@ void hci_adv_instances_clear(struct hci_dev *hdev) hdev->cur_adv_instance = 0x00; } +static void adv_instance_rpa_expired(struct work_struct *work) +{ + struct adv_info *adv_instance = container_of(work, struct adv_info, + rpa_expired_cb.work); + + BT_DBG(""); + + adv_instance->rpa_expired = true; +} + /* This function requires the caller holds hdev->lock */ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, @@ -2781,6 +2809,9 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, adv_instance->tx_power = HCI_TX_POWER_INVALID; + INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb, + adv_instance_rpa_expired); + BT_DBG("%s for %dMR", hdev->name, instance); return 0; diff 
--git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a78d1dd2f57b..392c9d8febd0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1064,6 +1064,35 @@ static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + struct hci_cp_le_set_adv_set_rand_addr *cp; + struct adv_info *adv_instance; + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR); + if (!cp) + return; + + hci_dev_lock(hdev); + + if (!hdev->cur_adv_instance) { + /* Store in hdev for instance 0 (Set adv and Directed advs) */ + bacpy(&hdev->random_addr, &cp->bdaddr); + } else { + adv_instance = hci_find_adv_instance(hdev, + hdev->cur_adv_instance); + if (adv_instance) + bacpy(&adv_instance->random_addr, &cp->bdaddr); + } + + hci_dev_unlock(hdev); +} + static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { __u8 *sent, status = *((__u8 *) skb->data); @@ -2830,8 +2859,10 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) /* We should disregard the current RPA and generate a new one * whenever the encryption procedure fails. 
*/ - if (ev->status && conn->type == LE_LINK) + if (ev->status && conn->type == LE_LINK) { hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, true); + } clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); @@ -3283,6 +3314,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_ext_adv_enable(hdev, skb); break; + case HCI_OP_LE_SET_ADV_SET_RAND_ADDR: + hci_cc_le_set_adv_set_random_addr(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 96e1e05a92c3..c72fd9202666 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1440,6 +1440,87 @@ unlock: hci_dev_unlock(hdev); } +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr) +{ + int err; + + bacpy(rand_addr, BDADDR_ANY); + + /* If privacy is enabled use a resolvable private address. If + * current RPA has expired then generate a new one. 
+ */ + if (use_rpa) { + int to; + + *own_addr_type = ADDR_LE_DEV_RANDOM; + + if (adv_instance) { + if (!adv_instance->rpa_expired && + !bacmp(&adv_instance->random_addr, &hdev->rpa)) + return 0; + + adv_instance->rpa_expired = false; + } else { + if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && + !bacmp(&hdev->random_addr, &hdev->rpa)) + return 0; + } + + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); + if (err < 0) { + BT_ERR("%s failed to generate new RPA", hdev->name); + return err; + } + + bacpy(rand_addr, &hdev->rpa); + + to = msecs_to_jiffies(hdev->rpa_timeout * 1000); + if (adv_instance) + queue_delayed_work(hdev->workqueue, + &adv_instance->rpa_expired_cb, to); + else + queue_delayed_work(hdev->workqueue, + &hdev->rpa_expired, to); + + return 0; + } + + /* In case of required privacy without resolvable private address, + * use an non-resolvable private address. This is useful for + * non-connectable advertising. + */ + if (require_privacy) { + bdaddr_t nrpa; + + while (true) { + /* The non-resolvable private address is generated + * from random six bytes with the two most significant + * bits cleared. + */ + get_random_bytes(&nrpa, 6); + nrpa.b[5] &= 0x3f; + + /* The non-resolvable private address shall not be + * equal to the public address. + */ + if (bacmp(&hdev->bdaddr, &nrpa)) + break; + } + + *own_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(rand_addr, &nrpa); + + return 0; + } + + /* No privacy so use a public address. 
*/ + *own_addr_type = ADDR_LE_DEV_PUBLIC; + + return 0; +} + void __hci_req_clear_ext_adv_sets(struct hci_request *req) { hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); @@ -1451,9 +1532,21 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) struct hci_dev *hdev = req->hdev; bool connectable; u32 flags; + bdaddr_t random_addr; + u8 own_addr_type; + int err; + struct adv_info *adv_instance; /* In ext adv set param interval is 3 octets */ const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; + if (instance > 0) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -EINVAL; + } else { + adv_instance = NULL; + } + flags = get_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between @@ -1465,6 +1558,16 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) if (!is_advertising_allowed(hdev, connectable)) return -EPERM; + /* Set require_privacy to true only when non-connectable + * advertising is used. In that case it is fine to use a + * non-resolvable private address. 
+ */ + err = hci_get_random_address(hdev, !connectable, + adv_use_rpa(hdev, flags), adv_instance, + &own_addr_type, &random_addr); + if (err < 0) + return err; + memset(&cp, 0, sizeof(cp)); memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); @@ -1477,7 +1580,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) else cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); - cp.own_addr_type = BDADDR_LE_PUBLIC; + cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = 127; cp.primary_phy = HCI_ADV_PHY_1M; @@ -1486,6 +1589,29 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); + if (own_addr_type == ADDR_LE_DEV_RANDOM && + bacmp(&random_addr, BDADDR_ANY)) { + struct hci_cp_le_set_adv_set_rand_addr cp; + + /* Check if random address need to be updated */ + if (adv_instance) { + if (!bacmp(&random_addr, &adv_instance->random_addr)) + return 0; + } else { + if (!bacmp(&random_addr, &hdev->random_addr)) + return 0; + } + + memset(&cp, 0, sizeof(cp)); + + cp.handle = 0; + bacpy(&cp.bdaddr, &random_addr); + + hci_req_add(req, + HCI_OP_LE_SET_ADV_SET_RAND_ADDR, + sizeof(cp), &cp); + } + return 0; } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 2451861bb4f8..692cc8b13368 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -84,6 +84,9 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); void __hci_req_enable_ext_advertising(struct hci_request *req); void __hci_req_clear_ext_adv_sets(struct hci_request *req); +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c 
b/net/bluetooth/mgmt.c index c283f0364c0f..949986727019 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4972,6 +4972,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY); memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, true); if (cp->privacy == 0x02) hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY); else @@ -4980,6 +4981,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY); memset(hdev->irk, 0, sizeof(hdev->irk)); hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, false); hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); } -- cgit v1.2.3 From acf0aeae431a0f1723385cd1cb50177e4cc10edd Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:46 +0530 Subject: Bluetooth: Handle ADv set terminated event This event comes after connection complete event for incoming connections. Since we now have different random address for each instance, conn resp address is assigned from this event. As of now only connection part is handled as we are not enabling duration or max num of events while starting ext adv. 
Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ net/bluetooth/hci_core.c | 8 ++++++++ net/bluetooth/hci_event.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 56 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8d348d0d3eea..57e3e3675d66 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2155,6 +2155,14 @@ struct hci_ev_le_enh_conn_complete { __u8 clk_accurancy; } __packed; +#define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 +struct hci_evt_le_ext_adv_set_term { + __u8 status; + __u8 handle; + __le16 conn_handle; + __u8 num_evts; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 840e8fd89fa5..79e02d24a215 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -712,6 +712,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Complete */ + /* If the controller supports the LE Extended Advertising + * command, enable the corresponding event. + */ + if (ext_adv_capable(hdev)) + events[2] |= 0x02; /* LE Advertising Set + * Terminated + */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 392c9d8febd0..754714c8d752 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4798,10 +4798,15 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, * the advertising address type. */ conn->resp_addr_type = hdev->adv_addr_type; - if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) - bacpy(&conn->resp_addr, &hdev->random_addr); - else + if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { + /* In case of ext adv, resp_addr will be updated in + * Adv Terminated event. 
+ */ + if (!ext_adv_capable(hdev)) + bacpy(&conn->resp_addr, &hdev->random_addr); + } else { bacpy(&conn->resp_addr, &hdev->bdaddr); + } conn->init_addr_type = bdaddr_type; bacpy(&conn->init_addr, bdaddr); @@ -4931,6 +4936,34 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + if (ev->status) + return; + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle)); + if (conn) { + struct adv_info *adv_instance; + + if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM) + return; + + if (!hdev->cur_adv_instance) { + bacpy(&conn->resp_addr, &hdev->random_addr); + return; + } + + adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + if (adv_instance) + bacpy(&conn->resp_addr, &adv_instance->random_addr); + } +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5578,6 +5611,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_enh_conn_complete_evt(hdev, skb); break; + case HCI_EV_LE_EXT_ADV_SET_TERM: + hci_le_ext_adv_term_evt(hdev, skb); + break; + default: break; } -- cgit v1.2.3 From 85a721a8b0b6880d8cf6b9def70404ade8563225 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:47 +0530 Subject: Bluetooth: Implement secondary advertising on different PHYs This patch adds support for advertising in primary and secondary channel on different PHYs. The user can specify the PHY preference in the flags, based on which the PHY type in the extended advertising parameters will be set. 
@ MGMT Command: Add Advertising (0x003e) plen 11 Instance: 1 Flags: 0x00000200 Advertise in CODED on Secondary channel Duration: 0 Timeout: 0 Advertising data length: 0 Scan response length: 0 < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 2 Extended advertising: Disabled (0x00) Number of sets: Disable all sets (0x00) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) < HCI Command: LE Set Extended Advertising Parameters (0x08|0x0036) plen 25 Handle: 0x00 Properties: 0x0000 Min advertising interval: 1280.000 msec (0x0800) Max advertising interval: 1280.000 msec (0x0800) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: 127 dbm (0x7f) Primary PHY: LE Coded (0x03) Secondary max skip: 0x00 Secondary PHY: LE Coded (0x03) SID: 0x00 Scan request notifications: Disabled (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 ++++ include/net/bluetooth/mgmt.h | 6 ++++++ net/bluetooth/hci_request.c | 39 +++++++++++++++++++++++++++++++-------- net/bluetooth/mgmt.c | 18 +++++++++++++++--- 4 files changed, 56 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 57e3e3675d66..8ff36463719f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -410,6 +410,8 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_PHY_2M 0x01 +#define HCI_LE_PHY_CODED 0x08 #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 @@ -1606,6 +1608,8 @@ struct hci_cp_le_set_ext_adv_params { } __packed; #define HCI_ADV_PHY_1M 0X01 +#define HCI_ADV_PHY_2M 0x02 +#define 
HCI_ADV_PHY_CODED 0x03 struct hci_rp_le_set_ext_adv_params { __u8 status; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7f372e9067c9..9cee7ddc6741 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -562,6 +562,12 @@ struct mgmt_rp_add_advertising { #define MGMT_ADV_FLAG_TX_POWER BIT(4) #define MGMT_ADV_FLAG_APPEARANCE BIT(5) #define MGMT_ADV_FLAG_LOCAL_NAME BIT(6) +#define MGMT_ADV_FLAG_SEC_1M BIT(7) +#define MGMT_ADV_FLAG_SEC_2M BIT(8) +#define MGMT_ADV_FLAG_SEC_CODED BIT(9) + +#define MGMT_ADV_FLAG_SEC_MASK (MGMT_ADV_FLAG_SEC_1M | MGMT_ADV_FLAG_SEC_2M | \ + MGMT_ADV_FLAG_SEC_CODED) #define MGMT_OP_REMOVE_ADVERTISING 0x003F struct mgmt_cp_remove_advertising { diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c72fd9202666..e8c9ef1e1922 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1536,6 +1536,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) u8 own_addr_type; int err; struct adv_info *adv_instance; + bool secondary_adv; /* In ext adv set param interval is 3 octets */ const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; @@ -1573,20 +1574,42 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval)); - if (connectable) - cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - else if (get_adv_instance_scan_rsp_len(hdev, instance)) - cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); - else - cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); + secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); + + if (connectable) { + if (secondary_adv) + cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); + } else if (get_adv_instance_scan_rsp_len(hdev, instance)) { + if (secondary_adv) + cp.evt_properties = 
cpu_to_le16(LE_EXT_ADV_SCAN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); + } else { + if (secondary_adv) + cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); + } cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = 127; - cp.primary_phy = HCI_ADV_PHY_1M; - cp.secondary_phy = HCI_ADV_PHY_1M; cp.handle = 0; + if (flags & MGMT_ADV_FLAG_SEC_2M) { + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_2M; + } else if (flags & MGMT_ADV_FLAG_SEC_CODED) { + cp.primary_phy = HCI_ADV_PHY_CODED; + cp.secondary_phy = HCI_ADV_PHY_CODED; + } else { + /* In all other cases use 1M */ + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_1M; + } + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); if (own_addr_type == ADDR_LE_DEV_RANDOM && diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 949986727019..231602f7cb66 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6339,6 +6339,16 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) ext_adv_capable(hdev)) flags |= MGMT_ADV_FLAG_TX_POWER; + if (ext_adv_capable(hdev)) { + flags |= MGMT_ADV_FLAG_SEC_1M; + + if (hdev->le_features[1] & HCI_LE_PHY_2M) + flags |= MGMT_ADV_FLAG_SEC_2M; + + if (hdev->le_features[1] & HCI_LE_PHY_CODED) + flags |= MGMT_ADV_FLAG_SEC_CODED; + } + return flags; } @@ -6544,7 +6554,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_add_advertising *cp = data; struct mgmt_rp_add_advertising rp; u32 flags; - u32 supported_flags; + u32 supported_flags, phy_flags; u8 status; u16 timeout, duration; unsigned int prev_instance_cnt = hdev->adv_instance_cnt; @@ -6574,10 +6584,12 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, duration = __le16_to_cpu(cp->duration); /* The current implementation only supports a subset of the specified - * flags. + * flags. 
Also need to check mutual exclusiveness of sec flags. */ supported_flags = get_supported_adv_flags(hdev); - if (flags & ~supported_flags) + phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK; + if (flags & ~supported_flags || + ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags))))) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); -- cgit v1.2.3 From 740011cfe94859df8d05f5400d589a8693b095e7 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Fri, 20 Jul 2018 13:12:28 +0800 Subject: Bluetooth: Add new quirk for non-persistent setup settings Add a new quirk HCI_QUIRK_NON_PERSISTENT_SETUP that runs setup() after every open() and not just after the first open(). Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 9 +++++++++ net/bluetooth/hci_core.c | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8ff36463719f..7f008097552e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -183,6 +183,15 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_NON_PERSISTENT_DIAG, + + /* When this quirk is set, setup() would be run after every + * open() and not just after the first open(). + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. 
+ * + */ + HCI_QUIRK_NON_PERSISTENT_SETUP, }; /* HCI device flags */ diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 79e02d24a215..74b29c7d841c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1415,7 +1415,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); - if (hci_dev_test_flag(hdev, HCI_SETUP)) { + if (hci_dev_test_flag(hdev, HCI_SETUP) || + test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) { hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) -- cgit v1.2.3 From 51c23b47e6b8590ea7a6a6776ffb21810ece73bf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 13 Jul 2018 14:54:45 +0200 Subject: netfilter: nf_osf: add nf_osf_find() This new function returns the OS genre as a string. Plan is to use it from the new nft_osf extension. Note that this doesn't yet support ttl options, but it could be easily extended to do so. Tested-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 9 +++++++++ net/netfilter/nf_osf.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'net') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h index 0e114c492fb8..aee460fcbd31 100644 --- a/include/linux/netfilter/nf_osf.h +++ b/include/linux/netfilter/nf_osf.h @@ -1,3 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFOSF_H +#define _NFOSF_H + #include /* Initial window size option state machine: multiple of mss, mtu or @@ -31,3 +35,8 @@ bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers); + +#endif /* _NFOSF_H */ diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c index 
b44d62d5d9a9..f4c75e982902 100644 --- a/net/netfilter/nf_osf.c +++ b/net/netfilter/nf_osf.c @@ -249,4 +249,34 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, } EXPORT_SYMBOL_GPL(nf_osf_match); +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers) +{ + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + struct nf_osf_hdr_ctx ctx; + const struct tcphdr *tcp; + const char *genre = NULL; + + memset(&ctx, 0, sizeof(ctx)); + + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + if (!tcp) + return false; + + list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { + f = &kf->finger; + if (!nf_osf_match_one(skb, f, -1, &ctx)) + continue; + + genre = f->genre; + break; + } + + return genre; +} +EXPORT_SYMBOL_GPL(nf_osf_find); + MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 33b78aaa4457ce5d531c6a06f461f8d402774cad Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 19 Jul 2018 21:20:09 +0800 Subject: netfilter: use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: net/netfilter/xt_connlimit.c:96:1-3: WARNING: PTR_ERR_OR_ZERO can be used net/netfilter/nft_numgen.c:240:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: YueHaibing Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 4 +--- net/netfilter/xt_connlimit.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 1f4d0854cf70..649d1700ec5b 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -237,10 +237,8 @@ static int nft_ng_random_map_init(const struct nft_ctx *ctx, priv->map = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_NG_SET_NAME], tb[NFTA_NG_SET_ID], genmask); - if (IS_ERR(priv->map)) - return 
PTR_ERR(priv->map); - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 6275106ccf50..bc6c8ab0fa62 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -93,10 +93,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) /* init private data */ info->data = nf_conncount_init(par->net, par->family, keylen); - if (IS_ERR(info->data)) - return PTR_ERR(info->data); - return 0; + return PTR_ERR_OR_ZERO(info->data); } static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) -- cgit v1.2.3 From f6b7b5f4f3bcd7e1897c16dd65a10cbcc159cbde Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 25 Jul 2018 01:32:44 +0200 Subject: netfilter: nf_osf: rename nf_osf.c to nfnetlink_osf.c Rename nf_osf.c to nfnetlink_osf.c as we introduce nfnetlink_osf which is the OSF infrastructure. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 15 ++- net/netfilter/Makefile | 2 +- net/netfilter/nf_osf.c | 282 ------------------------------------------ net/netfilter/nfnetlink_osf.c | 282 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 293 insertions(+), 288 deletions(-) delete mode 100644 net/netfilter/nf_osf.c create mode 100644 net/netfilter/nfnetlink_osf.c (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 6f6c959aeb8f..85333431e524 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -46,6 +46,14 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. 
+config NETFILTER_NETLINK_OSF + tristate "Netfilter OSF over NFNETLINK interface" + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for passive OS fingerprint via NFNETLINK. 
+ config NF_CONNTRACK tristate "Netfilter connection tracking support" default m if NETFILTER_ADVANCED=n @@ -442,9 +450,6 @@ config NETFILTER_SYNPROXY endif # NF_CONNTRACK -config NF_OSF - tristate - config NF_TABLES select NETFILTER_NETLINK tristate "Netfilter nf_tables support" @@ -1368,8 +1373,8 @@ config NETFILTER_XT_MATCH_NFACCT config NETFILTER_XT_MATCH_OSF tristate '"osf" Passive OS fingerprint match' - depends on NETFILTER_ADVANCED && NETFILTER_NETLINK - select NF_OSF + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_OSF help This option selects the Passive OS Fingerprinting match module that allows to passively match the remote operating system by diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index dd26e4961f43..e684f9b8a9c3 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o +obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o @@ -108,7 +109,6 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o -obj-$(CONFIG_NF_OSF) += nf_osf.o obj-$(CONFIG_NFT_SOCKET) += nft_socket.o # nf_tables netdev diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c deleted file mode 100644 index f4c75e982902..000000000000 --- a/net/netfilter/nf_osf.c +++ /dev/null @@ -1,282 +0,0 @@ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -static inline int nf_osf_ttl(const struct sk_buff *skb, - int ttl_check, unsigned char f_ttl) -{ - const struct iphdr *ip = 
ip_hdr(skb); - - if (ttl_check != -1) { - if (ttl_check == NF_OSF_TTL_TRUE) - return ip->ttl == f_ttl; - if (ttl_check == NF_OSF_TTL_NOCHECK) - return 1; - else if (ip->ttl <= f_ttl) - return 1; - else { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); - int ret = 0; - - for_ifa(in_dev) { - if (inet_ifa_match(ip->saddr, ifa)) { - ret = (ip->ttl == f_ttl); - break; - } - } - endfor_ifa(in_dev); - - return ret; - } - } - - return ip->ttl == f_ttl; -} - -struct nf_osf_hdr_ctx { - bool df; - u16 window; - u16 totlen; - const unsigned char *optp; - unsigned int optsize; -}; - -static bool nf_osf_match_one(const struct sk_buff *skb, - const struct nf_osf_user_finger *f, - int ttl_check, - struct nf_osf_hdr_ctx *ctx) -{ - unsigned int check_WSS = 0; - int fmatch = FMATCH_WRONG; - int foptsize, optnum; - u16 mss = 0; - - if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) - return false; - - /* - * Should not happen if userspace parser was written correctly. - */ - if (f->wss.wc >= OSF_WSS_MAX) - return false; - - /* Check options */ - - foptsize = 0; - for (optnum = 0; optnum < f->opt_num; ++optnum) - foptsize += f->opt[optnum].length; - - if (foptsize > MAX_IPOPTLEN || - ctx->optsize > MAX_IPOPTLEN || - ctx->optsize != foptsize) - return false; - - check_WSS = f->wss.wc; - - for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == *ctx->optp) { - __u32 len = f->opt[optnum].length; - const __u8 *optend = ctx->optp + len; - - fmatch = FMATCH_OK; - - switch (*ctx->optp) { - case OSFOPT_MSS: - mss = ctx->optp[3]; - mss <<= 8; - mss |= ctx->optp[2]; - - mss = ntohs((__force __be16)mss); - break; - case OSFOPT_TS: - break; - } - - ctx->optp = optend; - } else - fmatch = FMATCH_OPT_WRONG; - - if (fmatch != FMATCH_OK) - break; - } - - if (fmatch != FMATCH_OPT_WRONG) { - fmatch = FMATCH_WRONG; - - switch (check_WSS) { - case OSF_WSS_PLAIN: - if (f->wss.val == 0 || ctx->window == f->wss.val) - fmatch = FMATCH_OK; - break; - case 
OSF_WSS_MSS: - /* - * Some smart modems decrease mangle MSS to - * SMART_MSS_2, so we check standard, decreased - * and the one provided in the fingerprint MSS - * values. - */ -#define SMART_MSS_1 1460 -#define SMART_MSS_2 1448 - if (ctx->window == f->wss.val * mss || - ctx->window == f->wss.val * SMART_MSS_1 || - ctx->window == f->wss.val * SMART_MSS_2) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MTU: - if (ctx->window == f->wss.val * (mss + 40) || - ctx->window == f->wss.val * (SMART_MSS_1 + 40) || - ctx->window == f->wss.val * (SMART_MSS_2 + 40)) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MODULO: - if ((ctx->window % f->wss.val) == 0) - fmatch = FMATCH_OK; - break; - } - } - - return fmatch == FMATCH_OK; -} - -static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, - const struct sk_buff *skb, - const struct iphdr *ip, - unsigned char *opts) -{ - const struct tcphdr *tcp; - struct tcphdr _tcph; - - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); - if (!tcp) - return NULL; - - if (!tcp->syn) - return NULL; - - ctx->totlen = ntohs(ip->tot_len); - ctx->df = ntohs(ip->frag_off) & IP_DF; - ctx->window = ntohs(tcp->window); - - if (tcp->doff * 4 > sizeof(struct tcphdr)) { - ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); - - ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + - sizeof(struct tcphdr), ctx->optsize, opts); - } - - return tcp; -} - -bool -nf_osf_match(const struct sk_buff *skb, u_int8_t family, - int hooknum, struct net_device *in, struct net_device *out, - const struct nf_osf_info *info, struct net *net, - const struct list_head *nf_osf_fingers) -{ - const struct iphdr *ip = ip_hdr(skb); - const struct nf_osf_user_finger *f; - unsigned char opts[MAX_IPOPTLEN]; - const struct nf_osf_finger *kf; - int fcount = 0, ttl_check; - int fmatch = FMATCH_WRONG; - struct nf_osf_hdr_ctx ctx; - const struct tcphdr *tcp; - - memset(&ctx, 0, sizeof(ctx)); - - tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); 
- if (!tcp) - return false; - - ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; - - list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { - - f = &kf->finger; - - if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) - continue; - - if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) - continue; - - fmatch = FMATCH_OK; - - fcount++; - - if (info->flags & NF_OSF_LOG) - nf_log_packet(net, family, hooknum, skb, - in, out, NULL, - "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", - f->genre, f->version, f->subtype, - &ip->saddr, ntohs(tcp->source), - &ip->daddr, ntohs(tcp->dest), - f->ttl - ip->ttl); - - if ((info->flags & NF_OSF_LOG) && - info->loglevel == NF_OSF_LOGLEVEL_FIRST) - break; - } - - if (!fcount && (info->flags & NF_OSF_LOG)) - nf_log_packet(net, family, hooknum, skb, in, out, NULL, - "Remote OS is not known: %pI4:%u -> %pI4:%u\n", - &ip->saddr, ntohs(tcp->source), - &ip->daddr, ntohs(tcp->dest)); - - if (fcount) - fmatch = FMATCH_OK; - - return fmatch == FMATCH_OK; -} -EXPORT_SYMBOL_GPL(nf_osf_match); - -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers) -{ - const struct iphdr *ip = ip_hdr(skb); - const struct nf_osf_user_finger *f; - unsigned char opts[MAX_IPOPTLEN]; - const struct nf_osf_finger *kf; - struct nf_osf_hdr_ctx ctx; - const struct tcphdr *tcp; - const char *genre = NULL; - - memset(&ctx, 0, sizeof(ctx)); - - tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); - if (!tcp) - return false; - - list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { - f = &kf->finger; - if (!nf_osf_match_one(skb, f, -1, &ctx)) - continue; - - genre = f->genre; - break; - } - - return genre; -} -EXPORT_SYMBOL_GPL(nf_osf_find); - -MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c new file mode 100644 index 000000000000..f4c75e982902 --- /dev/null +++ b/net/netfilter/nfnetlink_osf.c @@ -0,0 +1,282 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " 
fmt +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static inline int nf_osf_ttl(const struct sk_buff *skb, + int ttl_check, unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (ttl_check != -1) { + if (ttl_check == NF_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (ttl_check == NF_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +struct nf_osf_hdr_ctx { + bool df; + u16 window; + u16 totlen; + const unsigned char *optp; + unsigned int optsize; +}; + +static bool nf_osf_match_one(const struct sk_buff *skb, + const struct nf_osf_user_finger *f, + int ttl_check, + struct nf_osf_hdr_ctx *ctx) +{ + unsigned int check_WSS = 0; + int fmatch = FMATCH_WRONG; + int foptsize, optnum; + u16 mss = 0; + + if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) + return false; + + /* + * Should not happen if userspace parser was written correctly. 
+ */ + if (f->wss.wc >= OSF_WSS_MAX) + return false; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + ctx->optsize > MAX_IPOPTLEN || + ctx->optsize != foptsize) + return false; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == *ctx->optp) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = ctx->optp + len; + + fmatch = FMATCH_OK; + + switch (*ctx->optp) { + case OSFOPT_MSS: + mss = ctx->optp[3]; + mss <<= 8; + mss |= ctx->optp[2]; + + mss = ntohs((__force __be16)mss); + break; + case OSFOPT_TS: + break; + } + + ctx->optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || ctx->window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. 
+ */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (ctx->window == f->wss.val * mss || + ctx->window == f->wss.val * SMART_MSS_1 || + ctx->window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (ctx->window == f->wss.val * (mss + 40) || + ctx->window == f->wss.val * (SMART_MSS_1 + 40) || + ctx->window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((ctx->window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + return fmatch == FMATCH_OK; +} + +static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, + const struct sk_buff *skb, + const struct iphdr *ip, + unsigned char *opts) +{ + const struct tcphdr *tcp; + struct tcphdr _tcph; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return NULL; + + if (!tcp->syn) + return NULL; + + ctx->totlen = ntohs(ip->tot_len); + ctx->df = ntohs(ip->frag_off) & IP_DF; + ctx->window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), ctx->optsize, opts); + } + + return tcp; +} + +bool +nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers) +{ + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + int fcount = 0, ttl_check; + int fmatch = FMATCH_WRONG; + struct nf_osf_hdr_ctx ctx; + const struct tcphdr *tcp; + + memset(&ctx, 0, sizeof(ctx)); + + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + if (!tcp) + return false; + + ttl_check = (info->flags & NF_OSF_TTL) ? 
info->ttl : -1; + + list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { + + f = &kf->finger; + + if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) + continue; + + fmatch = FMATCH_OK; + + fcount++; + + if (info->flags & NF_OSF_LOG) + nf_log_packet(net, family, hooknum, skb, + in, out, NULL, + "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & NF_OSF_LOG) && + info->loglevel == NF_OSF_LOGLEVEL_FIRST) + break; + } + + if (!fcount && (info->flags & NF_OSF_LOG)) + nf_log_packet(net, family, hooknum, skb, in, out, NULL, + "Remote OS is not known: %pI4:%u -> %pI4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} +EXPORT_SYMBOL_GPL(nf_osf_match); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers) +{ + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + struct nf_osf_hdr_ctx ctx; + const struct tcphdr *tcp; + const char *genre = NULL; + + memset(&ctx, 0, sizeof(ctx)); + + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + if (!tcp) + return false; + + list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { + f = &kf->finger; + if (!nf_osf_match_one(skb, f, -1, &ctx)) + continue; + + genre = f->genre; + break; + } + + return genre; +} +EXPORT_SYMBOL_GPL(nf_osf_find); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From f9324952088f1cd62ea4addf9ff532f1e6452a22 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 25 Jul 2018 01:32:45 +0200 Subject: netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c Move nfnetlink osf subsystem from xt_osf.c to standalone module so we 
can reuse it from the new nft_ost extension. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_osf.h | 11 +++ include/uapi/linux/netfilter/xt_osf.h | 9 +- net/netfilter/nfnetlink_osf.c | 154 ++++++++++++++++++++++++++++++++++ net/netfilter/xt_osf.c | 149 +------------------------------- 4 files changed, 169 insertions(+), 154 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h index 3738116b2bbe..cc2487ff74f6 100644 --- a/include/uapi/linux/netfilter/nf_osf.h +++ b/include/uapi/linux/netfilter/nf_osf.h @@ -70,6 +70,8 @@ struct nf_osf_nlmsg { struct tcphdr tcp; }; +extern struct list_head nf_osf_fingers[2]; + /* Defines for IANA option kinds */ enum iana_options { OSFOPT_EOL = 0, /* End of options */ @@ -94,4 +96,13 @@ enum nf_osf_attr_type { OSF_ATTR_MAX, }; +/* + * Add/remove fingerprint from the kernel. + */ +enum nf_osf_msg_types { + OSF_MSG_ADD, + OSF_MSG_REMOVE, + OSF_MSG_MAX, +}; + #endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index b189007f4f28..a90e90c27cef 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -47,13 +47,6 @@ #define xt_osf_nlmsg nf_osf_nlmsg #define xt_osf_attr_type nf_osf_attr_type -/* - * Add/remove fingerprint from the kernel. - */ -enum xt_osf_msg_types { - OSF_MSG_ADD, - OSF_MSG_REMOVE, - OSF_MSG_MAX, -}; +#define xt_osf_msg_types nf_osf_msg_types #endif /* _XT_OSF_H */ diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f4c75e982902..ba0fa11869ce 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -20,6 +20,13 @@ #include #include +/* + * Indexed by dont-fragment bit. + * It is the only constant value in the fingerprint. 
+ */ +struct list_head nf_osf_fingers[2]; +EXPORT_SYMBOL_GPL(nf_osf_fingers); + static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { @@ -279,4 +286,151 @@ const char *nf_osf_find(const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_osf_find); +static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { + [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, +}; + +static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[], + struct netlink_ext_ack *extack) +{ + struct nf_osf_user_finger *f; + struct nf_osf_finger *kf = NULL, *sf; + int err = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); + if (!kf) + return -ENOMEM; + + memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); + + list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) + continue; + + kfree(kf); + kf = NULL; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + err = -EEXIST; + break; + } + + /* + * We are protected by nfnl mutex. 
+ */ + if (kf) + list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); + + return err; +} + +static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[], + struct netlink_ext_ack *extack) +{ + struct nf_osf_user_finger *f; + struct nf_osf_finger *sf; + int err = -ENOENT; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) + continue; + + /* + * We are protected by nfnl mutex. + */ + list_del_rcu(&sf->finger_entry); + kfree_rcu(sf, rcu_head); + + err = 0; + break; + } + + return err; +} + +static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { + [OSF_MSG_ADD] = { + .call = nfnl_osf_add_callback, + .attr_count = OSF_ATTR_MAX, + .policy = nfnl_osf_policy, + }, + [OSF_MSG_REMOVE] = { + .call = nfnl_osf_remove_callback, + .attr_count = OSF_ATTR_MAX, + .policy = nfnl_osf_policy, + }, +}; + +static const struct nfnetlink_subsystem nfnl_osf_subsys = { + .name = "osf", + .subsys_id = NFNL_SUBSYS_OSF, + .cb_count = OSF_MSG_MAX, + .cb = nfnl_osf_callbacks, +}; + +static int __init nfnl_osf_init(void) +{ + int err = -EINVAL; + int i; + + for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) + INIT_LIST_HEAD(&nf_osf_fingers[i]); + + err = nfnetlink_subsys_register(&nfnl_osf_subsys); + if (err < 0) { + pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); + goto err_out_exit; + } + return 0; + +err_out_exit: + return err; +} + +static void __exit nfnl_osf_fini(void) +{ + struct nf_osf_finger *f; + int i; + + nfnetlink_subsys_unregister(&nfnl_osf_subsys); + + rcu_read_lock(); + for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { + list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { + 
list_del_rcu(&f->finger_entry); + kfree_rcu(f, rcu_head); + } + } + rcu_read_unlock(); + + rcu_barrier(); +} + +module_init(nfnl_osf_init); +module_exit(nfnl_osf_fini); + MODULE_LICENSE("GPL"); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 9cfef73b4107..bf7bba80e24c 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -37,118 +37,6 @@ #include #include -/* - * Indexed by dont-fragment bit. - * It is the only constant value in the fingerprint. - */ -static struct list_head xt_osf_fingers[2]; - -static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { - [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, -}; - -static int xt_osf_add_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) -{ - struct xt_osf_user_finger *f; - struct xt_osf_finger *kf = NULL, *sf; - int err = 0; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!osf_attrs[OSF_ATTR_FINGER]) - return -EINVAL; - - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) - return -EINVAL; - - f = nla_data(osf_attrs[OSF_ATTR_FINGER]); - - kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL); - if (!kf) - return -ENOMEM; - - memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger)); - - list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { - if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) - continue; - - kfree(kf); - kf = NULL; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - err = -EEXIST; - break; - } - - /* - * We are protected by nfnl mutex. 
- */ - if (kf) - list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]); - - return err; -} - -static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) -{ - struct xt_osf_user_finger *f; - struct xt_osf_finger *sf; - int err = -ENOENT; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!osf_attrs[OSF_ATTR_FINGER]) - return -EINVAL; - - f = nla_data(osf_attrs[OSF_ATTR_FINGER]); - - list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { - if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) - continue; - - /* - * We are protected by nfnl mutex. - */ - list_del_rcu(&sf->finger_entry); - kfree_rcu(sf, rcu_head); - - err = 0; - break; - } - - return err; -} - -static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = { - [OSF_MSG_ADD] = { - .call = xt_osf_add_callback, - .attr_count = OSF_ATTR_MAX, - .policy = xt_osf_policy, - }, - [OSF_MSG_REMOVE] = { - .call = xt_osf_remove_callback, - .attr_count = OSF_ATTR_MAX, - .policy = xt_osf_policy, - }, -}; - -static const struct nfnetlink_subsystem xt_osf_nfnetlink = { - .name = "osf", - .subsys_id = NFNL_SUBSYS_OSF, - .cb_count = OSF_MSG_MAX, - .cb = xt_osf_nfnetlink_callbacks, -}; - static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { @@ -159,7 +47,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) return false; return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), - xt_out(p), info, net, xt_osf_fingers); + xt_out(p), info, net, nf_osf_fingers); } static struct xt_match xt_osf_match = { @@ -177,52 +65,21 @@ static struct xt_match xt_osf_match = { static int __init xt_osf_init(void) { - int err = -EINVAL; - int i; - - for (i=0; ifinger_entry); - kfree_rcu(f, rcu_head); - } - } - rcu_read_unlock(); - - rcu_barrier(); } module_init(xt_osf_init); -- cgit v1.2.3 
From b96af92d6eaf9fadd77aa798c508a8a9d2e60020 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 25 Jul 2018 01:32:46 +0200 Subject: netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf Add basic module functions into nft_osf.[ch] in order to implement OSF module in nf_tables. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 7 ++ net/netfilter/Kconfig | 7 ++ net/netfilter/Makefile | 1 + net/netfilter/nft_osf.c | 106 +++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+) create mode 100644 net/netfilter/nft_osf.c (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f466860bcf75..382c32d630e9 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1463,6 +1463,13 @@ enum nft_flowtable_hook_attributes { }; #define NFTA_FLOWTABLE_HOOK_MAX (__NFTA_FLOWTABLE_HOOK_MAX - 1) +enum nft_osf_attributes { + NFTA_OSF_UNSPEC, + NFTA_OSF_DREG, + __NFTA_OSF_MAX, +}; +#define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1) + /** * enum nft_device_attributes - nf_tables device netlink attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 85333431e524..16fdfb75efb5 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -627,6 +627,13 @@ config NFT_SOCKET This option allows matching for the presence or absence of a corresponding socket and its attributes. +config NFT_OSF + tristate "Netfilter nf_tables passive OS fingerprint support" + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_OSF + help + This option allows matching packets from an specific OS. 
+ if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e684f9b8a9c3..5cbbf6978b55 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_SOCKET) += nft_socket.o +obj-$(CONFIG_NFT_OSF) += nft_osf.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c new file mode 100644 index 000000000000..bdacc4cffba4 --- /dev/null +++ b/net/netfilter/nft_osf.c @@ -0,0 +1,106 @@ +#include +#include + +#include +#include + +#define OSF_GENRE_SIZE 32 + +struct nft_osf { + enum nft_registers dreg:8; +}; + +static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { + [NFTA_OSF_DREG] = { .type = NLA_U32 }, +}; + +static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_osf *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + struct sk_buff *skb = pkt->skb; + const struct tcphdr *tcp; + struct tcphdr _tcph; + const char *os_name; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(struct tcphdr), &_tcph); + if (!tcp) { + regs->verdict.code = NFT_BREAK; + return; + } + if (!tcp->syn) { + regs->verdict.code = NFT_BREAK; + return; + } + + os_name = nf_osf_find(skb, nf_osf_fingers); + if (!os_name) + strncpy((char *)dest, "unknown", IFNAMSIZ); + else + strncpy((char *)dest, os_name, IFNAMSIZ); +} + +static int nft_osf_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_osf *priv = nft_expr_priv(expr); + int err; + + priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); + err = nft_validate_register_store(ctx, priv->dreg, NULL, + NFTA_DATA_VALUE, OSF_GENRE_SIZE); + if (err < 0) + return err; + + return 0; +} + +static int 
nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_osf *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_osf_type; +static const struct nft_expr_ops nft_osf_op = { + .eval = nft_osf_eval, + .size = NFT_EXPR_SIZE(sizeof(struct nft_osf)), + .init = nft_osf_init, + .dump = nft_osf_dump, + .type = &nft_osf_type, +}; + +static struct nft_expr_type nft_osf_type __read_mostly = { + .ops = &nft_osf_op, + .name = "osf", + .owner = THIS_MODULE, + .policy = nft_osf_policy, + .maxattr = NFTA_OSF_MAX, +}; + +static int __init nft_osf_module_init(void) +{ + return nft_register_expr(&nft_osf_type); +} + +static void __exit nft_osf_module_exit(void) +{ + return nft_unregister_expr(&nft_osf_type); +} + +module_init(nft_osf_module_init); +module_exit(nft_osf_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fernando Fernandez "); +MODULE_ALIAS_NFT_EXPR("osf"); -- cgit v1.2.3 From 4ed8eb6570a49931c705512060acd50058d61616 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Mon, 30 Jul 2018 11:07:32 +0200 Subject: netfilter: nf_tables: Add native tproxy support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A great portion of the code is taken from xt_TPROXY.c There are some changes compared to the iptables implementation: - tproxy statement is not terminal here - Either address or port has to be specified, but at least one of them is necessary. If one of them is not specified, the evaluation will be performed with the original attribute of the packet (ie. target port is not specified => the packet's dport will be used). To make this work in inet tables, the tproxy structure has a family member (typically called priv->family) which is not necessarily equal to ctx->family. 
priv->family can have three values legally: - NFPROTO_IPV4 if the table family is ip OR if table family is inet, but an ipv4 address is specified as a target address. The rule only evaluates ipv4 packets in this case. - NFPROTO_IPV6 if the table family is ip6 OR if table family is inet, but an ipv6 address is specified as a target address. The rule only evaluates ipv6 packets in this case. - NFPROTO_UNSPEC if the table family is inet AND if only the port is specified. The rule will evaluate both ipv4 and ipv6 packets. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 16 ++ net/netfilter/Kconfig | 10 + net/netfilter/Makefile | 1 + net/netfilter/nft_tproxy.c | 314 +++++++++++++++++++++++++++++++ 4 files changed, 341 insertions(+) create mode 100644 net/netfilter/nft_tproxy.c (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 382c32d630e9..f112ea52dc1a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1252,6 +1252,22 @@ enum nft_nat_attributes { }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) +/** + * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes + * + * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) + * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) + * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) + */ +enum nft_tproxy_attributes { + NFTA_TPROXY_UNSPEC, + NFTA_TPROXY_FAMILY, + NFTA_TPROXY_REG_ADDR, + NFTA_TPROXY_REG_PORT, + __NFTA_TPROXY_MAX +}; +#define NFTA_TPROXY_MAX (__NFTA_TPROXY_MAX - 1) + /** * enum nft_masq_attributes - nf_tables masquerade expression attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 16fdfb75efb5..0febf3e21f91 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -634,6 +634,16 @@ config NFT_OSF help This option allows matching 
packets from an specific OS. +config NFT_TPROXY + tristate "Netfilter nf_tables tproxy support" + depends on IPV6 || IPV6=n + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if NF_TABLES_IPV6 + select NF_TPROXY_IPV4 + select NF_TPROXY_IPV6 if NF_TABLES_IPV6 + help + This makes transparent proxy support available in nftables. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5cbbf6978b55..cf61615cc529 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o +obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c new file mode 100644 index 000000000000..c6845f7baa08 --- /dev/null +++ b/net/netfilter/nft_tproxy.c @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) +#include +#endif + +struct nft_tproxy { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_port:8; + u8 family; +}; + +static void nft_tproxy_eval_v4(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp; + __be32 taddr = 0; + __be16 tport = 0; + struct sock *sk; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (!hp) { + regs->verdict.code = NFT_BREAK; + return; + } + + /* check if there's an ongoing connection on the packet addresses, this + * happens if the redirect already happened and the current packet + * belongs to an already established connection + */ + sk = 
nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol, + iph->saddr, iph->daddr, + hp->source, hp->dest, + skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); + + if (priv->sreg_addr) + taddr = regs->data[priv->sreg_addr]; + taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); + + if (priv->sreg_port) + tport = regs->data[priv->sreg_port]; + if (!tport) + tport = hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) { + /* reopening a TIME_WAIT connection needs special handling */ + sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk); + } else if (!sk) { + /* no, there's no established connection, check if + * there's a listener on the redirected addr/port + */ + sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol, + iph->saddr, taddr, + hp->source, tport, + skb->dev, NF_TPROXY_LOOKUP_LISTENER); + } + + if (sk && nf_tproxy_sk_is_transparent(sk)) + nf_tproxy_assign_sock(skb, sk); + else + regs->verdict.code = NFT_BREAK; +} + +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) +static void nft_tproxy_eval_v6(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct in6_addr taddr = {0}; + int thoff = pkt->xt.thoff; + struct udphdr _hdr, *hp; + __be16 tport = 0; + struct sock *sk; + int l4proto; + + if (!pkt->tprot_set) { + regs->verdict.code = NFT_BREAK; + return; + } + l4proto = pkt->tprot; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + regs->verdict.code = NFT_BREAK; + return; + } + + /* check if there's an ongoing connection on the packet addresses, this + * happens if the redirect already happened and the current packet + * belongs to an already established connection + */ + sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto, + &iph->saddr, &iph->daddr, + hp->source, hp->dest, + 
nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED); + + if (priv->sreg_addr) + memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr)); + taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); + + if (priv->sreg_port) + tport = regs->data[priv->sreg_port]; + if (!tport) + tport = hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) { + /* reopening a TIME_WAIT connection needs special handling */ + sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff, + nft_net(pkt), + &taddr, + tport, + sk); + } else if (!sk) { + /* no there's no established connection, check if + * there's a listener on the redirected addr/port + */ + sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, + l4proto, &iph->saddr, &taddr, + hp->source, tport, + nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER); + } + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && nf_tproxy_sk_is_transparent(sk)) + nf_tproxy_assign_sock(skb, sk); + else + regs->verdict.code = NFT_BREAK; +} +#endif + +static void nft_tproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + switch (priv->family) { + case NFPROTO_IPV4: + case NFPROTO_UNSPEC: + nft_tproxy_eval_v4(expr, regs, pkt); + return; + } + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + switch (priv->family) { + case NFPROTO_IPV6: + case NFPROTO_UNSPEC: + nft_tproxy_eval_v6(expr, regs, pkt); + return; + } +#endif + } + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { + [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 }, + [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, + [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, +}; + +static int nft_tproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_tproxy *priv = 
nft_expr_priv(expr); + unsigned int alen = 0; + int err; + + if (!tb[NFTA_TPROXY_FAMILY] || + (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT])) + return -EINVAL; + + priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY])); + + switch (ctx->family) { + case NFPROTO_IPV4: + if (priv->family != NFPROTO_IPV4) + return -EINVAL; + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + if (priv->family != NFPROTO_IPV6) + return -EINVAL; + break; +#endif + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + + /* Address is specified but the rule family is not set accordingly */ + if (priv->family == NFPROTO_UNSPEC && tb[NFTA_TPROXY_REG_ADDR]) + return -EINVAL; + + switch (priv->family) { + case NFPROTO_IPV4: + alen = FIELD_SIZEOF(union nf_inet_addr, in); + err = nf_defrag_ipv4_enable(ctx->net); + if (err) + return err; + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + alen = FIELD_SIZEOF(union nf_inet_addr, in6); + err = nf_defrag_ipv6_enable(ctx->net); + if (err) + return err; + break; +#endif + case NFPROTO_UNSPEC: + /* No address is specified here */ + err = nf_defrag_ipv4_enable(ctx->net); + if (err) + return err; + err = nf_defrag_ipv6_enable(ctx->net); + if (err) + return err; + break; + default: + return -EOPNOTSUPP; + } + + if (tb[NFTA_TPROXY_REG_ADDR]) { + priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, alen); + if (err < 0) + return err; + } + + if (tb[NFTA_TPROXY_REG_PORT]) { + priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]); + err = nft_validate_register_load(priv->sreg_port, sizeof(u16)); + if (err < 0) + return err; + } + + return 0; +} + +static int nft_tproxy_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family))) + return -1; + + if (priv->sreg_addr && + nft_dump_register(skb, 
NFTA_TPROXY_REG_ADDR, priv->sreg_addr)) + return -1; + + if (priv->sreg_port && + nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port)) + return -1; + + return 0; +} + +static struct nft_expr_type nft_tproxy_type; +static const struct nft_expr_ops nft_tproxy_ops = { + .type = &nft_tproxy_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)), + .eval = nft_tproxy_eval, + .init = nft_tproxy_init, + .dump = nft_tproxy_dump, +}; + +static struct nft_expr_type nft_tproxy_type __read_mostly = { + .name = "tproxy", + .ops = &nft_tproxy_ops, + .policy = nft_tproxy_policy, + .maxattr = NFTA_TPROXY_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_tproxy_module_init(void) +{ + return nft_register_expr(&nft_tproxy_type); +} + +static void __exit nft_tproxy_module_exit(void) +{ + nft_unregister_expr(&nft_tproxy_type); +} + +module_init(nft_tproxy_module_init); +module_exit(nft_tproxy_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Máté Eckl"); +MODULE_DESCRIPTION("nf_tables tproxy support module"); +MODULE_ALIAS_NFT_EXPR("tproxy"); -- cgit v1.2.3 From b3cadaa485f0c20add1644a5c877b0765b285c0c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 30 Jul 2018 13:57:41 +0200 Subject: Bluetooth: hidp: Fix handling of strncpy for hid->name information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes two issues with setting hid->name information. 
CC net/bluetooth/hidp/core.o In function ‘hidp_setup_hid’, inlined from ‘hidp_session_dev_init’ at net/bluetooth/hidp/core.c:815:9, inlined from ‘hidp_session_new’ at net/bluetooth/hidp/core.c:953:8, inlined from ‘hidp_connection_add’ at net/bluetooth/hidp/core.c:1366:8: net/bluetooth/hidp/core.c:778:2: warning: ‘strncpy’ output may be truncated copying 127 bytes from a string of length 127 [-Wstringop-truncation] strncpy(hid->name, req->name, sizeof(req->name) - 1); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CC net/bluetooth/hidp/core.o net/bluetooth/hidp/core.c: In function ‘hidp_setup_hid’: net/bluetooth/hidp/core.c:778:38: warning: argument to ‘sizeof’ in ‘strncpy’ call is the same expression as the source; did you mean to use the size of the destination? [-Wsizeof-pointer-memaccess] strncpy(hid->name, req->name, sizeof(req->name)); ^ Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hidp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 1036e4fa1ea2..6f3eaf2fb94f 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->version = req->version; hid->country = req->country; - strncpy(hid->name, req->name, sizeof(req->name) - 1); + strncpy(hid->name, req->name, sizeof(hid->name)); snprintf(hid->phys, sizeof(hid->phys), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->src); -- cgit v1.2.3 From dd979b4df817e9976f18fb6f9d134d6bc4a3c317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:10 +0200 Subject: net: simplify sock_poll_wait The wait_address argument is always directly derived from the filp argument, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- crypto/af_alg.c | 2 +- include/net/sock.h | 11 ++++++----- net/atm/common.c | 2 +- net/caif/caif_socket.c | 2 +- net/core/datagram.c | 2 +- net/dccp/proto.c | 2 +- net/ipv4/tcp.c | 2 +- net/iucv/af_iucv.c | 2 +- net/nfc/llcp_sock.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/smc/af_smc.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 4 ++-- 13 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index c166f424871c..b053179e0bc5 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1071,7 +1071,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock, struct af_alg_ctx *ctx = ask->private; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; if (!ctx->more || ctx->used) diff --git a/include/net/sock.h b/include/net/sock.h index 83b747538bd0..0518f61926ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2057,16 +2057,17 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file - * @wait_address: socket wait queue * @p: poll_table * * See the comments in the wq_has_sleeper function. */ -static inline void sock_poll_wait(struct file *filp, - wait_queue_head_t *wait_address, poll_table *p) +static inline void sock_poll_wait(struct file *filp, poll_table *p) { - if (!poll_does_not_wait(p) && wait_address) { - poll_wait(filp, wait_address, p); + struct socket *sock = filp->private_data; + wait_queue_head_t *wq = sk_sleep(sock->sk); + + if (!poll_does_not_wait(p) && wq) { + poll_wait(filp, wq, p); /* We need to be sure we are in sync with the * socket flags modification. 
* diff --git a/net/atm/common.c b/net/atm/common.c index a7a68e509628..9f8cb0d2e71e 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) struct atm_vcc *vcc; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; vcc = ATM_SD(sock); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a6fb1b3bcad9..d18965f3291f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file, __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 9938952c5c78..9aac0d63d53e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? 
*/ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d56e36a6db7..875858c8b059 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, __poll_t mask; struct sock *sk = sock->sk; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 514aaac1626f..f3bfb9f29520 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 8d1c43f8fed4..92ee91e34395 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1494,7 +1494,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask = 0; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ea0c0c6f1874..dd4adf8b1167 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, pr_debug("%p\n", sk); - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2b463047dd7b..ac44d8afffb1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -741,7 +741,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, struct rxrpc_sock *rx = rxrpc_sk(sk); __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* the 
socket is readable if there are any messages waiting on the Rx diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fce7e4751151..0fc94f296e54 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1535,7 +1535,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3d21414ba357..3763bedecf5f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -716,7 +716,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e5473c03d667..1772a0e32665 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2635,7 +2635,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ @@ -2672,7 +2672,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, unsigned int writable; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ -- cgit v1.2.3 From f641f13b992979b97e595b761a9ba1a64fed7c4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:12 +0200 Subject: net: remove sock_poll_busy_loop There is no point in hiding this logic in a helper. Also remove the useless events != 0 check and only busy loop once we know we actually have a poll method. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/busy_poll.h | 9 --------- net/socket.c | 5 ++++- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 9e36fda652b7..85777e68f738 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,15 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } -static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events) -{ - if (sk_can_busy_loop(sock->sk) && - events && (events & POLL_BUSY_LOOP)) { - /* once, only if requested by syscall */ - sk_busy_loop(sock->sk, 1); - } -} - /* if this socket can poll_ll, tell the system call */ static inline __poll_t sock_poll_busy_flag(struct socket *sock) { diff --git a/net/socket.c b/net/socket.c index 85633622c94d..674434127b3a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1132,9 +1132,12 @@ static __poll_t sock_poll(struct file *file, poll_table *wait) struct socket *sock = file->private_data; __poll_t events = poll_requested_events(wait); - sock_poll_busy_loop(sock, events); if (!sock->ops->poll) return 0; + + /* poll once if requested by the syscall */ + if (sk_can_busy_loop(sock->sk) && (events & POLL_BUSY_LOOP)) + sk_busy_loop(sock->sk, 1); return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); } -- cgit v1.2.3 From a331de3bf0e66ab2437fc8c5b99bd3c0d9da3088 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:13 +0200 Subject: net: remove sock_poll_busy_flag Fold it into the only caller to make the code simpler and easier to read. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/busy_poll.h | 6 ------ net/socket.c | 16 +++++++++++----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 85777e68f738..ba61cdd09eaa 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,12 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } -/* if this socket can poll_ll, tell the system call */ -static inline __poll_t sock_poll_busy_flag(struct socket *sock) -{ - return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0; -} - /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) diff --git a/net/socket.c b/net/socket.c index 674434127b3a..5b7df6695f4f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1130,15 +1130,21 @@ EXPORT_SYMBOL(sock_create_lite); static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; - __poll_t events = poll_requested_events(wait); + __poll_t events = poll_requested_events(wait), flag = 0; if (!sock->ops->poll) return 0; - /* poll once if requested by the syscall */ - if (sk_can_busy_loop(sock->sk) && (events & POLL_BUSY_LOOP)) - sk_busy_loop(sock->sk, 1); - return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); + if (sk_can_busy_loop(sock->sk)) { + /* poll once if requested by the syscall */ + if (events & POLL_BUSY_LOOP) + sk_busy_loop(sock->sk, 1); + + /* if this socket can poll_ll, tell the system call */ + flag = POLL_BUSY_LOOP; + } + + return sock->ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3 From 802bfb19152c0fb4137c6ba72bcf042ee023e743 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:42 +0200 Subject: net/sched: user-space can't set unknown tcfa_action values Currently, when initializing an action, the user-space can specify and 
use arbitrary values for the tcfa_action field. If the value is unknown by the kernel, it is implicitly treated as TC_ACT_UNSPEC. This change explicitly checks for unknown values at action creation time, and explicitly converts them to TC_ACT_UNSPEC. No functional changes are introduced, but this will allow introducing tcfa_action values not exposed to user-space in a later patch. Note: we can't use the above to hide TC_ACT_REDIRECT from user-space, as the latter is already part of uAPI. v3 -> v4: - use a helper to check for action validity (JiriP) - emit an extack for invalid actions (JiriP) v4 -> v5: - keep messages on a single line, drop net_warn (Marcelo) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 6 ++++-- net/sched/act_api.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index b4512254036b..48e5b5d49a34 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -45,6 +45,7 @@ enum { * the skb and act like everything * is alright. */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP /* There is a special kind of actions called "extended actions", * which need a value parameter. 
These have a local opcode located in @@ -55,11 +56,12 @@ enum { #define __TC_ACT_EXT_SHIFT 28 #define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) #define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) -#define TC_ACT_EXT_CMP(combined, opcode) \ - (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) #define TC_ACT_JUMP __TC_ACT_EXT(1) #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN /* Action type identifiers*/ enum { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b43df1e25c6d..229d63c99be2 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -786,6 +786,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static bool tcf_action_valid(int action) +{ + int opcode = TC_ACT_EXT_OPCODE(action); + + if (!opcode) + return action <= TC_ACT_VALUE_MAX; + return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC; +} + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -895,6 +904,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, } } + if (!tcf_action_valid(a->tcfa_action)) { + NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead"); + a->tcfa_action = TC_ACT_UNSPEC; + } + return a; err_mod: -- cgit v1.2.3 From 7fd4b288ea6a3e45ad8afbcd5ec39554d57f1ae0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:43 +0200 Subject: tc/act: remove unneeded RCU lock in action callback Each lockless action currently does its own RCU locking in ->act(). This allows using plain RCU accessor, even if the context is really RCU BH. 
This change drops the per action RCU lock, replace the accessors with the _bh variant, cleans up a bit the surrounding code and documents the RCU status in the relevant header. No functional nor performance change is intended. The goal of this patch is clarifying that the RCU critical section used by the tc actions extends up to the classifier's caller. v1 -> v2: - preserve rcu lock in act_bpf: it's needed by eBPF helpers, as pointed out by Daniel v3 -> v4: - fixed some typos in the commit message (JiriP) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/sch_generic.h | 2 ++ net/sched/act_csum.c | 12 +++--------- net/sched/act_ife.c | 5 +---- net/sched/act_mirred.c | 4 +--- net/sched/act_sample.c | 4 +--- net/sched/act_skbedit.c | 10 +++------- net/sched/act_skbmod.c | 21 +++++++++------------ net/sched/act_tunnel_key.c | 6 +----- net/sched/act_vlan.c | 19 +++++++------------ 10 files changed, 29 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 683ce41053d9..8c9bc02d05e1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -85,7 +85,7 @@ struct tc_action_ops { size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, - struct tcf_result *); + struct tcf_result *); /* called under RCU BH lock*/ int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c5432362dc26..bcae181c1857 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -285,6 +285,8 @@ struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; + + /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); diff --git a/net/sched/act_csum.c 
b/net/sched/act_csum.c index 4e8c383f379e..648a3a35b720 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -561,15 +561,14 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, u32 update_flags; int action; - rcu_read_lock(); - params = rcu_dereference(p->params); + params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) - goto drop_stats; + goto drop; update_flags = params->update_flags; switch (tc_skb_protocol(skb)) { @@ -583,16 +582,11 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, break; } -unlock: - rcu_read_unlock(); return action; drop: - action = TC_ACT_SHOT; - -drop_stats: qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); - goto unlock; + return TC_ACT_SHOT; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3d6e265758c0..df4060e32d43 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -820,14 +820,11 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_ife_params *p; int ret; - rcu_read_lock(); - p = rcu_dereference(ife->params); + p = rcu_dereference_bh(ife->params); if (p->flags & IFE_ENCODE) { ret = tcf_ife_encode(skb, a, res, p); - rcu_read_unlock(); return ret; } - rcu_read_unlock(); return tcf_ife_decode(skb, a, res); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6afd89a36c69..eeb335f03102 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -181,11 +181,10 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - rcu_read_lock(); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = READ_ONCE(m->tcf_action); - dev = 
rcu_dereference(m->tcfm_dev); + dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); goto out; @@ -236,7 +235,6 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } - rcu_read_unlock(); return retval; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 3079e7be5bde..2608ccc83e5e 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -140,8 +140,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); - rcu_read_lock(); - psample_group = rcu_dereference(s->psample_group); + psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { @@ -165,7 +164,6 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, skb_pull(skb, skb->mac_len); } - rcu_read_unlock(); return retval; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index da56e6938c9e..a6db47ebec11 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,8 +43,7 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - rcu_read_lock(); - params = rcu_dereference(d->params); + params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) @@ -77,14 +76,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - -unlock: - rcu_read_unlock(); return action; + err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); - action = TC_ACT_SHOT; - goto unlock; + return TC_ACT_SHOT; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { diff --git a/net/sched/act_skbmod.c 
b/net/sched/act_skbmod.c index cdc6bacfb190..c437c6d51a71 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -41,20 +41,14 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, * then MAX_EDIT_LEN needs to change appropriately */ err = skb_ensure_writable(skb, MAX_EDIT_LEN); - if (unlikely(err)) { /* best policy is to drop on the floor */ - qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); - return TC_ACT_SHOT; - } + if (unlikely(err)) /* best policy is to drop on the floor */ + goto drop; - rcu_read_lock(); action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) { - qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); - rcu_read_unlock(); - return action; - } + if (unlikely(action == TC_ACT_SHOT)) + goto drop; - p = rcu_dereference(d->skbmod_p); + p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); @@ -62,7 +56,6 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); if (flags & SKBMOD_F_ETYPE) eth_hdr(skb)->h_proto = p->eth_type; - rcu_read_unlock(); if (flags & SKBMOD_F_SWAPMAC) { u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ @@ -73,6 +66,10 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, } return action; + +drop: + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; } static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index f811850fd1d0..d42d9e112789 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -31,9 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_tunnel_key_params *params; int action; - rcu_read_lock(); - - params = rcu_dereference(t->params); + params = rcu_dereference_bh(t->params); 
tcf_lastuse_update(&t->tcf_tm); bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); @@ -53,8 +51,6 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, break; } - rcu_read_unlock(); - return action; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ad37f308175a..15a0ee214c9c 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -40,11 +40,9 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - rcu_read_lock(); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference(v->vlan_p); + p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: @@ -61,7 +59,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, case TCA_VLAN_ACT_MODIFY: /* No-op if no vlan tag (either hw-accel or in-payload) */ if (!skb_vlan_tagged(skb)) - goto unlock; + goto out; /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); @@ -86,18 +84,15 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, BUG(); } - goto unlock; - -drop: - action = TC_ACT_SHOT; - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); - -unlock: - rcu_read_unlock(); +out: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); return action; + +drop: + qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + return TC_ACT_SHOT; } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { -- cgit v1.2.3 From cd11b164073b719203318227918f9510809d5e10 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:44 +0200 Subject: net/tc: introduce TC_ACT_REINSERT. This is similar to TC_ACT_REDIRECT, but with a slightly different semantic: - on ingress the mirred skbs are passed to the target device network stack without any additional check nor scrubbing. - the rcu-protected stats provided via the tcf_result struct are updated on error conditions. 
This new tcfa_action value is not exposed to the user-space and can be used only internally by clsact. v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce a new action type instead v2 -> v3: - rename the new action value TC_ACT_REINJECT, update the helper accordingly - take care of uncloned reinjected packets in XDP generic hook v3 -> v4: - renamed again the new action value (JiriP) v4 -> v5: - fix build error with !NET_CLS_ACT (kbuild bot) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 28 ++++++++++++++++++++++++++++ net/core/dev.c | 6 +++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6d02f31abba8..22bfc3a13c25 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -7,6 +7,9 @@ #include #include +/* TC action not accessible from user space */ +#define TC_ACT_REINSERT (TC_ACT_VALUE_MAX + 1) + /* Basic packet classifier frontend definitions. 
*/ struct tcf_walker { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bcae181c1857..a6d00093f35e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -235,6 +235,12 @@ struct tcf_result { u32 classid; }; const struct tcf_proto *goto_tp; + + /* used by the TC_ACT_REINSERT action */ + struct { + bool ingress; + struct gnet_stats_queue *qstats; + }; }; }; @@ -569,6 +575,15 @@ static inline void skb_reset_tc(struct sk_buff *skb) #endif } +static inline bool skb_is_tc_redirected(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_CLS_ACT + return skb->tc_redirected; +#else + return false; +#endif +} + static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT @@ -1108,4 +1123,17 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); +static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) +{ + struct gnet_stats_queue *stats = res->qstats; + int ret; + + if (res->ingress) + ret = netif_receive_skb(skb); + else + ret = dev_queue_xmit(skb); + if (ret && stats) + qstats_overlimit_inc(res->qstats); +} + #endif diff --git a/net/core/dev.c b/net/core/dev.c index 89031b5fef9f..38b0c414d780 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4252,7 +4252,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. 
*/ - if (skb_cloned(skb)) + if (skb_cloned(skb) || skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom @@ -4602,6 +4602,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, __skb_push(skb, skb->mac_len); skb_do_redirect(skb); return NULL; + case TC_ACT_REINSERT: + /* this does not scrub the packet, and updates stats on error */ + skb_tc_reinsert(skb, &cl_res); + return NULL; default: break; } -- cgit v1.2.3 From e5cf1baf92cb785b90390db1c624948e70c8b8bd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:45 +0200 Subject: act_mirred: use TC_ACT_REINSERT when possible When mirred is invoked from the ingress path, and it wants to redirect the processed packet, it can now use the TC_ACT_REINSERT action, filling the tcf_result accordingly, and avoiding a per-packet skb_clone(). Overall this gives a ~10% improvement in forwarding performance for the TC S/W data path and TC S/W performance is now comparable to the kernel openvswitch datapath. v1 -> v2: use ACT_MIRRED instead of ACT_REDIRECT v2 -> v3: updated after action rename, fixed typo in the commit message v3 -> v4: updated again after action rename, added more comments to the code (JiriP), skip the optimization if the control action needs to touch the tcf_result (Paolo) v4 -> v5: fix sparse warning (kbuild bot) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- net/sched/act_mirred.c | 53 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index eeb335f03102..b26d060da08e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,18 @@ static bool tcf_mirred_act_wants_ingress(int action) } } +static bool tcf_mirred_can_reinsert(int action) +{ + switch (action) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return true; + } + return false; +} + static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); @@ -171,10 +184,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); + struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; - struct sk_buff *skb2; int retval, err = 0; + bool use_reinsert; + bool want_ingress; + bool is_redirect; int m_eaction; int mac_len; @@ -196,16 +212,25 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, goto out; } - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - goto out; + /* we could easily avoid the clone only if called by ingress and clsact; + * since we can't easily detect the clsact caller, skip clone only for + * ingress - that covers the TC S/W datapath. + */ + is_redirect = tcf_mirred_is_act_redirect(m_eaction); + use_reinsert = skb_at_tc_ingress(skb) && is_redirect && + tcf_mirred_can_reinsert(retval); + if (!use_reinsert) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out; + } /* If action's target direction differs than filter's direction, * and devices expect a mac header on xmit, then mac push/pull is * needed. 
*/ - if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) && - m_mac_header_xmit) { + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) { if (!skb_at_tc_ingress(skb)) { /* caught at egress, act ingress: pull mac */ mac_len = skb_network_header(skb) - skb_mac_header(skb); @@ -216,15 +241,23 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } } + skb2->skb_iif = skb->dev->ifindex; + skb2->dev = dev; + /* mirror is always swallowed */ - if (tcf_mirred_is_act_redirect(m_eaction)) { + if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; + + /* let's the caller reinsert the packet, if possible */ + if (use_reinsert) { + res->ingress = want_ingress; + res->qstats = this_cpu_ptr(m->common.cpu_qstats); + return TC_ACT_REINSERT; + } } - skb2->skb_iif = skb->dev->ifindex; - skb2->dev = dev; - if (!tcf_mirred_act_wants_ingress(m_eaction)) + if (!want_ingress) err = dev_queue_xmit(skb2); else err = netif_receive_skb(skb2); -- cgit v1.2.3 From ad13acce8dcd35cfc15281c1348beb70ca64091b Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 30 Jul 2018 16:08:33 +0530 Subject: net/tls: Use socket data_ready callback on record availability On receipt of a complete tls record, use socket's saved data_ready callback instead of state_change callback. In function tls_queue(), the TLS record is queued in encrypted state. But the decryption happens inline when tls_sw_recvmsg() or tls_sw_splice_read() is invoked. So it should be ok to notify the waiting context about the availability of data as soon as we could collect a full TLS record. For new data availability notification, sk_data_ready callback is more appropriate. It points to sock_def_readable() which wakes up specifically for the EPOLLIN event.
This is in contrast to the socket callback sk_state_change which points to sock_def_wakeup() which issues a wakeup unconditionally (without event mask). Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6deceb7c56ba..33838f11fafa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1028,7 +1028,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) ctx->recv_pkt = skb; strp_pause(strp); - strp->sk->sk_state_change(strp->sk); + ctx->saved_data_ready(strp->sk); } static void tls_data_ready(struct sock *sk) -- cgit v1.2.3 From 778c4d5c5b96a61c7981ad6d841071326a713845 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 30 Jul 2018 21:07:24 +0800 Subject: fib_rules: NULL check before kfree is not needed kfree(NULL) is safe, so this removes the NULL check before freeing the memory. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f64aa13811ea..0ff3953f64aa 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -924,8 +924,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; errout: - if (nlrule) - kfree(nlrule); + kfree(nlrule); rules_ops_put(ops); return err; } -- cgit v1.2.3 From 486cdf21583e5b1fad488a3e4f0a5242a31c0ffa Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Thu, 26 Jul 2018 02:10:40 +0000 Subject: bpf: add End.DT6 action to bpf_lwt_seg6_action helper The seg6local LWT provides the End.DT6 action, which allows decapsulating an outer IPv6 header containing a Segment Routing Header (SRH); the full specification is available here: https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-05 This patch adds this action now to the seg6local BPF interface.
Since it is not mandatory that the inner IPv6 header also contains a SRH, seg6_bpf_srh_state has been extended with a pointer to a possible SRH of the outermost IPv6 header. This helps assessing if the validation must be triggered or not, and avoids some calls to ipv6_find_hdr. v3: s/1/true, s/0/false for boolean values v2: - changed true/false -> 1/0 - preempt_enable no longer called in first conditional block Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- include/net/seg6_local.h | 4 ++- net/core/filter.c | 88 ++++++++++++++++++++++++++++++++---------------- net/ipv6/seg6_local.c | 50 +++++++++++++++++---------- 3 files changed, 94 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 661fd5b4d3e0..08359e2d8b35 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -21,10 +21,12 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); +extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { - bool valid; + struct ipv6_sr_hdr *srh; u16 hdrlen; + bool valid; }; DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); diff --git a/net/core/filter.c b/net/core/filter.c index 104d560946da..7df1a0f1d1e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4542,26 +4542,28 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; - struct ipv6_sr_hdr *srh; int srhoff = 0; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (srh == NULL) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= 
srh_end) - srh_state->valid = 0; + srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; @@ -4577,52 +4579,78 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .arg4_type = ARG_CONST_SIZE }; -BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, - u32, action, void *, param, u32, param_len) +static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct ipv6_sr_hdr *srh; int srhoff = 0; - int err; - - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - - if (!srh_state->valid) { - if (unlikely((srh_state->hdrlen & 7) != 0)) - return -EBADMSG; - - srh->hdrlen = (u8)(srh_state->hdrlen >> 3); - if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) - return -EBADMSG; - srh_state->valid = 1; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { + srh_state->srh = NULL; + } else { + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen = srh_state->srh->hdrlen << 3; + srh_state->valid = true; } +} + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int hdroff = 0; + int err; switch (action) { case SEG6_LOCAL_ACTION_END_X: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; 
+ if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_DT6: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; + + if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) + return -EBADMSG; + if (!pskb_pull(skb, hdroff)) + return -EBADMSG; + + skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + bpf_compute_data_pointers(skb); + bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; default: return -EINVAL; @@ -4644,15 +4672,14 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; - struct ipv6_sr_hdr *srh; struct ipv6hdr *hdr; int srhoff = 0; int ret; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (unlikely(srh == NULL)) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); @@ -4682,8 +4709,11 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, hdr = (struct ipv6hdr *)skb->data; 
hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; - srh_state->valid = 0; + srh_state->valid = false; return 0; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index e1025b493a18..60325dbfe88b 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -459,36 +459,57 @@ drop: DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +bool seg6_bpf_has_valid_srh(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + + if (unlikely(srh == NULL)) + return false; + + if (unlikely(!srh_state->valid)) { + if ((srh_state->hdrlen & 7) != 0) + return false; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) + return false; + + srh_state->valid = true; + } + + return true; +} + static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct seg6_bpf_srh_state local_srh_state; struct ipv6_sr_hdr *srh; - int srhoff = 0; int ret; srh = get_and_validate_srh(skb); - if (!srh) - goto drop; + if (!srh) { + kfree_skb(skb); + return -EINVAL; + } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); /* preempt_disable is needed to protect the per-CPU buffer srh_state, * which is also accessed by the bpf_lwt_seg6_* helpers */ preempt_disable(); + srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; - srh_state->valid = 1; + srh_state->valid = true; rcu_read_lock(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); rcu_read_unlock(); - local_srh_state = *srh_state; - preempt_enable(); - switch (ret) { case BPF_OK: case BPF_REDIRECT: @@ -500,24 +521,17 @@ static int input_action_end_bpf(struct sk_buff 
*skb, goto drop; } - if (unlikely((local_srh_state.hdrlen & 7) != 0)) - goto drop; - - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - goto drop; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); - - if (!local_srh_state.valid && - unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: + preempt_enable(); kfree_skb(skb); return -EINVAL; } -- cgit v1.2.3 From 1f821611f49a89d2258d256efedd618eda6344be Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 30 Jul 2018 22:22:59 +0900 Subject: lwt_bpf: remove unnecessary rcu_read_lock in run_lwt_bpf run_lwt_bpf is called by bpf_{input/output/xmit}. These functions are already protected by rcu_read_lock. because lwtunnel_{input/output/xmit} holds rcu_read_lock and then calls bpf_{input/output/xmit}. So that rcu_read_lock in the run_lwt_bpf is unnecessary. Signed-off-by: Taehee Yoo Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- net/core/lwt_bpf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index e7e626fb87bb..a49c7baf62f8 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -50,10 +50,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, * mixing with BH RCU lock doesn't work. */ preempt_disable(); - rcu_read_lock(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); - rcu_read_unlock(); switch (ret) { case BPF_OK: -- cgit v1.2.3 From d692f1138a4bac2efd2c8656ca15556b63479e82 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 30 Jul 2018 17:42:28 -0700 Subject: bpf: Support bpf_get_socket_cookie in more prog types bpf_get_socket_cookie() helper can be used to identify skb that correspond to the same socket. 
Though socket cookie can be useful in many other use-cases where socket is available in program context. Specifically BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS programs can benefit from it so that one of them can augment a value in a map prepared earlier by other program for the same socket. The patch adds support to call bpf_get_socket_cookie() from BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS. It doesn't introduce new helpers. Instead it reuses same helper name bpf_get_socket_cookie() but adds support to this helper to accept `struct bpf_sock_addr` and `struct bpf_sock_ops`. Documentation in bpf.h is changed in a way that should not break automatic generation of markdown. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 14 ++++++++++++++ net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 870113916cac..0ebaaf7f3568 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1371,6 +1371,20 @@ union bpf_attr { * A 8-byte long non-decreasing number on success, or 0 if the * socket field is missing inside *skb*. * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * Return + * A 8-byte long non-decreasing number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * Return + * A 8-byte long non-decreasing number. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. 
If the socket diff --git a/net/core/filter.c b/net/core/filter.c index 7df1a0f1d1e1..9bb9a4488e25 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3812,6 +3812,30 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { + .func = bpf_get_socket_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { + .func = bpf_get_socket_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -4818,6 +4842,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) default: return NULL; } + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_addr_proto; default: return bpf_base_func_proto(func_id); } @@ -4960,6 +4986,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_ops_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From f734607e819b951bae3b436b026ec672082e9241 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:52 -0700 Subject: xsk: refactor xdp_umem_assign_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return early and only take the ref on dev once there is no possibility of failing. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Björn Töpel Signed-off-by: David S. Miller --- net/xdp/xdp_umem.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f47abb46c587..c199d66b5f3f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -56,41 +56,34 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_copy) return 0; - dev_hold(dev); + if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ - if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { - bpf.command = XDP_QUERY_XSK_UMEM; + bpf.command = XDP_QUERY_XSK_UMEM; - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); - if (err) { - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; - } + if (err) + return force_zc ? -ENOTSUPP : 0; - bpf.command = XDP_SETUP_XSK_UMEM; - bpf.xsk.umem = umem; - bpf.xsk.queue_id = queue_id; + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); - if (err) { - dev_put(dev); - return force_zc ? err : 0; /* fail or fallback */ - } - - umem->dev = dev; - umem->queue_id = queue_id; - umem->zc = true; - return 0; - } + if (err) + return force_zc ? err : 0; /* fail or fallback */ - dev_put(dev); - return force_zc ? 
-ENOTSUPP : 0; /* fail or fallback */ + dev_hold(dev); + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- cgit v1.2.3 From 84c6b86875e01a08a0daa6fdd4a01b36bf0bf0b2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:53 -0700 Subject: xsk: don't allow umem replace at stack level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently drivers have to check if they already have a umem installed for a given queue and return an error if so. Make better use of XDP_QUERY_XSK_UMEM and move this functionality to the core. We need to keep rtnl across the calls now. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Björn Töpel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- net/xdp/xdp_umem.c | 37 ++++++++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3bf7e93c9e96..282e2e95ad5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -872,10 +872,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; - u16 queue_id; + struct xdp_umem *umem; /* out for query*/ + u16 queue_id; /* in for query */ } xsk; }; }; @@ -3568,6 +3568,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); +int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c199d66b5f3f..911ca6d3cb5a 100644 --- a/net/xdp/xdp_umem.c 
+++ b/net/xdp/xdp_umem.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "xdp_umem.h" #include "xsk_queue.h" @@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) } } +int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + struct netdev_bpf bpf; + + ASSERT_RTNL(); + + memset(&bpf, 0, sizeof(bpf)); + bpf.command = XDP_QUERY_XSK_UMEM; + bpf.xsk.queue_id = queue_id; + + if (!dev->netdev_ops->ndo_bpf) + return 0; + return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -62,28 +79,30 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) - return force_zc ? -ENOTSUPP : 0; + err = xdp_umem_query(dev, queue_id); + if (err) { + err = err < 0 ? -ENOTSUPP : -EBUSY; + goto err_rtnl_unlock; + } bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = umem; bpf.xsk.queue_id = queue_id; - rtnl_lock(); err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - if (err) - return force_zc ? err : 0; /* fail or fallback */ + goto err_rtnl_unlock; + rtnl_unlock(); dev_hold(dev); umem->dev = dev; umem->queue_id = queue_id; umem->zc = true; return 0; + +err_rtnl_unlock: + rtnl_unlock(); + return force_zc ? err : 0; /* fail or fallback */ } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- cgit v1.2.3 From e6476c21447c4b17c47e476aade6facf050f31e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:45:07 +0200 Subject: net: remove bogus RCU annotations on socket.wq We never use RCU protection for it, just a lot of cargo-cult rcu_deference_protects calls. Note that we do keep the kfree_rcu call for it, as the references through struct sock are RCU protected and thus might require a grace period before freeing. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- include/net/sock.h | 2 +- net/socket.c | 10 ++++------ 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 6554d3ba4396..e0930678c8bf 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -114,7 +114,7 @@ struct socket { unsigned long flags; - struct socket_wq __rcu *wq; + struct socket_wq *wq; struct file *file; struct sock *sk; diff --git a/include/net/sock.h b/include/net/sock.h index 2afea5d1bdfe..433f45fc2d68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1788,7 +1788,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); - sk->sk_wq = parent->wq; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; diff --git a/net/socket.c b/net/socket.c index 5b7df6695f4f..475247e347ae 100644 --- a/net/socket.c +++ b/net/socket.c @@ -251,7 +251,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; wq->flags = 0; - RCU_INIT_POINTER(ei->socket.wq, wq); + ei->socket.wq = wq; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; @@ -265,11 +265,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb) static void sock_destroy_inode(struct inode *inode) { struct socket_alloc *ei; - struct socket_wq *wq; ei = container_of(inode, struct socket_alloc, vfs_inode); - wq = rcu_dereference_protected(ei->socket.wq, 1); - kfree_rcu(wq, rcu); + kfree_rcu(ei->socket.wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } @@ -603,7 +601,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) module_put(owner); } - if (rcu_dereference_protected(sock->wq, 1)->fasync_list) + if (sock->wq->fasync_list) 
pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { @@ -1181,7 +1179,7 @@ static int sock_fasync(int fd, struct file *filp, int on) return -EINVAL; lock_sock(sk); - wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk)); + wq = sock->wq; fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) -- cgit v1.2.3 From 7992c18810e568b95c869b227137a2215702a805 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 31 Jul 2018 15:02:13 -0700 Subject: Bluetooth: hidp: buffer overflow in hidp_process_report CVE-2018-9363 The buffer length is unsigned at all layers, but gets cast to int and checked in hidp_process_report and can lead to a buffer overflow. Switch len parameter to unsigned int to resolve issue. This affects 3.18 and newer kernels. Signed-off-by: Mark Salyzyn Fixes: a4b1b5877b514b276f0f31efe02388a9c2836728 ("HID: Bluetooth: hidp: make sure input buffers are big enough") Cc: Marcel Holtmann Cc: Johan Hedberg Cc: "David S. Miller" Cc: Kees Cook Cc: Benjamin Tissoires Cc: linux-bluetooth@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: security@kernel.org Cc: kernel-team@android.com Acked-by: Kees Cook Signed-off-by: Marcel Holtmann --- net/bluetooth/hidp/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 6f3eaf2fb94f..253975cce943 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -431,8 +431,8 @@ static void hidp_del_timer(struct hidp_session *session) del_timer(&session->timer); } -static void hidp_process_report(struct hidp_session *session, - int type, const u8 *data, int len, int intr) +static void hidp_process_report(struct hidp_session *session, int type, + const u8 *data, unsigned int len, int intr) { if (len > HID_MAX_BUFFER_SIZE) len = HID_MAX_BUFFER_SIZE; -- cgit v1.2.3 From f597a5792ada511e3c69ecf7201fc178c574d822 Mon Sep 17 00:00:00 2001 From: YueHaibing 
Date: Wed, 1 Aug 2018 17:52:34 +0800 Subject: rxrpc: remove redundant variables 'sp' and 'did_discard' Variables 'sp' and 'did_discard' are being assigned, but are never used, hence they are redundant and can be removed. fix following warning: net/rxrpc/call_event.c:165:25: warning: variable 'sp' set but not used [-Wunused-but-set-variable] net/rxrpc/conn_client.c:1054:7: warning: variable 'did_discard' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 -- net/rxrpc/conn_client.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 20210418904b..8e7434e92097 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -162,7 +162,6 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) */ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { - struct rxrpc_skb_priv *sp; struct sk_buff *skb; unsigned long resend_at; rxrpc_seq_t cursor, seq, top; @@ -207,7 +206,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb, rxrpc_skb_tx_seen); - sp = rxrpc_skb(skb); if (anno_type == RXRPC_TX_ANNO_UNACK) { if (ktime_after(skb->tstamp, max_age)) { diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5736f643c516..e4bfbd7e48a8 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -1051,7 +1051,6 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work) container_of(work, struct rxrpc_net, client_conn_reaper); unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; - bool did_discard = false; _enter(""); @@ -1113,7 +1112,6 @@ next: * If someone re-sets the flag and re-gets the ref, that's fine. 
*/ rxrpc_put_connection(conn); - did_discard = true; nr_conns--; goto next; -- cgit v1.2.3 From 887763bbc34112f4126ec52d16072ba736c83a6f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:36 +0100 Subject: rxrpc: Display call expect-receive-by timeout in proc Display in /proc/net/rxrpc/calls the timeout by which a call next expects to receive a packet. This makes it easier to debug timeout issues. Signed-off-by: David Howells --- net/rxrpc/proc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index d9fca8c4bcdc..bc6f27c8869d 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -63,6 +63,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_peer *peer; struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + unsigned long timeout = 0, nowj; rxrpc_seq_t tx_hard_ack, rx_hard_ack; char lbuff[50], rbuff[50]; @@ -71,7 +72,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) "Proto Local " " Remote " " SvID ConnID CallID End Use State Abort " - " UserID\n"); + " UserID TxSeq TW RxSeq RW RxTimo\n"); return 0; } @@ -94,11 +95,17 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) else strcpy(rbuff, "no_connection"); + if (call->state != RXRPC_CALL_SERVER_PREALLOC) { + timeout = READ_ONCE(call->expect_rx_by); + nowj = jiffies; + timeout -= jiffies; + } + tx_hard_ack = READ_ONCE(call->tx_hard_ack); rx_hard_ack = READ_ONCE(call->rx_hard_ack); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx %08x %02x %08x %02x\n", + " %-8.8s %08x %lx %08x %02x %08x %02x %06lx\n", lbuff, rbuff, call->service_id, @@ -110,7 +117,8 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->abort_code, call->user_call_ID, tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, - rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack); + rx_hard_ack, 
READ_ONCE(call->rx_top) - rx_hard_ack, + timeout); return 0; } -- cgit v1.2.3 From 6b97bd7a272cddc48adb384142db99a935834765 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:36 +0100 Subject: rxrpc: Show some more information through /proc files Show the four current call IDs in /proc/net/rxrpc/conns. Show the current packet Rx serial number in /proc/net/rxrpc/calls. Signed-off-by: David Howells --- net/rxrpc/proc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index bc6f27c8869d..163d05df339d 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -72,7 +72,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) "Proto Local " " Remote " " SvID ConnID CallID End Use State Abort " - " UserID TxSeq TW RxSeq RW RxTimo\n"); + " UserID TxSeq TW RxSeq RW RxSerial RxTimo\n"); return 0; } @@ -105,7 +105,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rx_hard_ack = READ_ONCE(call->rx_hard_ack); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx %08x %02x %08x %02x %06lx\n", + " %-8.8s %08x %lx %08x %02x %08x %02x %08x %06lx\n", lbuff, rbuff, call->service_id, @@ -118,6 +118,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->user_call_ID, tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack, + call->rx_serial, timeout); return 0; @@ -187,7 +188,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) print: seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %s %3u" - " %s %08x %08x %08x\n", + " %s %08x %08x %08x %08x %08x %08x %08x\n", lbuff, rbuff, conn->service_id, @@ -197,7 +198,11 @@ print: rxrpc_conn_states[conn->state], key_serial(conn->params.key), atomic_read(&conn->serial), - conn->hi_serial); + conn->hi_serial, + conn->channels[0].call_id, + conn->channels[1].call_id, + conn->channels[2].call_id, + 
conn->channels[3].call_id); return 0; } -- cgit v1.2.3 From f3f8337c9e2a4964671c652469202ec485afddc0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:36 +0100 Subject: rxrpc: Fix the trace for terminal ACK (re)transmission Fix the trace for terminal ACK (re)transmission to put in the right parameters. Signed-off-by: David Howells --- net/rxrpc/conn_event.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8229a52c2acd..d46a68807f08 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -129,8 +129,10 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); break; case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, - RXRPC_ACK_DUPLICATE, 0); + trace_rxrpc_tx_ack(NULL, serial, + ntohl(pkt.ack.firstPacket), + ntohl(pkt.ack.serial), + pkt.ack.reason, 0); _proto("Tx ACK %%%u [re]", serial); break; } -- cgit v1.2.3 From 4764c0da69dc500791c840c88dfd940d13b452e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:37 +0100 Subject: rxrpc: Trace packet transmission Trace successful packet transmission (kernel_sendmsg() succeeded, that is) in AF_RXRPC. We can share the enum that defines the transmission points with the trace_rxrpc_tx_fail() tracepoint, so rename its constants to be applicable to both. Also, save the internal call->debug_id in the rxrpc_channel struct so that it can be used in retransmission trace lines. 
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 107 +++++++++++++++++++++++++++++-------------- net/rxrpc/ar-internal.h | 1 + net/rxrpc/conn_client.c | 1 + net/rxrpc/conn_event.c | 13 ++++-- net/rxrpc/input.c | 11 ++++- net/rxrpc/local_event.c | 5 +- net/rxrpc/output.c | 32 ++++++++++--- net/rxrpc/rxkad.c | 7 ++- 8 files changed, 127 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4fff00e9da8a..2aa6f615b60d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -211,18 +211,18 @@ enum rxrpc_congest_change { rxrpc_cong_saw_nack, }; -enum rxrpc_tx_fail_trace { - rxrpc_tx_fail_call_abort, - rxrpc_tx_fail_call_ack, - rxrpc_tx_fail_call_data_frag, - rxrpc_tx_fail_call_data_nofrag, - rxrpc_tx_fail_call_final_resend, - rxrpc_tx_fail_conn_abort, - rxrpc_tx_fail_conn_challenge, - rxrpc_tx_fail_conn_response, - rxrpc_tx_fail_reject, - rxrpc_tx_fail_version_keepalive, - rxrpc_tx_fail_version_reply, +enum rxrpc_tx_point { + rxrpc_tx_point_call_abort, + rxrpc_tx_point_call_ack, + rxrpc_tx_point_call_data_frag, + rxrpc_tx_point_call_data_nofrag, + rxrpc_tx_point_call_final_resend, + rxrpc_tx_point_conn_abort, + rxrpc_tx_point_rxkad_challenge, + rxrpc_tx_point_rxkad_response, + rxrpc_tx_point_reject, + rxrpc_tx_point_version_keepalive, + rxrpc_tx_point_version_reply, }; #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -452,18 +452,18 @@ enum rxrpc_tx_fail_trace { EM(RXRPC_CALL_LOCAL_ERROR, "LocalError") \ E_(RXRPC_CALL_NETWORK_ERROR, "NetError") -#define rxrpc_tx_fail_traces \ - EM(rxrpc_tx_fail_call_abort, "CallAbort") \ - EM(rxrpc_tx_fail_call_ack, "CallAck") \ - EM(rxrpc_tx_fail_call_data_frag, "CallDataFrag") \ - EM(rxrpc_tx_fail_call_data_nofrag, "CallDataNofrag") \ - EM(rxrpc_tx_fail_call_final_resend, "CallFinalResend") \ - EM(rxrpc_tx_fail_conn_abort, "ConnAbort") \ - EM(rxrpc_tx_fail_conn_challenge, "ConnChall") \ - 
EM(rxrpc_tx_fail_conn_response, "ConnResp") \ - EM(rxrpc_tx_fail_reject, "Reject") \ - EM(rxrpc_tx_fail_version_keepalive, "VerKeepalive") \ - E_(rxrpc_tx_fail_version_reply, "VerReply") +#define rxrpc_tx_points \ + EM(rxrpc_tx_point_call_abort, "CallAbort") \ + EM(rxrpc_tx_point_call_ack, "CallAck") \ + EM(rxrpc_tx_point_call_data_frag, "CallDataFrag") \ + EM(rxrpc_tx_point_call_data_nofrag, "CallDataNofrag") \ + EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ + EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ + EM(rxrpc_tx_point_reject, "Reject") \ + EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ + EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \ + EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ + E_(rxrpc_tx_point_version_reply, "VerReply") /* * Export enum symbols via userspace. @@ -488,7 +488,7 @@ rxrpc_propose_ack_traces; rxrpc_propose_ack_outcomes; rxrpc_congest_modes; rxrpc_congest_changes; -rxrpc_tx_fail_traces; +rxrpc_tx_points; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -801,7 +801,7 @@ TRACE_EVENT(rxrpc_transmit, ); TRACE_EVENT(rxrpc_rx_data, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, + TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags, u8 anno), TP_ARGS(call, seq, serial, flags, anno), @@ -815,7 +815,7 @@ TRACE_EVENT(rxrpc_rx_data, ), TP_fast_assign( - __entry->call = call->debug_id; + __entry->call = call; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; @@ -918,6 +918,37 @@ TRACE_EVENT(rxrpc_rx_rwind_change, __entry->wake ? 
" wake" : "") ); +TRACE_EVENT(rxrpc_tx_packet, + TP_PROTO(unsigned int call_id, struct rxrpc_wire_header *whdr, + enum rxrpc_tx_point where), + + TP_ARGS(call_id, whdr, where), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_tx_point, where ) + __field_struct(struct rxrpc_wire_header, whdr ) + ), + + TP_fast_assign( + __entry->call = call_id; + memcpy(&__entry->whdr, whdr, sizeof(__entry->whdr)); + ), + + TP_printk("c=%08x %08x:%08x:%08x:%04x %08x %08x %02x %02x %s %s", + __entry->call, + ntohl(__entry->whdr.epoch), + ntohl(__entry->whdr.cid), + ntohl(__entry->whdr.callNumber), + ntohs(__entry->whdr.serviceId), + ntohl(__entry->whdr.serial), + ntohl(__entry->whdr.seq), + __entry->whdr.type, __entry->whdr.flags, + __entry->whdr.type <= 15 ? + __print_symbolic(__entry->whdr.type, rxrpc_pkts) : "?UNK", + __print_symbolic(__entry->where, rxrpc_tx_points)) + ); + TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags, bool retrans, bool lose), @@ -928,6 +959,8 @@ TRACE_EVENT(rxrpc_tx_data, __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) __field(rxrpc_serial_t, serial ) + __field(u32, cid ) + __field(u32, call_id ) __field(u8, flags ) __field(bool, retrans ) __field(bool, lose ) @@ -935,6 +968,8 @@ TRACE_EVENT(rxrpc_tx_data, TP_fast_assign( __entry->call = call->debug_id; + __entry->cid = call->cid; + __entry->call_id = call->call_id; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; @@ -942,8 +977,10 @@ TRACE_EVENT(rxrpc_tx_data, __entry->lose = lose; ), - TP_printk("c=%08x DATA %08x q=%08x fl=%02x%s%s", + TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s%s", __entry->call, + __entry->cid, + __entry->call_id, __entry->serial, __entry->seq, __entry->flags, @@ -952,7 +989,7 @@ TRACE_EVENT(rxrpc_tx_data, ); TRACE_EVENT(rxrpc_tx_ack, - TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + TP_PROTO(unsigned int call, rxrpc_serial_t serial, rxrpc_seq_t 
ack_first, rxrpc_serial_t ack_serial, u8 reason, u8 n_acks), @@ -968,7 +1005,7 @@ TRACE_EVENT(rxrpc_tx_ack, ), TP_fast_assign( - __entry->call = call ? call->debug_id : 0; + __entry->call = call; __entry->serial = serial; __entry->ack_first = ack_first; __entry->ack_serial = ack_serial; @@ -1434,29 +1471,29 @@ TRACE_EVENT(rxrpc_rx_icmp, TRACE_EVENT(rxrpc_tx_fail, TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, int ret, - enum rxrpc_tx_fail_trace what), + enum rxrpc_tx_point where), - TP_ARGS(debug_id, serial, ret, what), + TP_ARGS(debug_id, serial, ret, where), TP_STRUCT__entry( __field(unsigned int, debug_id ) __field(rxrpc_serial_t, serial ) __field(int, ret ) - __field(enum rxrpc_tx_fail_trace, what ) + __field(enum rxrpc_tx_point, where ) ), TP_fast_assign( __entry->debug_id = debug_id; __entry->serial = serial; __entry->ret = ret; - __entry->what = what; + __entry->where = where; ), TP_printk("c=%08x r=%x ret=%d %s", __entry->debug_id, __entry->serial, __entry->ret, - __print_symbolic(__entry->what, rxrpc_tx_fail_traces)) + __print_symbolic(__entry->where, rxrpc_tx_points)) ); TRACE_EVENT(rxrpc_call_reset, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5fb7d3254d9e..7eee955a768a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -420,6 +420,7 @@ struct rxrpc_connection { struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call __rcu *call; /* Active call */ + unsigned int call_debug_id; /* call->debug_id */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index e4bfbd7e48a8..f8f37188a932 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -590,6 +590,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, */ smp_wmb(); chan->call_id = call_id; + chan->call_debug_id = call->debug_id; 
rcu_assign_pointer(chan->call, call); wake_up(&call->waitq); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index d46a68807f08..84d40ba9856f 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -129,7 +129,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); break; case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(NULL, serial, + trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), pkt.ack.reason, 0); @@ -140,8 +140,11 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); conn->params.peer->last_tx_at = ktime_get_real(); if (ret < 0) - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_fail_call_final_resend); + trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret, + rxrpc_tx_point_call_final_resend); + else + trace_rxrpc_tx_packet(chan->call_debug_id, &pkt.whdr, + rxrpc_tx_point_call_final_resend); _leave(""); } @@ -242,11 +245,13 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_fail_conn_abort); + rxrpc_tx_point_conn_abort); _debug("sendmsg failed: %d", ret); return -EAGAIN; } + trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); + conn->params.peer->last_tx_at = ktime_get_real(); _leave(" = 0"); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 608d078a4981..8989d760b6b2 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -496,7 +496,7 @@ next_subpacket: return rxrpc_proto_abort("LSA", call, seq); } - trace_rxrpc_rx_data(call, seq, serial, flags, annotation); + trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation); if (before_eq(seq, hard_ack)) { ack = RXRPC_ACK_DUPLICATE; ack_serial = serial; @@ 
-592,6 +592,10 @@ ack: rxrpc_propose_ACK(call, ack, skew, ack_serial, immediate_ack, true, rxrpc_propose_ack_input_data); + else + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, + false, true, + rxrpc_propose_ack_input_data); if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) rxrpc_notify_socket(call); @@ -1262,6 +1266,11 @@ void rxrpc_data_ready(struct sock *udp_sk) /* But otherwise we need to retransmit the final packet * from data cached in the connection record. */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) + trace_rxrpc_rx_data(chan->call_debug_id, + sp->hdr.seq, + sp->hdr.serial, + sp->hdr.flags, 0); rxrpc_post_packet_to_conn(conn, skb); goto out_unlock; } diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 8325f1b86840..13bd8a4dfac7 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -72,7 +72,10 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, - rxrpc_tx_fail_version_reply); + rxrpc_tx_point_version_reply); + else + trace_rxrpc_tx_packet(local->debug_id, &whdr, + rxrpc_tx_point_version_reply); _leave(""); } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f03de1c59ba3..801dbf3d3478 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -183,7 +183,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, serial = atomic_inc_return(&conn->serial); pkt->whdr.serial = htonl(serial); - trace_rxrpc_tx_ack(call, serial, + trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); @@ -212,7 +212,10 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, conn->params.peer->last_tx_at = ktime_get_real(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_fail_call_ack); + rxrpc_tx_point_call_ack); + else + trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, + 
rxrpc_tx_point_call_ack); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -299,7 +302,10 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) conn->params.peer->last_tx_at = ktime_get_real(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_fail_call_abort); + rxrpc_tx_point_call_abort); + else + trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, + rxrpc_tx_point_call_abort); rxrpc_put_connection(conn); @@ -396,7 +402,10 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, up_read(&conn->params.local->defrag_sem); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_fail_call_data_nofrag); + rxrpc_tx_point_call_data_nofrag); + else + trace_rxrpc_tx_packet(call->debug_id, &whdr, + rxrpc_tx_point_call_data_nofrag); if (ret == -EMSGSIZE) goto send_fragmentable; @@ -488,7 +497,10 @@ send_fragmentable: if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_fail_call_data_frag); + rxrpc_tx_point_call_data_frag); + else + trace_rxrpc_tx_packet(call->debug_id, &whdr, + rxrpc_tx_point_call_data_frag); up_write(&conn->params.local->defrag_sem); goto done; @@ -545,7 +557,10 @@ void rxrpc_reject_packets(struct rxrpc_local *local) ret = kernel_sendmsg(local->socket, &msg, iov, 2, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, - rxrpc_tx_fail_reject); + rxrpc_tx_point_reject); + else + trace_rxrpc_tx_packet(local->debug_id, &whdr, + rxrpc_tx_point_reject); } rxrpc_free_skb(skb, rxrpc_skb_rx_freed); @@ -597,7 +612,10 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); if (ret < 0) trace_rxrpc_tx_fail(peer->debug_id, 0, ret, - rxrpc_tx_fail_version_keepalive); + rxrpc_tx_point_version_keepalive); + else + trace_rxrpc_tx_packet(peer->debug_id, &whdr, + rxrpc_tx_point_version_keepalive); peer->last_tx_at = ktime_get_real(); _leave(""); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 
278ac0807a60..6988073ae842 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -665,11 +665,13 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_fail_conn_challenge); + rxrpc_tx_point_rxkad_challenge); return -EAGAIN; } conn->params.peer->last_tx_at = ktime_get_real(); + trace_rxrpc_tx_packet(conn->debug_id, &whdr, + rxrpc_tx_point_rxkad_challenge); _leave(" = 0"); return 0; } @@ -721,11 +723,12 @@ static int rxkad_send_response(struct rxrpc_connection *conn, ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_fail_conn_response); + rxrpc_tx_point_rxkad_response); return -EAGAIN; } conn->params.peer->last_tx_at = ktime_get_real(); + trace_rxrpc_tx_packet(0, &whdr, rxrpc_tx_point_rxkad_response); _leave(" = 0"); return 0; } -- cgit v1.2.3 From 4272d3034e69aea6e17085ba285d14f5824b430d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:37 +0100 Subject: rxrpc: Trace socket notification Trace notifications from the softirq side of the socket to the process-context side. 
Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 20 ++++++++++++++++++++ net/rxrpc/input.c | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c1a800a6dee3..196587b8f204 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1528,6 +1528,26 @@ TRACE_EVENT(rxrpc_call_reset, __entry->tx_seq, __entry->rx_seq) ); +TRACE_EVENT(rxrpc_notify_socket, + TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial), + + TP_ARGS(debug_id, serial), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(rxrpc_serial_t, serial ) + ), + + TP_fast_assign( + __entry->debug_id = debug_id; + __entry->serial = serial; + ), + + TP_printk("c=%08x r=%08x", + __entry->debug_id, + __entry->serial) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8989d760b6b2..cfdc199c6351 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -597,8 +597,10 @@ ack: false, true, rxrpc_propose_ack_input_data); - if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) + if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) { + trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); + } _leave(" [queued]"); } -- cgit v1.2.3 From 4075295ab87670e33eaf98389e319ce84c54c8e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:37 +0100 Subject: rxrpc: Increase the size of a call's Rx window Increase the size of a call's Rx window from 32 to 63 - ie. one less than the size of the ring buffer. This makes large data transfers perform better when the Tx window on the other side is around 64 (as is the case with Auristor's YFS fileserver). If the server window size is ~32 or smaller, this should make no difference. 
Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7eee955a768a..e791d35ee34b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -589,7 +589,7 @@ struct rxrpc_call { */ #define RXRPC_RXTX_BUFF_SIZE 64 #define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) -#define RXRPC_INIT_RX_WINDOW_SIZE 32 +#define RXRPC_INIT_RX_WINDOW_SIZE 63 struct sk_buff **rxtx_buffer; u8 *rxtx_annotations; #define RXRPC_TX_ANNO_ACK 0 -- cgit v1.2.3 From a71a2651bdd3ad9ccae7d8e8c6782727c7ecba98 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:37 +0100 Subject: rxrpc: Propose, but don't immediately transmit, the final ACK for a call The final ACK that closes out an rxrpc call needs to be transmitted by the client unless we're going to follow up with a DATA packet for a new call on the same channel (which implicitly ACK's the previous call, thereby saving an ACK). Currently, we don't do that, so if no follow on call is immediately forthcoming, the server will resend the last DATA packet - at which point rxrpc_conn_retransmit_call() will be triggered and will (re)send the final ACK. But the server has to hold on to the last packet until the ACK is received, thereby holding up its resources. Fix the client side to propose a delayed final ACK, to be transmitted after a short delay, assuming the call isn't superseded by a new one. 
Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 7bff716e911e..02f1f768e16a 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -144,13 +144,11 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); -#if 0 // TODO: May want to transmit final ACK under some circumstances anyway if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, false, true, rxrpc_propose_ack_terminal_ack); - rxrpc_send_ack_packet(call, false, NULL); + //rxrpc_send_ack_packet(call, false, NULL); } -#endif write_lock_bh(&call->state_lock); -- cgit v1.2.3 From d0b35a42031a3107a5735e0d0a605a68f530a96b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 23 Jul 2018 17:18:38 +0100 Subject: rxrpc: Transmit more ACKs during data reception Immediately flush any outstanding ACK on entry to rxrpc_recvmsg_data() - which transfers data to the target buffers - if we previously had an Rx underrun (ie. we returned -EAGAIN because we ran out of received data). This lets the server know that we've managed to receive something. Also flush any outstanding ACK after calling the function if it hit -EAGAIN to let the server know we processed some data. It might be better to send more ACKs, possibly on a time-based scheme, but that needs some more consideration. With this and some additional AFS patches, it is possible to get large unencrypted O_DIRECT reads to be almost as fast as NFS over TCP. It looks like it might be theoretically possible to improve performance yet more for a server running a single operation as investigation of packet timestamps indicates that the server keeps stalling.
The issue appears to be that rxrpc runs into trouble with ACK packets getting batched together (up to ~32 at a time) somewhere between the IP transmit queue on the client and the ethernet receive queue on the server. However, this case isn't too much of a worry as even a lightly loaded server should be receiving sufficient packet flux to flush the ACK packets to the UDP socket. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/recvmsg.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e791d35ee34b..9d9278a13d91 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -479,6 +479,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ + RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ }; /* diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 02f1f768e16a..a57ea96c84ea 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -313,6 +313,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; + if (test_and_clear_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags) && + call->ackr_reason) + rxrpc_send_ack_packet(call, false, NULL); + rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; @@ -412,6 +416,8 @@ out: done: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); + if (ret == -EAGAIN) + set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags); return ret; } @@ -684,6 +690,17 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: + switch (call->ackr_reason) { + case RXRPC_ACK_IDLE: + break; + case RXRPC_ACK_DELAY: + if (ret != -EAGAIN) + break; + /* Fall
through */ + default: + rxrpc_send_ack_packet(call, false, NULL); + } + if (_service) *_service = call->service_id; mutex_unlock(&call->user_mutex); -- cgit v1.2.3 From f394ad28feffbeebab77c8bf9a203bd49b957c9a Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Mon, 30 Jul 2018 22:48:41 -0700 Subject: rds: rds_ib_recv_alloc_cache() should call alloc_percpu_gfp() instead Currently, rds_ib_conn_alloc() calls rds_ib_recv_alloc_caches() without passing along the gfp_t flag. But rds_ib_recv_alloc_caches() and rds_ib_recv_alloc_cache() should take a gfp_t parameter so that rds_ib_recv_alloc_cache() can call alloc_percpu_gfp() using the correct flag instead of calling alloc_percpu(). Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.h | 2 +- net/rds/ib_cm.c | 2 +- net/rds/ib_recv.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index beb95b893f78..73427ff439f9 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -400,7 +400,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); int rds_ib_recv_init(void); void rds_ib_recv_exit(void); int rds_ib_recv_path(struct rds_conn_path *conn); -int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index a33b82dc0804..0d654d99fe41 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1102,7 +1102,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) if (!ic) return -ENOMEM; - ret = rds_ib_recv_alloc_caches(ic); + ret = rds_ib_recv_alloc_caches(ic, gfp); if (ret) { kfree(ic); return ret; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 
557ccbb1ce00..d300186b8dc0 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -98,12 +98,12 @@ static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) } } -static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp) { struct rds_ib_cache_head *head; int cpu; - cache->percpu = alloc_percpu(struct rds_ib_cache_head); + cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp); if (!cache->percpu) return -ENOMEM; @@ -118,13 +118,13 @@ static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) return 0; } -int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp) { int ret; - ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp); if (!ret) { - ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp); if (ret) free_percpu(ic->i_cache_incs.percpu); } -- cgit v1.2.3 From e65d4d96334e3ff4fe0064612a93a51c63de08de Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Mon, 30 Jul 2018 22:48:42 -0700 Subject: rds: Remove IPv6 dependency This patch removes the IPv6 dependency from RDS. Signed-off-by: Ka-Cheong Poon Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/Kconfig | 2 +- net/rds/af_rds.c | 32 +++++++++++++++++++------------- net/rds/bind.c | 4 +++- net/rds/connection.c | 26 ++++++++++++++++++++++++-- net/rds/ib.c | 31 ++++++++++++++++++++++++++----- net/rds/ib_cm.c | 9 +++++++++ net/rds/ib_rdma.c | 2 ++ net/rds/rdma_transport.c | 10 ++++++++++ net/rds/recv.c | 2 ++ net/rds/send.c | 2 ++ net/rds/tcp.c | 25 +++++++++++++++++++++++++ net/rds/tcp_listen.c | 21 +++++++++++++++++---- 12 files changed, 140 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 4c7f2595d919..41f75563b54b 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -1,7 +1,7 @@ config RDS tristate "The RDS Protocol" - depends on INET && IPV6 + depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband or TCP. diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index fc5c48b248fe..65387e1e6964 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -156,18 +156,20 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, return sizeof(*sin); } - if (ipv6_addr_type(&rs->rs_conn_addr) & - IPV6_ADDR_MAPPED) { - sin = (struct sockaddr_in *)uaddr; - memset(sin, 0, sizeof(*sin)); - sin->sin_family = AF_INET; - return sizeof(*sin); +#if IS_ENABLED(CONFIG_IPV6) + if (!(ipv6_addr_type(&rs->rs_conn_addr) & + IPV6_ADDR_MAPPED)) { + sin6 = (struct sockaddr_in6 *)uaddr; + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + return sizeof(*sin6); } +#endif - sin6 = (struct sockaddr_in6 *)uaddr; - memset(sin6, 0, sizeof(*sin6)); - sin6->sin6_family = AF_INET6; - return sizeof(*sin6); + sin = (struct sockaddr_in *)uaddr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + return sizeof(*sin); } if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { sin = (struct sockaddr_in *)uaddr; @@ -501,9 +503,7 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = 
sock->sk; struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; struct rds_sock *rs = rds_sk_to_rs(sk); - int addr_type; int ret = 0; lock_sock(sk); @@ -528,7 +528,11 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, rs->rs_conn_port = sin->sin_port; break; - case AF_INET6: +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6; + int addr_type; + sin6 = (struct sockaddr_in6 *)uaddr; if (addr_len < sizeof(struct sockaddr_in6)) { ret = -EINVAL; @@ -575,6 +579,8 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, rs->rs_conn_addr = sin6->sin6_addr; rs->rs_conn_port = sin6->sin6_port; break; + } +#endif default: ret = -EAFNOSUPPORT; diff --git a/net/rds/bind.c b/net/rds/bind.c index ba778760cbc2..3ab55784b637 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -165,7 +165,6 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; - int addr_type; int ret = 0; __be16 port; @@ -183,8 +182,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; +#if IS_ENABLED(CONFIG_IPV6) } else if (uaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; + int addr_type; if (addr_len < sizeof(struct sockaddr_in6)) return -EINVAL; @@ -212,6 +213,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } binding_addr = &sin6->sin6_addr; port = sin6->sin6_port; +#endif } else { return -EINVAL; } diff --git a/net/rds/connection.c b/net/rds/connection.c index 051e35c1e7c6..3bd2f4a5a30d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -63,8 +63,12 @@ static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; +#if 
IS_ENABLED(CONFIG_IPV6) fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); - hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); +#else + fhash = (__force u32)faddr->s6_addr32[3]; +#endif + hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } @@ -201,6 +205,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; + +#if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the * index used for this connection. Otherwise, set it to 0 as * the socket is not bound to an interface. c_bound_if is used @@ -209,6 +215,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) conn->c_bound_if = dev_if; else +#endif conn->c_bound_if = 0; rds_conn_net_set(conn, net); @@ -500,9 +507,11 @@ static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, void *saddr, void *daddr, int flip, bool isv6) { +#if IS_ENABLED(CONFIG_IPV6) if (isv6) rds6_inc_info_copy(inc, iter, saddr, daddr, flip); else +#endif rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); } @@ -581,6 +590,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); } +#if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -588,6 +598,7 @@ static void rds6_conn_message_info(struct socket *sock, unsigned int len, { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } +#endif static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -596,12 +607,14 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len, 
rds_conn_message_info(sock, len, iter, lens, 1); } +#if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 1); } +#endif static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, @@ -611,6 +624,7 @@ static void rds_conn_message_info_retrans(struct socket *sock, rds_conn_message_info(sock, len, iter, lens, 0); } +#if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -618,6 +632,7 @@ static void rds6_conn_message_info_retrans(struct socket *sock, { rds6_conn_message_info(sock, len, iter, lens, 0); } +#endif void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -734,6 +749,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) return 1; } +#if IS_ENABLED(CONFIG_IPV6) static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds6_info_connection *cinfo6 = buffer; @@ -761,6 +777,7 @@ static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) */ return 1; } +#endif static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -774,6 +791,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } +#if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -785,6 +803,7 @@ static void rds6_conn_info(struct socket *sock, unsigned int len, buffer, sizeof(struct rds6_info_connection)); } +#endif int rds_conn_init(void) { @@ -807,12 +826,13 @@ int rds_conn_init(void) rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); +#if IS_ENABLED(CONFIG_IPV6) 
rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_register_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); - +#endif return 0; } @@ -830,11 +850,13 @@ void rds_conn_exit(void) rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); +#if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); +#endif } /* diff --git a/net/rds/ib.c b/net/rds/ib.c index a4245c42d43b..89c6333ecd39 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -321,6 +321,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, return 1; } +#if IS_ENABLED(CONFIG_IPV6) /* IPv6 version of rds_ib_conn_info_visitor(). */ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) @@ -357,6 +358,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, } return 1; } +#endif static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -370,6 +372,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } +#if IS_ENABLED(CONFIG_IPV6) /* IPv6 version of rds_ib_ic_info(). 
*/ static void rds6_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -382,6 +385,7 @@ static void rds6_ib_ic_info(struct socket *sock, unsigned int len, buffer, sizeof(struct rds6_info_rdma_connection)); } +#endif /* * Early RDS/IB was built to only bind to an address if there is an IPoIB @@ -398,7 +402,9 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, { int ret; struct rdma_cm_id *cm_id; +#if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 sin6; +#endif struct sockaddr_in sin; struct sockaddr *sa; bool isv4; @@ -418,6 +424,7 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, sin.sin_addr.s_addr = addr->s6_addr32[3]; sa = (struct sockaddr *)&sin; } else { +#if IS_ENABLED(CONFIG_IPV6) memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr; @@ -432,21 +439,30 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { struct net_device *dev; - if (scope_id == 0) - return -EADDRNOTAVAIL; + if (scope_id == 0) { + ret = -EADDRNOTAVAIL; + goto out; + } /* Use init_net for now as RDS is not network * name space aware. */ dev = dev_get_by_index(&init_net, scope_id); - if (!dev) - return -EADDRNOTAVAIL; + if (!dev) { + ret = -EADDRNOTAVAIL; + goto out; + } if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { dev_put(dev); - return -EADDRNOTAVAIL; + ret = -EADDRNOTAVAIL; + goto out; } dev_put(dev); } +#else + ret = -EADDRNOTAVAIL; + goto out; +#endif } /* rdma_bind_addr will only succeed for IB & iWARP devices */ @@ -461,6 +477,7 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, addr, scope_id, ret, cm_id->device ? 
cm_id->device->node_type : -1); +out: rdma_destroy_id(cm_id); return ret; @@ -491,7 +508,9 @@ void rds_ib_exit(void) rds_ib_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); +#if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); +#endif rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); rds_ib_sysctl_exit(); @@ -553,7 +572,9 @@ int rds_ib_init(void) rds_trans_register(&rds_ib_transport); rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); +#if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); +#endif goto out; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0d654d99fe41..bfbb31f0c7fd 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -678,6 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) return version; } +#if IS_ENABLED(CONFIG_IPV6) /* Given an IPv6 address, find the net_device which hosts that address and * return its index. 
This is used by the rds_ib_cm_handle_connect() code to * find the interface index of where an incoming request comes from when @@ -704,6 +705,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) return idx; } +#endif int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6) @@ -732,6 +734,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, dp = event->param.conn.private_data; if (isv6) { +#if IS_ENABLED(CONFIG_IPV6) dp_cmn = &dp->ricp_v6.dp_cmn; saddr6 = &dp->ricp_v6.dp_saddr; daddr6 = &dp->ricp_v6.dp_daddr; @@ -756,6 +759,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, goto out; } } +#else + err = -EOPNOTSUPP; + goto out; +#endif } else { dp_cmn = &dp->ricp_v4.dp_cmn; ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr); @@ -893,9 +900,11 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ +#if IS_ENABLED(CONFIG_IPV6) if (conn->c_isv6) handler = rds6_rdma_cm_event_handler; else +#endif handler = rds_rdma_cm_event_handler; ic->i_cm_id = rdma_create_id(&init_net, handler, conn, RDMA_PS_TCP, IB_QPT_RC); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e3c8bbbdb43f..99ccafb90410 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -180,6 +180,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } +#if IS_ENABLED(CONFIG_IPV6) void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds6_info_rdma_connection *iinfo6) { @@ -188,6 +189,7 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, iinfo6->rdma_mr_max = pool_1m->max_items; iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages; } +#endif struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index ad78929036ef..6b0f57c83a2a 100644 --- 
a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -39,7 +39,9 @@ /* Global IPv4 and IPv6 RDS RDMA listener cm_id */ static struct rdma_cm_id *rds_rdma_listen_id; +#if IS_ENABLED(CONFIG_IPV6) static struct rdma_cm_id *rds6_rdma_listen_id; +#endif static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, @@ -155,11 +157,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, return rds_rdma_cm_event_handler_cmn(cm_id, event, false); } +#if IS_ENABLED(CONFIG_IPV6) int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { return rds_rdma_cm_event_handler_cmn(cm_id, event, true); } +#endif static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, struct sockaddr *sa, @@ -214,7 +218,9 @@ out: static int rds_rdma_listen_init(void) { int ret; +#if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 sin6; +#endif struct sockaddr_in sin; sin.sin_family = PF_INET; @@ -226,6 +232,7 @@ static int rds_rdma_listen_init(void) if (ret != 0) return ret; +#if IS_ENABLED(CONFIG_IPV6) sin6.sin6_family = PF_INET6; sin6.sin6_addr = in6addr_any; sin6.sin6_port = htons(RDS_CM_PORT); @@ -237,6 +244,7 @@ static int rds_rdma_listen_init(void) /* Keep going even when IPv6 is not enabled in the system. 
*/ if (ret != 0) rdsdebug("Cannot set up IPv6 RDMA listener\n"); +#endif return 0; } @@ -247,11 +255,13 @@ static void rds_rdma_listen_stop(void) rdma_destroy_id(rds_rdma_listen_id); rds_rdma_listen_id = NULL; } +#if IS_ENABLED(CONFIG_IPV6) if (rds6_rdma_listen_id) { rdsdebug("cm %p\n", rds6_rdma_listen_id); rdma_destroy_id(rds6_rdma_listen_id); rds6_rdma_listen_id = NULL; } +#endif } static int rds_rdma_init(void) diff --git a/net/rds/recv.c b/net/rds/recv.c index 03cd8df54c26..504cd6bcc54c 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -793,6 +793,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, rds_info_copy(iter, &minfo, sizeof(minfo)); } +#if IS_ENABLED(CONFIG_IPV6) void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, @@ -817,3 +818,4 @@ void rds6_inc_info_copy(struct rds_incoming *inc, rds_info_copy(iter, &minfo6, sizeof(minfo6)); } +#endif diff --git a/net/rds/send.c b/net/rds/send.c index 18e2b4d3931f..36a5dba56a43 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1103,6 +1103,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) dport = usin->sin_port; break; +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { int addr_type; @@ -1142,6 +1143,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) dport = sin6->sin6_port; break; } +#endif default: ret = -EINVAL; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index f23925af0b8d..2c7b7c352d3e 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -51,7 +51,9 @@ static LIST_HEAD(rds_tcp_tc_list); * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. 
*/ static unsigned int rds_tcp_tc_count; +#if IS_ENABLED(CONFIG_IPV6) static unsigned int rds6_tcp_tc_count; +#endif /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -118,7 +120,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_del_init(&tc->t_list_item); +#if IS_ENABLED(CONFIG_IPV6) rds6_tcp_tc_count--; +#endif if (!tc->t_cpath->cp_conn->c_isv6) rds_tcp_tc_count--; spin_unlock(&rds_tcp_tc_list_lock); @@ -207,7 +211,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); +#if IS_ENABLED(CONFIG_IPV6) rds6_tcp_tc_count++; +#endif if (!tc->t_cpath->cp_conn->c_isv6) rds_tcp_tc_count++; spin_unlock(&rds_tcp_tc_list_lock); @@ -273,6 +279,7 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } +#if IS_ENABLED(CONFIG_IPV6) /* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped * address. 
@@ -314,12 +321,15 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } +#endif static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, __u32 scope_id) { struct net_device *dev = NULL; +#if IS_ENABLED(CONFIG_IPV6) int ret; +#endif if (ipv6_addr_v4mapped(addr)) { if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL) @@ -340,9 +350,11 @@ static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, } rcu_read_unlock(); } +#if IS_ENABLED(CONFIG_IPV6) ret = ipv6_chk_addr(net, addr, dev, 0); if (ret) return 0; +#endif return -EADDRNOTAVAIL; } @@ -545,18 +557,27 @@ static __net_init int rds_tcp_init_net(struct net *net) err = -ENOMEM; goto fail; } + +#if IS_ENABLED(CONFIG_IPV6) rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); +#else + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); +#endif if (!rtn->rds_tcp_listen_sock) { pr_warn("could not set up IPv6 listen sock\n"); +#if IS_ENABLED(CONFIG_IPV6) /* Try IPv4 as some systems disable IPv6 */ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); if (!rtn->rds_tcp_listen_sock) { +#endif unregister_net_sysctl_table(rtn->rds_tcp_sysctl); rtn->rds_tcp_sysctl = NULL; err = -EAFNOSUPPORT; goto fail; +#if IS_ENABLED(CONFIG_IPV6) } +#endif } INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); return 0; @@ -670,7 +691,9 @@ static void rds_tcp_exit(void) rds_tcp_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); +#if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); +#endif unregister_pernet_device(&rds_tcp_net_ops); rds_tcp_destroy_conns(); rds_trans_unregister(&rds_tcp_transport); @@ -702,7 +725,9 @@ static int rds_tcp_init(void) rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); +#if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); +#endif goto out; out_recv: diff 
--git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 0cf0147117d8..c12203f646da 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -132,7 +132,10 @@ int rds_tcp_accept_one(struct socket *sock) int conn_state; struct rds_conn_path *cp; struct in6_addr *my_addr, *peer_addr; - int dev_if; +#if !IS_ENABLED(CONFIG_IPV6) + struct in6_addr saddr, daddr; +#endif + int dev_if = 0; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; @@ -165,12 +168,21 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); +#if IS_ENABLED(CONFIG_IPV6) my_addr = &new_sock->sk->sk_v6_rcv_saddr; peer_addr = &new_sock->sk->sk_v6_daddr; - rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", +#else + ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr); + ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr); + my_addr = &saddr; + peer_addr = &daddr; +#endif + rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", + sock->sk->sk_family, my_addr, ntohs(inet->inet_sport), peer_addr, ntohs(inet->inet_dport)); +#if IS_ENABLED(CONFIG_IPV6) /* sk_bound_dev_if is not set if the peer address is not link local * address. In this case, it happens that mcast_oif is set. So * just use it. @@ -184,9 +196,10 @@ int rds_tcp_accept_one(struct socket *sock) } else { dev_if = new_sock->sk->sk_bound_dev_if; } +#endif + conn = rds_conn_create(sock_net(sock->sk), - &new_sock->sk->sk_v6_rcv_saddr, - &new_sock->sk->sk_v6_daddr, + my_addr, peer_addr, &rds_tcp_transport, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { -- cgit v1.2.3 From b053fcc4a1c3c8f9080e2904acee73481fb58c44 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 31 Jul 2018 17:01:37 +0100 Subject: net/tipc: remove redundant variables 'tn' and 'oport' Variables 'tn' and 'oport' are being assigned but are never used hence they are redundant and can be removed. 
Cleans up clang warnings: warning: variable 'oport' set but not used [-Wunused-but-set-variable] warning: variable 'tn' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/socket.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3763bedecf5f..c1e93c9515bc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -411,7 +411,6 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, int kern) { - struct tipc_net *tn; const struct proto_ops *ops; struct sock *sk; struct tipc_sock *tsk; @@ -446,7 +445,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; - tn = net_generic(sock_net(sk), tipc_net_id); /* Finish initializing socket data structures */ sock->ops = ops; @@ -1117,7 +1115,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, u32 self = tipc_own_addr(net); u32 type, lower, upper, scope; struct sk_buff *skb, *_skb; - u32 portid, oport, onode; + u32 portid, onode; struct sk_buff_head tmpq; struct list_head dports; struct tipc_msg *hdr; @@ -1133,7 +1131,6 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, user = msg_user(hdr); mtyp = msg_type(hdr); hlen = skb_headroom(skb) + msg_hdr_sz(hdr); - oport = msg_origport(hdr); onode = msg_orignode(hdr); type = msg_nametype(hdr); -- cgit v1.2.3 From 969d509003b8d64f5766a16143bf22d56c1b66fe Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 1 Aug 2018 00:50:24 +0800 Subject: net/tls: Use kmemdup to simplify the code Kmemdup is better than kmalloc+memcpy. So replace them. Signed-off-by: zhong jiang Signed-off-by: David S. 
Miller --- net/tls/tls_device.c | 3 +-- net/tls/tls_sw.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1e968d238adf..292742e50bfa 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -716,12 +716,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); ctx->tx.rec_seq_size = rec_seq_size; - ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 33838f11fafa..ff3a6904a722 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1173,12 +1173,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); cctx->rec_seq_size = rec_seq_size; - cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(cctx->rec_seq, rec_seq, rec_seq_size); if (sw_ctx_tx) { sg_init_table(sw_ctx_tx->sg_encrypted_data, -- cgit v1.2.3 From 83ba4645152d1177c161750e1064e3a8e7cee19b Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 31 Jul 2018 21:18:11 +0200 Subject: net: add helpers checking if socket can be bound to nonlocal address The construction "net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent" is present three times and its IPv6 counterpart is also present three times. We introduce two small helpers to characterize these tests uniformly. Signed-off-by: Vincent Bernat Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 8 ++++++++ include/net/ipv6.h | 7 +++++++ net/ipv4/af_inet.c | 3 +-- net/ipv4/ping.c | 6 ++---- net/ipv6/af_inet6.c | 6 ++---- net/ipv6/datagram.c | 3 +-- 6 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 314be484c696..e03b93360f33 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -359,4 +359,12 @@ static inline bool inet_get_convert_csum(struct sock *sk) return !!inet_sk(sk)->convert_csum; } + +static inline bool inet_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv4.sysctl_ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + #endif /* _INET_SOCK_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a44509f4e985..82deb684ba73 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,6 +766,13 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2a0a3bab6b5..ee707b91d1a7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -486,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b54c964ad925..8d7aaf118a30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c 
@@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && + if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) @@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - isk->freebind || isk->transparent || has_addr || + if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c9535354149f..020f6e14a7af 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!net->ipv6.sysctl.ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!ipv6_can_nonlocal_bind(net, inet) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index f0264dfd38de..1ede7a16a0be 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -803,8 +803,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = 
__ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && -- cgit v1.2.3 From 432e05d328921c68c35bfdeff7d7b7400b8e3d1a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:03 +0200 Subject: net: ipv4: Control SKB reprioritization after forwarding After IPv4 packets are forwarded, the priority of the corresponding SKB is updated according to the TOS field of IPv4 header. This overrides any prioritization done earlier by e.g. an skbedit action or ingress-qos-map defined at a vlan device. Such overriding may not always be desirable. Even if the packet ends up being routed, which implies this is an L3 network node, an administrator may wish to preserve whatever prioritization was done earlier on in the pipeline. Therefore introduce a sysctl that controls this behavior. Keep the default value at 1 to maintain backward-compatible behavior. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ip_forward.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 77c37fb0b6a6..e74515ecaa9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -81,6 +81,15 @@ fib_multipath_hash_policy - INTEGER 0 - Layer 3 1 - Layer 4 +ip_forward_update_priority - INTEGER + Whether to update SKB priority from "TOS" field in IPv4 header after it + is forwarded. The new SKB priority is mapped from TOS field value + according to an rt_tos2priority table (see e.g. man tc-prio). 
+ Default: 1 (Update priority.) + Possible values: + 0 - Do not update priority. + 1 - Update priority. + route/max_size - INTEGER Maximum number of routes allowed in the kernel. Increase this when using large numbers of interfaces and/or routes. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 661348f23ea5..e47503b4e4d1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,6 +98,7 @@ struct netns_ipv4 { int sysctl_ip_default_ttl; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_fwd_update_priority; int sysctl_ip_nonlocal_bind; /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ee707b91d1a7..20fda8fb8ffd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net) * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index b54b948b0596..32662e9e5d21 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - skb->priority = rt_tos2priority(iph->tos); + if (net->ipv4.sysctl_ip_fwd_update_priority) + skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, rt->dst.dev, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5fa335fd3852..e21dda015513 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -663,6 +663,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ip_forward_update_priority", + .data = 
&init_net.ipv4.sysctl_ip_fwd_update_priority, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, -- cgit v1.2.3 From d18c5d1995aa322b722fa731397e28ebcd00b3c6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:42 +0200 Subject: net: ipv4: Notify about changes to ip_forward_update_priority Drivers may make offloading decision based on whether ip_forward_update_priority is enabled or not. Therefore distribute netevent notifications to give them a chance to react to a change. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + net/ipv4/sysctl_net_ipv4.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/netevent.h b/include/net/netevent.h index d9918261701c..4107016c3bb4 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -28,6 +28,7 @@ enum netevent_notif_type { NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e21dda015513..b92f422f2fa8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, return ret; } +static int ipv4_fwd_update_priority(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_ip_fwd_update_priority); + ret = proc_dointvec_minmax(table, write, buffer, 
lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, + net); + + return ret; +} + static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -668,7 +685,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = ipv4_fwd_update_priority, .extra1 = &zero, .extra2 = &one, }, -- cgit v1.2.3 From 984988aa72188453f8c8a42dcf94bba1c57e73aa Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:20 -0700 Subject: tcp: add a helper to calculate size of opt_stats This is to refactor the calculation of the size of opt_stats to a helper function to make the code cleaner and easier for later changes. Suggested-by: Stephen Hemminger Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3bfb9f29520..27bbe6a792b7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3205,6 +3205,29 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } EXPORT_SYMBOL_GPL(tcp_get_info); +static size_t tcp_opt_stats_get_size(void) +{ + return + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -3213,9 +3236,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u64 rate64; u32 rate; - stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 7 * nla_total_size(sizeof(u32)) + - 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; -- cgit v1.2.3 From ba113c3aa79a7f941ac162d05a3620bdc985c58d Mon Sep 17 00:00:00 
2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:21 -0700 Subject: tcp: add data bytes sent stats Introduce a new TCP stat to record the number of bytes sent (RFC4898 tcpEStatsPerfHCDataOctetsOut) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 4 +++- net/ipv4/tcp.c | 6 ++++++ net/ipv4/tcp_output.c | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 58a8d7d71354..d0798dcd2cab 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -181,6 +181,9 @@ struct tcp_sock { u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index e3f6ed8a7064..1c70ed287c3b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -235,6 +235,8 @@ struct tcp_info { __u32 tcpi_delivered; __u32 tcpi_delivered_ce; + + __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -257,7 +259,7 @@ enum { TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ TCP_NLA_DELIVERED, /* Data pkts delivered incl. 
out-of-order */ TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ - + TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27bbe6a792b7..873cb9968ff5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2594,6 +2594,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); tp->compressed_ack = 0; + tp->bytes_sent = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3201,6 +3202,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; + info->tcpi_bytes_sent = tp->bytes_sent; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3225,6 +3227,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ 0; } @@ -3272,6 +3275,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, + TCP_NLA_PAD); + return stats; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 490df62f26d4..861531fe0e97 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1136,6 +1136,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); + tp->bytes_sent += skb->len - tcp_header_size; tcp_internal_pacing(sk, skb); } -- cgit v1.2.3 From fb31c9b9f6c85b1bad569ecedbde78d9e37cd87b Mon Sep 17 00:00:00 2001 From: Wei 
Wang Date: Tue, 31 Jul 2018 17:46:22 -0700 Subject: tcp: add data bytes retransmitted stats Introduce a new TCP stat to record the number of bytes retransmitted (RFC4898 tcpEStatsPerfOctetsRetrans) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 5 +++++ net/ipv4/tcp_output.c | 1 + 4 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d0798dcd2cab..fb67f9a51b95 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -333,6 +333,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ + u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans + * Total data bytes retransmitted + */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 1c70ed287c3b..c31f5100b744 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -237,6 +237,7 @@ struct tcp_info { __u32 tcpi_delivered_ce; __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -260,6 +261,7 @@ enum { TCP_NLA_DELIVERED, /* Data pkts delivered incl. 
out-of-order */ TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ + TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 873cb9968ff5..5ed1be88e922 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->bytes_sent = 0; + tp->bytes_retrans = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3203,6 +3204,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; + info->tcpi_bytes_retrans = tp->bytes_retrans; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3228,6 +3230,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ 0; } @@ -3277,6 +3280,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, + TCP_NLA_PAD); return stats; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 861531fe0e97..50cabf7656f3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2871,6 +2871,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; + tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing 
extended the headroom -- cgit v1.2.3 From 7e10b6554ff2ce7f86d5d3eec3af5db8db482caa Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:23 -0700 Subject: tcp: add dsack blocks received stats Introduce a new TCP stat to record the number of DSACK blocks received (RFC4898 tcpEStatsStackDSACKDups) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 1 + 4 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fb67f9a51b95..da6281c549a5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -188,6 +188,9 @@ struct tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c31f5100b744..0e1c0aec0153 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -238,6 +238,7 @@ struct tcp_info { __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -262,6 +263,7 @@ enum { TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ + TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ };
/* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5ed1be88e922..d6232b598cae 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2596,6 +2596,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->compressed_ack = 0; tp->bytes_sent = 0; tp->bytes_retrans = 0; + tp->dsack_dups = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3205,6 +3206,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; + info->tcpi_dsack_dups = tp->dsack_dups; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3231,6 +3233,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ 0; } @@ -3282,6 +3285,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d51fa358b2b1..fbc85ff7d71d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -874,6 +874,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; + tp->dsack_dups++; } /* It's reordering when higher sequence was delivered (i.e. 
sacked) before -- cgit v1.2.3 From 7ec65372ca534217b53fd208500cf7aac223a383 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:24 -0700 Subject: tcp: add stat of data packet reordering events Introduce a new TCP stat to record the number of reordering events seen and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Applications can use this stat to track the frequency of the reordering events in addition to the existing reordering stat which tracks the magnitude of the latest reordering event. Note: this new stat tracks reordering events triggered by ACKs, which could often be fewer than the actual number of packets being delivered out-of-order. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++-- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_recovery.c | 2 +- 5 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index da6281c549a5..263e37271afd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -220,8 +220,7 @@ struct tcp_sock { #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ - advanced:1, /* mstamp advanced since last lost marking */ - reord:1; /* reordering detected */ + advanced:1; /* mstamp advanced since last lost marking */ } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; @@ -267,6 +266,7 @@ struct tcp_sock { u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reordering; /* Packet reordering metric.
*/ + u32 reord_seen; /* number of data packet reordering events */ u32 snd_up; /* Urgent pointer */ /* diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 0e1c0aec0153..e02d31986ff9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -239,6 +239,7 @@ struct tcp_info { __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ + __u32 tcpi_reord_seen; /* reordering events seen */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -264,6 +265,7 @@ enum { TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ + TCP_NLA_REORD_SEEN, /* reordering events seen */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d6232b598cae..31fa1c080f28 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2597,6 +2597,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->bytes_sent = 0; tp->bytes_retrans = 0; tp->dsack_dups = 0; + tp->reord_seen = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3207,6 +3208,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; + info->tcpi_reord_seen = tp->reord_seen; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3234,6 +3236,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ 0; } @@ -3286,6 +3289,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u64_64bit(stats, 
TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); + nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fbc85ff7d71d..3d6156f07a8d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -906,8 +906,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } - tp->rack.reord = 1; /* This exciting event is worth to be remembered. 8) */ + tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } @@ -1871,6 +1871,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) tp->reordering = min_t(u32, tp->packets_out + addend, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 71593e4400ab..c81aadff769b 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!tp->rack.reord) { + if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ -- cgit v1.2.3 From 13dde04f5c436f3b9f50ccfd0784db6db13401f3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 1 Aug 2018 01:59:56 +0000 Subject: tcp: remove set but not used variable 'skb_size' Fixes gcc '-Wunused-but-set-variable' warning: net/ipv4/tcp_output.c: In function 'tcp_collapse_retrans': net/ipv4/tcp_output.c:2700:6: warning: variable 'skb_size' set but not used [-Wunused-but-set-variable] int skb_size, next_skb_size; ^ Signed-off-by: Wei Yongjun Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 50cabf7656f3..597dbd749f05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2698,9 +2698,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); - int skb_size, next_skb_size; + int next_skb_size; - skb_size = skb->len; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); -- cgit v1.2.3 From 1296ee8ffc74fea4350c756f722000211b38400d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 1 Aug 2018 10:04:02 +0800 Subject: ip_gre: remove redundant variables t_hlen After commit ffc2b6ee4174 ("ip_gre: fix IFLA_MTU ignored on NEWLINK") variable t_hlen is assigned values that are never read, hence they are redundant and can be removed. Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c8ca5d8f0f75..51a5d06085ac 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -985,7 +985,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - int t_hlen; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); @@ -993,8 +992,6 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -1304,13 +1301,11 @@ static const struct net_device_ops gre_tap_netdev_ops = { static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - int t_hlen; tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); - t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; -- cgit v1.2.3 From bd707f17efc9e5dfc0fd05370cb89d2ee41d3558 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 1 Aug 2018 15:10:37 +0800 Subject: strparser: remove redundant variable 'rd_desc' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable 'rd_desc' is being assigned but never used, so can be removed. fix this clang warning: net/strparser/strparser.c:411:20: warning: variable ‘rd_desc’ set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- net/strparser/strparser.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 3a512936eea9..da1a676860ca 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -408,8 +408,6 @@ EXPORT_SYMBOL_GPL(strp_data_ready); static void do_strp_work(struct strparser *strp) { - read_descriptor_t rd_desc; - /* We need the read lock to synchronize with strp_data_ready. We * need the socket lock for calling strp_read_sock. */ @@ -421,8 +419,6 @@ static void do_strp_work(struct strparser *strp) if (strp->paused) goto out; - rd_desc.arg.data = strp; - if (strp_read_sock(strp) == -ENOMEM) queue_work(strp_wq, &strp->work); -- cgit v1.2.3 From 87f70132b08eadc19e5a78e43b814366f9929399 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 1 Aug 2018 15:14:07 +0800 Subject: rds: remove redundant variable 'rds_ibdev' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable 'rds_ibdev' is being assigned but never used, so can be removed. fix this clang warning: net/rds/ib_send.c:762:24: warning: variable ‘rds_ibdev’ set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- net/rds/ib_send.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index c4cdfe491d96..c8dd3125d398 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -759,14 +759,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_send_work *send = NULL; struct ib_send_wr *failed_wr; - struct rds_ib_device *rds_ibdev; u32 pos; u32 work_alloc; int ret; int nr_sig = 0; - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); if (work_alloc != 1) { rds_ib_stats_inc(s_ib_tx_ring_full); -- cgit v1.2.3 From 3d32f4c548bd8f3af58c59521bee9be127f3e87d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:55 +0200 Subject: net: sched: change name of zombie chain to "held_by_acts_only" As mentioned by Cong and Jakub during the review process, it is a bit odd to sometimes (act flow) create a new chain which would be immediately a "zombie". So just rename it to "held_by_acts_only". Signed-off-by: Jiri Pirko Suggested-by: Cong Wang Suggested-by: Jakub Kicinski Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e20aad1987b8..2f78341f2888 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -272,11 +272,10 @@ static void tcf_chain_release_by_act(struct tcf_chain *chain) --chain->action_refcnt; } -static bool tcf_chain_is_zombie(struct tcf_chain *chain) +static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { /* In case all the references are action references, this - * chain is a zombie and should not be listed in the chain - * dump list. + * chain should not be shown to the user. 
*/ return chain->refcnt == chain->action_refcnt; } @@ -1838,10 +1837,9 @@ replay: chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - if (tcf_chain_is_zombie(chain)) { + if (tcf_chain_held_by_acts_only(chain)) { /* The chain exists only because there is - * some action referencing it, meaning it - * is a zombie. + * some action referencing it. */ tcf_chain_hold(chain); } else { @@ -1860,7 +1858,7 @@ replay: } } } else { - if (!chain || tcf_chain_is_zombie(chain)) { + if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); return -EINVAL; } @@ -1988,7 +1986,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; continue; } - if (tcf_chain_is_zombie(chain)) + if (tcf_chain_held_by_acts_only(chain)) continue; err = tc_chain_fill_node(chain, net, skb, block, NETLINK_CB(cb->skb).portid, -- cgit v1.2.3 From 5368140730e4a67169303edd3a13e31fd9b9d355 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:56 +0200 Subject: net: sched: fix notifications for action-held chains Chains that only have action references serve as placeholders. Until a non-action reference is created, user should not be aware of the chain. Also he should not receive any notifications about it. So send notifications for the new chain only in case the chain gets the first non-action reference. Symmetrically to that, when the last non-action reference is dropped, send the notification about deleted chain. Reported-by: Cong Wang Signed-off-by: Jiri Pirko Acked-by: Cong Wang v1->v2: - made __tcf_chain_{get,put}() static as suggested by Cong Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 71 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2f78341f2888..b194a5abfc6a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,16 +262,6 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } -static void tcf_chain_hold_by_act(struct tcf_chain *chain) -{ - ++chain->action_refcnt; -} - -static void tcf_chain_release_by_act(struct tcf_chain *chain) -{ - --chain->action_refcnt; -} - static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { /* In case all the references are action references, this @@ -295,52 +285,77 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast); -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, + u32 chain_index, bool create, + bool by_act) { struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); if (chain) { tcf_chain_hold(chain); - return chain; + } else { + if (!create) + return NULL; + chain = tcf_chain_create(block, chain_index); + if (!chain) + return NULL; } - if (!create) - return NULL; - chain = tcf_chain_create(block, chain_index); - if (!chain) - return NULL; - tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + if (by_act) + ++chain->action_refcnt; + + /* Send notification only in case we got the first + * non-action reference. Until then, the chain acts only as + * a placeholder for actions pointing to it and user ought + * not know about them. 
+ */ + if (chain->refcnt - chain->action_refcnt == 1 && !by_act) + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + return chain; } + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) +{ + return __tcf_chain_get(block, chain_index, create, false); +} EXPORT_SYMBOL(tcf_chain_get); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { - struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); - - tcf_chain_hold_by_act(chain); - return chain; + return __tcf_chain_get(block, chain_index, true, true); } EXPORT_SYMBOL(tcf_chain_get_by_act); static void tc_chain_tmplt_del(struct tcf_chain *chain); -void tcf_chain_put(struct tcf_chain *chain) +static void __tcf_chain_put(struct tcf_chain *chain, bool by_act) { - if (--chain->refcnt == 0) { + if (by_act) + chain->action_refcnt--; + chain->refcnt--; + + /* The last dropped non-action reference will trigger notification. */ + if (chain->refcnt - chain->action_refcnt == 0 && !by_act) tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + + if (chain->refcnt == 0) { tc_chain_tmplt_del(chain); tcf_chain_destroy(chain); } } + +void tcf_chain_put(struct tcf_chain *chain) +{ + __tcf_chain_put(chain, false); +} EXPORT_SYMBOL(tcf_chain_put); void tcf_chain_put_by_act(struct tcf_chain *chain) { - tcf_chain_release_by_act(chain); - tcf_chain_put(chain); + __tcf_chain_put(chain, true); } EXPORT_SYMBOL(tcf_chain_put_by_act); -- cgit v1.2.3 From 290b1c8b1a902c0902df9ec05577ab209296f345 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:57 +0200 Subject: net: sched: make tcf_chain_{get,put}() static These are no longer used outside of cls_api.c so make them static. Move tcf_chain_flush() to avoid fwd declaration of tcf_chain_put(). Signed-off-by: Jiri Pirko v1->v2: - new patch Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 3 --- net/sched/cls_api.c | 34 ++++++++++++++++------------------ 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 22bfc3a13c25..ef727f71336e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -40,11 +40,8 @@ struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); -void tcf_chain_put(struct tcf_chain *chain); void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b194a5abfc6a..e8b0bbd0883f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -232,19 +232,6 @@ static void tcf_chain0_head_change(struct tcf_chain *chain, tcf_chain_head_change_item(item, tp_head); } -static void tcf_chain_flush(struct tcf_chain *chain) -{ - struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - - tcf_chain0_head_change(chain, NULL); - while (tp) { - RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_proto_destroy(tp, NULL); - tp = rtnl_dereference(chain->filter_chain); - tcf_chain_put(chain); - } -} - static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_block *block = chain->block; @@ -316,12 +303,11 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, return chain; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) { return __tcf_chain_get(block, chain_index, create, false); } -EXPORT_SYMBOL(tcf_chain_get); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { 
@@ -347,11 +333,10 @@ static void __tcf_chain_put(struct tcf_chain *chain, bool by_act) } } -void tcf_chain_put(struct tcf_chain *chain) +static void tcf_chain_put(struct tcf_chain *chain) { __tcf_chain_put(chain, false); } -EXPORT_SYMBOL(tcf_chain_put); void tcf_chain_put_by_act(struct tcf_chain *chain) { @@ -365,6 +350,19 @@ static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) tcf_chain_put(chain); } +static void tcf_chain_flush(struct tcf_chain *chain) +{ + struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); + + tcf_chain0_head_change(chain, NULL); + while (tp) { + RCU_INIT_POINTER(chain->filter_chain, tp->next); + tcf_proto_destroy(tp, NULL); + tp = rtnl_dereference(chain->filter_chain); + tcf_chain_put(chain); + } +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; -- cgit v1.2.3 From 770b26de1eca97142218de3b2829a790f2ff8803 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 2 Aug 2018 09:13:33 +0100 Subject: rxrpc: Remove set but not used variable 'nowj' Fixes gcc '-Wunused-but-set-variable' warning: net/rxrpc/proc.c: In function 'rxrpc_call_seq_show': net/rxrpc/proc.c:66:29: warning: variable 'nowj' set but not used [-Wunused-but-set-variable] unsigned long timeout = 0, nowj; ^ Signed-off-by: Wei Yongjun Signed-off-by: David Howells Signed-off-by: David S. 
Miller --- net/rxrpc/proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 163d05df339d..9805e3b85c36 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -63,7 +63,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_peer *peer; struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); - unsigned long timeout = 0, nowj; + unsigned long timeout = 0; rxrpc_seq_t tx_hard_ack, rx_hard_ack; char lbuff[50], rbuff[50]; @@ -97,7 +97,6 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) if (call->state != RXRPC_CALL_SERVER_PREALLOC) { timeout = READ_ONCE(call->expect_rx_by); - nowj = jiffies; timeout -= jiffies; } -- cgit v1.2.3 From 9aba2f801eea5070f1d5588cd4052588437b9eea Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Thu, 2 Aug 2018 15:34:52 +0530 Subject: net: Fix coding style in skb_push() Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 266b954f763e..51b0a9126e12 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1715,7 +1715,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; - if (unlikely(skb->data<skb->head)) + if (unlikely(skb->data < skb->head)) skb_under_panic(skb, len, __builtin_return_address(0)); return skb->data; } -- cgit v1.2.3 From 6b431d50d2a8acd1c418b998b856a055252ebc3a Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 2 Aug 2018 18:14:33 +0200 Subject: net/socket: remove duplicated init code This refactoring work has been started by David Howells in cdfbabfb2f0c (net: Work around lockdep limitation in sockets that use sockets) but the exact same day in 581319c58600 (net/socket: use per af lockdep classes for sk queues), Paolo Abeni added new classes.
This reduces the amount of (nearly) duplicated code and eases the addition of new socket types. Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/core/sock.c | 51 +++------------------------------------------------ 1 file changed, 3 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9c6ebbdfebf3..e31233f5ba39 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -250,58 +250,13 @@ static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { - "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , - "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", - "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , - "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , - "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , - "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , - "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , - "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , - "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , - "rlock-27" , "rlock-28" , "rlock-AF_CAN" , - "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , - "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , - "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , - "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , - "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , - "rlock-AF_MAX" + _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { - "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , - "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", - "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , - "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , - "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , - "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , - "wlock-AF_ASH" , "wlock-AF_ECONET" , 
"wlock-AF_ATMSVC" , - "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , - "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , - "wlock-27" , "wlock-28" , "wlock-AF_CAN" , - "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , - "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , - "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , - "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , - "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , - "wlock-AF_MAX" + _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { - "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , - "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", - "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , - "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , - "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , - "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , - "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , - "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , - "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , - "elock-27" , "elock-28" , "elock-AF_CAN" , - "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , - "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , - "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , - "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , - "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , - "elock-AF_MAX" + _sock_locks("elock-") }; /* -- cgit v1.2.3 From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. 
Further it can be extended to support other types of local storage: e.g. thread local storage etc. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 20 ++++++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 23 ++++++++++++++++++++++- 7 files changed, 85 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca4ac2a39def..cd8790d2c6ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_local_storage_proto; + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b10118ee5afe..dd5758dc35d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2095,6 +2095,24 @@ union bpf_attr { * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. + * + * void* get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the bpf program type, a local storage area + * can be shared between multiple instances of the bpf program, + * running simultaneously. + * + * A user should care about the synchronization by himself. 
+ * For example, by using the BPF_STX_XADD instruction to alter + * the shared data. + * Return + * Pointer to the local storage area. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2177,7 +2195,8 @@ union bpf_attr { FN(rc_repeat), \ FN(rc_keydown), \ FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), + FN(get_current_cgroup_id), \ + FN(get_local_storage), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddfa6cc13e57..0a4fe5a7dc91 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9abcf25ebf9f..4d09e610777f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73065e2d23c2..1991466b8327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not 
used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ede16c8bb40..587468a9c37d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; default: break; } @@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. 
+ */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(&regs[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/net/core/filter.c b/net/core/filter.c index 9bb9a4488e25..9f73aae2f089 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4820,6 +4820,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4844,6 +4846,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4866,6 +4870,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + default: + return sk_filter_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4988,6 +5003,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5007,6 +5024,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return
&bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5034,6 +5053,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -6838,7 +6859,7 @@ const struct bpf_prog_ops xdp_prog_ops = { }; const struct bpf_verifier_ops cg_skb_verifier_ops = { - .get_func_proto = sk_filter_func_proto, + .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -- cgit v1.2.3 From f42ee093be2980f2689ea7a170d580364820f48b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:27 -0700 Subject: bpf/test_run: support cgroup local storage Allocate a temporary cgroup storage to use for bpf program test runs. Because the test program is not actually attached to a cgroup, the storage is allocated manually just for the execution of the bpf program. If the program is executed multiple times, the storage is not zeroed on each run, emulating multiple runs of the program, attached to a real cgroup. 
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 22a78eedf4b1..f4078830ea50 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -11,12 +11,14 @@ #include #include -static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) +static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, + struct bpf_cgroup_storage *storage) { u32 ret; preempt_disable(); rcu_read_lock(); + bpf_cgroup_storage_set(storage); ret = BPF_PROG_RUN(prog, ctx); rcu_read_unlock(); preempt_enable(); @@ -26,14 +28,19 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) { + struct bpf_cgroup_storage *storage = NULL; u64 time_start, time_spent = 0; u32 ret = 0, i; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return PTR_ERR(storage); + if (!repeat) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx); + ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -46,6 +53,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + bpf_cgroup_storage_free(storage); + return ret; } -- cgit v1.2.3 From 285189c78eeb6f684a024b86fb5997d10c6aa564 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 25 Jul 2018 15:52:13 +0800 Subject: netfilter: use kvmalloc_array to allocate memory for hashtable nf_ct_alloc_hashtable is used to allocate memory for conntrack, NAT bysrc and expectation hashtable. 
Assuming 64k bucket size, which means 7th order page allocation, __get_free_pages, called by nf_ct_alloc_hashtable, will trigger the direct memory reclaim and stall for a long time, when system has lots of memory stress so replace combination of __get_free_pages and vzalloc with kvmalloc_array, which provides a overflow check and a fallback if no high order memory is available, and do not retry to reclaim memory, reduce stall and remove nf_ct_free_hashtable, since it is just a kvfree Signed-off-by: Zhang Yu Signed-off-by: Wang Li Signed-off-by: Li RongQing Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 -- net/netfilter/nf_conntrack_core.c | 29 ++++++----------------------- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_helper.c | 4 ++-- net/netfilter/nf_nat_core.c | 4 ++-- 5 files changed, 11 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a2b0ed025908..7e012312cd61 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -176,8 +176,6 @@ void nf_ct_netns_put(struct net *net, u8 nfproto); */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); -void nf_ct_free_hashtable(void *hash, unsigned int size); - int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8a113ca1eea2..a676d5f76bdc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2022,16 +2022,6 @@ static int kill_all(struct nf_conn *i, void *data) return net_eq(nf_ct_net(i), data); } -void nf_ct_free_hashtable(void *hash, unsigned int size) -{ - if (is_vmalloc_addr(hash)) - vfree(hash); - else - free_pages((unsigned long)hash, - get_order(sizeof(struct hlist_head) * size)); -} -EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); - void 
nf_conntrack_cleanup_start(void) { conntrack_gc_work.exiting = true; @@ -2042,7 +2032,7 @@ void nf_conntrack_cleanup_end(void) { RCU_INIT_POINTER(nf_ct_hook, NULL); cancel_delayed_work_sync(&conntrack_gc_work.dwork); - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + kvfree(nf_conntrack_hash); nf_conntrack_proto_fini(); nf_conntrack_seqadj_fini(); @@ -2108,7 +2098,6 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) { struct hlist_nulls_head *hash; unsigned int nr_slots, i; - size_t sz; if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) return NULL; @@ -2116,14 +2105,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); - if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) - return NULL; - - sz = nr_slots * sizeof(struct hlist_nulls_head); - hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - get_order(sz)); - if (!hash) - hash = vzalloc(sz); + hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head), + GFP_KERNEL | __GFP_ZERO); if (hash && nulls) for (i = 0; i < nr_slots; i++) @@ -2150,7 +2133,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize) old_size = nf_conntrack_htable_size; if (old_size == hashsize) { - nf_ct_free_hashtable(hash, hashsize); + kvfree(hash); return 0; } @@ -2186,7 +2169,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize) local_bh_enable(); synchronize_net(); - nf_ct_free_hashtable(old_hash, old_size); + kvfree(old_hash); return 0; } @@ -2350,7 +2333,7 @@ err_acct: err_expect: kmem_cache_destroy(nf_conntrack_cachep); err_cachep: - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + kvfree(nf_conntrack_hash); return ret; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 3f586ba23d92..27b84231db10 100644 --- 
a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -712,5 +712,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); + kvfree(nf_ct_expect_hash); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index d557a425289d..e24b762ffa1d 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -562,12 +562,12 @@ int nf_conntrack_helper_init(void) return 0; out_extend: - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + kvfree(nf_ct_helper_hash); return ret; } void nf_conntrack_helper_fini(void) { nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + kvfree(nf_ct_helper_hash); } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6366f0c0b8c1..e2b196054dfc 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1056,7 +1056,7 @@ static int __init nf_nat_init(void) ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + kvfree(nf_nat_bysource); pr_err("Unable to register extension\n"); return ret; } @@ -1094,7 +1094,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } -- cgit v1.2.3 From ddba40be59c9be4059288464f8e6f38fbba27495 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 31 Jul 2018 20:25:01 +0200 Subject: netfilter: nfnetlink_osf: rename nf_osf header file to nfnetlink_osf The first client of the nf_osf.h userspace header is nft_osf, coming in this batch, rename it to nfnetlink_osf.h as there are no userspace clients for 
this yet, hence this looks consistent with other nfnetlink subsystem. Suggested-by: Jan Engelhardt Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 44 ----------- include/linux/netfilter/nfnetlink_osf.h | 44 +++++++++++ include/uapi/linux/netfilter/nf_osf.h | 106 --------------------------- include/uapi/linux/netfilter/nfnetlink_osf.h | 106 +++++++++++++++++++++++++++ include/uapi/linux/netfilter/xt_osf.h | 2 +- net/netfilter/nfnetlink_osf.c | 2 +- net/netfilter/nft_osf.c | 2 +- 7 files changed, 153 insertions(+), 153 deletions(-) delete mode 100644 include/linux/netfilter/nf_osf.h create mode 100644 include/linux/netfilter/nfnetlink_osf.h delete mode 100644 include/uapi/linux/netfilter/nf_osf.h create mode 100644 include/uapi/linux/netfilter/nfnetlink_osf.h (limited to 'net') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h deleted file mode 100644 index 3e455d6f94d5..000000000000 --- a/include/linux/netfilter/nf_osf.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFOSF_H -#define _NFOSF_H - -#include - -/* Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. 
- */ -enum nf_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; - -enum osf_fmatch_states { - /* Packet does not match the fingerprint */ - FMATCH_WRONG = 0, - /* Packet matches the fingerprint */ - FMATCH_OK, - /* Options do not match the fingerprint, but header does */ - FMATCH_OPT_WRONG, -}; - -extern struct list_head nf_osf_fingers[2]; - -struct nf_osf_finger { - struct rcu_head rcu_head; - struct list_head finger_entry; - struct nf_osf_user_finger finger; -}; - -bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, - int hooknum, struct net_device *in, struct net_device *out, - const struct nf_osf_info *info, struct net *net, - const struct list_head *nf_osf_fingers); - -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers); - -#endif /* _NFOSF_H */ diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h new file mode 100644 index 000000000000..a7311bc03d3a --- /dev/null +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFOSF_H +#define _NFOSF_H + +#include + +/* Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. 
+ */ +enum nf_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +extern struct list_head nf_osf_fingers[2]; + +struct nf_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct nf_osf_user_finger finger; +}; + +bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers); + +#endif /* _NFOSF_H */ diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h deleted file mode 100644 index 3b93fbb9fc24..000000000000 --- a/include/uapi/linux/netfilter/nf_osf.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef _NF_OSF_H -#define _NF_OSF_H - -#include - -#define MAXGENRELEN 32 - -#define NF_OSF_GENRE (1 << 0) -#define NF_OSF_TTL (1 << 1) -#define NF_OSF_LOG (1 << 2) -#define NF_OSF_INVERT (1 << 3) - -#define NF_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ -#define NF_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ -#define NF_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ - -#define NF_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ - -/* Check if ip TTL is less than fingerprint one */ -#define NF_OSF_TTL_LESS 1 - -/* Do not compare ip and fingerprint TTL at all */ -#define NF_OSF_TTL_NOCHECK 2 - -#define NF_OSF_FLAGMASK (NF_OSF_GENRE | NF_OSF_TTL | \ - NF_OSF_LOG | NF_OSF_INVERT) -/* Wildcard MSS (kind of). - * It is used to implement a state machine for the different wildcard values - * of the MSS and window sizes. 
- */ -struct nf_osf_wc { - __u32 wc; - __u32 val; -}; - -/* This struct represents IANA options - * http://www.iana.org/assignments/tcp-parameters - */ -struct nf_osf_opt { - __u16 kind, length; - struct nf_osf_wc wc; -}; - -struct nf_osf_info { - char genre[MAXGENRELEN]; - __u32 len; - __u32 flags; - __u32 loglevel; - __u32 ttl; -}; - -struct nf_osf_user_finger { - struct nf_osf_wc wss; - - __u8 ttl, df; - __u16 ss, mss; - __u16 opt_num; - - char genre[MAXGENRELEN]; - char version[MAXGENRELEN]; - char subtype[MAXGENRELEN]; - - /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ - struct nf_osf_opt opt[MAX_IPOPTLEN]; -}; - -struct nf_osf_nlmsg { - struct nf_osf_user_finger f; - struct iphdr ip; - struct tcphdr tcp; -}; - -/* Defines for IANA option kinds */ -enum iana_options { - OSFOPT_EOL = 0, /* End of options */ - OSFOPT_NOP, /* NOP */ - OSFOPT_MSS, /* Maximum segment size */ - OSFOPT_WSO, /* Window scale option */ - OSFOPT_SACKP, /* SACK permitted */ - OSFOPT_SACK, /* SACK */ - OSFOPT_ECHO, - OSFOPT_ECHOREPLY, - OSFOPT_TS, /* Timestamp option */ - OSFOPT_POCP, /* Partial Order Connection Permitted */ - OSFOPT_POSP, /* Partial Order Service Profile */ - - /* Others are not used in the current OSF */ - OSFOPT_EMPTY = 255, -}; - -enum nf_osf_attr_type { - OSF_ATTR_UNSPEC, - OSF_ATTR_FINGER, - OSF_ATTR_MAX, -}; - -/* - * Add/remove fingerprint from the kernel. 
- */ -enum nf_osf_msg_types { - OSF_MSG_ADD, - OSF_MSG_REMOVE, - OSF_MSG_MAX, -}; - -#endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h new file mode 100644 index 000000000000..3b93fbb9fc24 --- /dev/null +++ b/include/uapi/linux/netfilter/nfnetlink_osf.h @@ -0,0 +1,106 @@ +#ifndef _NF_OSF_H +#define _NF_OSF_H + +#include + +#define MAXGENRELEN 32 + +#define NF_OSF_GENRE (1 << 0) +#define NF_OSF_TTL (1 << 1) +#define NF_OSF_LOG (1 << 2) +#define NF_OSF_INVERT (1 << 3) + +#define NF_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ +#define NF_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ +#define NF_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ + +#define NF_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ + +/* Check if ip TTL is less than fingerprint one */ +#define NF_OSF_TTL_LESS 1 + +/* Do not compare ip and fingerprint TTL at all */ +#define NF_OSF_TTL_NOCHECK 2 + +#define NF_OSF_FLAGMASK (NF_OSF_GENRE | NF_OSF_TTL | \ + NF_OSF_LOG | NF_OSF_INVERT) +/* Wildcard MSS (kind of). + * It is used to implement a state machine for the different wildcard values + * of the MSS and window sizes. 
+ */ +struct nf_osf_wc { + __u32 wc; + __u32 val; +}; + +/* This struct represents IANA options + * http://www.iana.org/assignments/tcp-parameters + */ +struct nf_osf_opt { + __u16 kind, length; + struct nf_osf_wc wc; +}; + +struct nf_osf_info { + char genre[MAXGENRELEN]; + __u32 len; + __u32 flags; + __u32 loglevel; + __u32 ttl; +}; + +struct nf_osf_user_finger { + struct nf_osf_wc wss; + + __u8 ttl, df; + __u16 ss, mss; + __u16 opt_num; + + char genre[MAXGENRELEN]; + char version[MAXGENRELEN]; + char subtype[MAXGENRELEN]; + + /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ + struct nf_osf_opt opt[MAX_IPOPTLEN]; +}; + +struct nf_osf_nlmsg { + struct nf_osf_user_finger f; + struct iphdr ip; + struct tcphdr tcp; +}; + +/* Defines for IANA option kinds */ +enum iana_options { + OSFOPT_EOL = 0, /* End of options */ + OSFOPT_NOP, /* NOP */ + OSFOPT_MSS, /* Maximum segment size */ + OSFOPT_WSO, /* Window scale option */ + OSFOPT_SACKP, /* SACK permitted */ + OSFOPT_SACK, /* SACK */ + OSFOPT_ECHO, + OSFOPT_ECHOREPLY, + OSFOPT_TS, /* Timestamp option */ + OSFOPT_POCP, /* Partial Order Connection Permitted */ + OSFOPT_POSP, /* Partial Order Service Profile */ + + /* Others are not used in the current OSF */ + OSFOPT_EMPTY = 255, +}; + +enum nf_osf_attr_type { + OSF_ATTR_UNSPEC, + OSF_ATTR_FINGER, + OSF_ATTR_MAX, +}; + +/* + * Add/remove fingerprint from the kernel. 
+ */ +enum nf_osf_msg_types { + OSF_MSG_ADD, + OSF_MSG_REMOVE, + OSF_MSG_MAX, +}; + +#endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index a90e90c27cef..c56c59605c2b 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define XT_OSF_GENRE NF_OSF_GENRE #define XT_OSF_INVERT NF_OSF_INVERT diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ba0fa11869ce..f9dba62c450f 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* * Indexed by dont-fragment bit. diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index bdacc4cffba4..9b2f3de7be4f 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -2,7 +2,7 @@ #include #include -#include +#include #define OSF_GENRE_SIZE 32 -- cgit v1.2.3 From 9e619d87b277bbcc4e0b64cc5963520c1cd99f18 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 31 Jul 2018 17:24:45 +0200 Subject: netfilter: nf_tables: flow event notifier must use transaction mutex Fixes: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard transactions") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f18085639807..06d6af067619 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5940,13 +5940,13 @@ static int nf_tables_flowtable_event(struct notifier_block *this, if (!net) return 0; - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&net->nft.commit_mutex); list_for_each_entry(table, &net->nft.tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, 
flowtable); } } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&net->nft.commit_mutex); put_net(net); return NOTIFY_DONE; } -- cgit v1.2.3 From 5ca8a25c144dbb04511147601943691baab0aaca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Aug 2018 11:08:47 +0200 Subject: net: sched: fix flush on non-existing chain User was able to perform filter flush on chain 0 even if it didn't have any filters in it. With the patch that avoided implicit chain 0 creation, this changed. So in case user wants filter flush on chain which does not exist, just return success. There's no reason for non-0 chains to behave differently than chain 0, so do the same for them. Reported-by: Ido Schimmel Fixes: f71e0ca4db18 ("net: sched: Avoid implicit chain 0 creation") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e8b0bbd0883f..194c2e0b2737 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1389,6 +1389,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, } chain = tcf_chain_get(block, chain_index, false); if (!chain) { + /* User requested flush on non-existent chain. Nothing to do, + * so just return success. 
+ */ + if (prio == 0) { + err = 0; + goto errout; + } NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; goto errout; -- cgit v1.2.3 From 54424d3891967b83d707c9300a3509c2ae8f42ee Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 Aug 2018 10:15:25 +0100 Subject: rxrpc: Reuse SKCIPHER_REQUEST_ON_STACK buffer The use of SKCIPHER_REQUEST_ON_STACK() will trigger FRAME_WARN warnings (when less than 2048) once the VLA is no longer hidden from the check: net/rxrpc/rxkad.c:398:1: warning: the frame size of 1152 bytes is larger than 1024 bytes [-Wframe-larger-than=] net/rxrpc/rxkad.c:242:1: warning: the frame size of 1152 bytes is larger than 1024 bytes [-Wframe-larger-than=] This passes the initial SKCIPHER_REQUEST_ON_STACK allocation to the leaf functions for reuse. Two requests allocated on the stack is not needed when only one is used at a time. Signed-off-by: Kees Cook Acked-by: Arnd Bergmann Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/rxkad.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6988073ae842..eaf8f4f446b0 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -146,10 +146,10 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct sk_buff *skb, u32 data_size, - void *sechdr) + void *sechdr, + struct skcipher_request *req) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxkad_level1_hdr hdr; struct rxrpc_crypt iv; struct scatterlist sg; @@ -183,12 +183,12 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct sk_buff *skb, u32 data_size, - void *sechdr) + void *sechdr, + struct skcipher_request *req) { const struct rxrpc_key_token *token; struct 
rxkad_level2_hdr rxkhdr; struct rxrpc_skb_priv *sp; - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; struct sk_buff *trailer; @@ -296,11 +296,12 @@ static int rxkad_secure_packet(struct rxrpc_call *call, ret = 0; break; case RXRPC_SECURITY_AUTH: - ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr); + ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr, + req); break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, skb, data_size, - sechdr); + sechdr, req); break; default: ret = -EPERM; @@ -316,10 +317,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, */ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, unsigned int offset, unsigned int len, - rxrpc_seq_t seq) + rxrpc_seq_t seq, + struct skcipher_request *req) { struct rxkad_level1_hdr sechdr; - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; struct sk_buff *trailer; @@ -402,11 +403,11 @@ nomem: */ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, unsigned int offset, unsigned int len, - rxrpc_seq_t seq) + rxrpc_seq_t seq, + struct skcipher_request *req) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; struct sk_buff *trailer; @@ -549,9 +550,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, case RXRPC_SECURITY_PLAIN: return 0; case RXRPC_SECURITY_AUTH: - return rxkad_verify_packet_1(call, skb, offset, len, seq); + return rxkad_verify_packet_1(call, skb, offset, len, seq, req); case RXRPC_SECURITY_ENCRYPT: - return rxkad_verify_packet_2(call, skb, offset, len, seq); + return rxkad_verify_packet_2(call, skb, offset, len, seq, req); default: return -ENOANO; } -- cgit v1.2.3 From 1974d2453fa7bfea5574d09332df3cc7fb0d909a Mon Sep 17 00:00:00 2001 
From: YueHaibing Date: Wed, 1 Aug 2018 10:14:00 +0800 Subject: netfilter: nf_tables: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable 'ext' is assigned but never used, so it can be removed. Cleans up clang warnings: net/netfilter/nf_tables_api.c:4032:28: warning: variable ‘ext’ set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 06d6af067619..debc1680607c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4029,7 +4029,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; - const struct nft_set_ext *ext; struct nft_data_desc desc; struct nft_set_elem elem; struct sk_buff *skb; @@ -4063,7 +4062,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return PTR_ERR(priv); elem.priv = priv; - ext = nft_set_elem_ext(set, &elem); err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); -- cgit v1.2.3 From c75303269009667cc2b7ddee274bc9e96e840f57 Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Thu, 2 Aug 2018 09:26:24 +0530 Subject: netfilter: cttimeout: Make NF_CT_NETLINK_TIMEOUT depend on NF_CONNTRACK_TIMEOUT With this, remove ifdef for CONFIG_NF_CONNTRACK_TIMEOUT in nfnetlink_cttimeout. This is also required for moving ctnl_untimeout from nfnetlink_cttimeout to nf_conntrack_timeout.
Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 1 + net/netfilter/nfnetlink_cttimeout.c | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0febf3e21f91..55e399d5af10 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -365,6 +365,7 @@ config NF_CT_NETLINK_TIMEOUT tristate 'Connection tracking timeout tuning via Netlink' select NETFILTER_NETLINK depends on NETFILTER_ADVANCED + depends on NF_CONNTRACK_TIMEOUT help This option enables support for connection tracking timeout fine-grain tuning. This allows you to attach specific timeout diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index d9d952fad3e0..4199e5300575 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -503,7 +503,6 @@ err: return err; } -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT static struct ctnl_timeout * ctnl_timeout_find_get(struct net *net, const char *name) { @@ -534,7 +533,6 @@ static void ctnl_timeout_put(struct ctnl_timeout *timeout) module_put(THIS_MODULE); } -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, @@ -605,10 +603,8 @@ static int __init cttimeout_init(void) "nfnetlink.\n"); goto err_out; } -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ return 0; err_out: @@ -621,11 +617,9 @@ static void __exit cttimeout_exit(void) nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); synchronize_rcu(); -#endif /* 
CONFIG_NF_CONNTRACK_TIMEOUT */ } module_init(cttimeout_init); -- cgit v1.2.3 From 1f5cd2a0107d4ed95cbd9118e6a5f7ccd3d4d12a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 3 Aug 2018 12:38:34 +0200 Subject: l2tp: define l2tp_tunnel_dst_mtu() Consolidate retrieval of tunnel's socket mtu in order to simplify l2tp_eth and l2tp_ppp a bit. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.h | 18 ++++++++++++++++++ net/l2tp/l2tp_eth.c | 14 ++++---------- net/l2tp/l2tp_ppp.c | 15 ++++----------- 3 files changed, 26 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index fa5ae9432d38..1ca39629031b 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -12,6 +12,9 @@ #ifndef _L2TP_CORE_H_ #define _L2TP_CORE_H_ +#include +#include + /* Just some random numbers */ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D @@ -268,6 +271,21 @@ static inline int l2tp_get_l2specific_len(struct l2tp_session *session) } } +static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel) +{ + struct dst_entry *dst; + u32 mtu; + + dst = sk_dst_get(tunnel->sock); + if (!dst) + return 0; + + mtu = dst_mtu(dst); + dst_release(dst); + + return mtu; +} + #define l2tp_printk(ptr, type, func, fmt, ...) 
\ do { \ if (((ptr)->debug) & (type)) \ diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 5c366ecfa1cb..cfca5e63ae31 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -226,8 +226,8 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, struct net_device *dev) { unsigned int overhead = 0; - struct dst_entry *dst; u32 l3_overhead = 0; + u32 mtu; /* if the encap is UDP, account for UDP header size */ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { @@ -256,15 +256,9 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, overhead += session->hdr_len + ETH_HLEN + l3_overhead; /* If PMTU discovery was enabled, use discovered MTU on L2TP device */ - dst = sk_dst_get(tunnel->sock); - if (dst) { - /* dst_mtu will use PMTU if found, else fallback to intf MTU */ - u32 pmtu = dst_mtu(dst); - - if (pmtu != 0) - dev->mtu = pmtu; - dst_release(dst); - } + mtu = l2tp_tunnel_dst_mtu(tunnel); + if (mtu) + dev->mtu = mtu; session->mtu = dev->mtu - overhead; dev->mtu = session->mtu; dev->needed_headroom += session->hdr_len; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 44cac66284a5..1c6da02f976a 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -93,7 +93,6 @@ #include #include #include -#include #include #include #include @@ -554,7 +553,7 @@ static void pppol2tp_show(struct seq_file *m, void *arg) static void pppol2tp_session_init(struct l2tp_session *session) { struct pppol2tp_session *ps; - struct dst_entry *dst; + u32 mtu; session->recv_skb = pppol2tp_recv; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) @@ -566,15 +565,9 @@ static void pppol2tp_session_init(struct l2tp_session *session) ps->owner = current->pid; /* If PMTU discovery was enabled, use the MTU that was discovered */ - dst = sk_dst_get(session->tunnel->sock); - if (dst) { - u32 pmtu = dst_mtu(dst); - - if (pmtu) - session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD; - - dst_release(dst); - } + mtu = l2tp_tunnel_dst_mtu(session->tunnel); + if (mtu) + session->mtu = mtu - 
PPPOL2TP_HEADER_OVERHEAD; } struct l2tp_connect_info { -- cgit v1.2.3 From 789141b215fc509defdd0f0978e4bf1bb5b31fc2 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 3 Aug 2018 12:38:37 +0200 Subject: l2tp: simplify MTU handling in l2tp_ppp The value of the session's .mtu field, as defined by pppol2tp_connect() or pppol2tp_session_create(), is later overwritten by pppol2tp_session_init() (unless getting the tunnel's socket PMTU fails). This field is then only used when setting the PPP channel's MTU in pppol2tp_connect(). Furthermore, the SIOC[GS]IFMTU ioctls only act on the session's .mtu without propagating this value to the PPP channel, making them useless. This patch initialises the PPP channel's MTU directly and ignores the session's .mtu entirely. MTU is still computed by subtracting the PPPOL2TP_HEADER_OVERHEAD constant. It is not optimal, but that doesn't really matter: po->chan.mtu is only used when the channel is part of a multilink PPP bundle. Running multilink PPP over packet switched networks is certainly not going to be efficient, so not picking the best MTU does not harm (in the worst case, packets will just be fragmented by the underlay). The SIOC[GS]IFMTU ioctls are removed entirely (as opposed to simply ignored), because these ioctls commands are part of the requests that should be handled generically by the socket layer. PX_PROTO_OL2TP was the only socket type abusing these ioctls. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_ppp.c | 67 ++++++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1c6da02f976a..b403728e2757 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -553,7 +553,6 @@ static void pppol2tp_show(struct seq_file *m, void *arg) static void pppol2tp_session_init(struct l2tp_session *session) { struct pppol2tp_session *ps; - u32 mtu; session->recv_skb = pppol2tp_recv; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) @@ -563,11 +562,6 @@ static void pppol2tp_session_init(struct l2tp_session *session) ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); ps->owner = current->pid; - - /* If PMTU discovery was enabled, use the MTU that was discovered */ - mtu = l2tp_tunnel_dst_mtu(session->tunnel); - if (mtu) - session->mtu = mtu - PPPOL2TP_HEADER_OVERHEAD; } struct l2tp_connect_info { @@ -654,6 +648,22 @@ static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, return 0; } +/* Rough estimation of the maximum payload size a tunnel can transmit without + * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence + * numbers and no IP option. Not quite accurate, but the result is mostly + * unused anyway. + */ +static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel) +{ + int mtu; + + mtu = l2tp_tunnel_dst_mtu(tunnel); + if (mtu <= PPPOL2TP_HEADER_OVERHEAD) + return 1500 - PPPOL2TP_HEADER_OVERHEAD; + + return mtu - PPPOL2TP_HEADER_OVERHEAD; +} + /* connect() handler. 
Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, @@ -771,8 +781,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } } else { - /* Default MTU must allow space for UDP/L2TP/PPP headers */ - cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), @@ -817,7 +825,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, po->chan.private = sk; po->chan.ops = &pppol2tp_chan_ops; - po->chan.mtu = session->mtu; + po->chan.mtu = pppol2tp_tunnel_mtu(tunnel); error = ppp_register_net_channel(sock_net(sk), &po->chan); if (error) { @@ -873,10 +881,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, goto err; } - /* Default MTU values. */ - if (cfg->mtu == 0) - cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; - /* Allocate and initialize a new session context. 
*/ session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, @@ -1040,7 +1044,6 @@ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, static int pppol2tp_session_ioctl(struct l2tp_session *session, unsigned int cmd, unsigned long arg) { - struct ifreq ifr; int err = 0; struct sock *sk; int val = (int) arg; @@ -1056,39 +1059,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, return -EBADR; switch (cmd) { - case SIOCGIFMTU: - err = -ENXIO; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - err = -EFAULT; - if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq))) - break; - ifr.ifr_mtu = session->mtu; - if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq))) - break; - - l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n", - session->name, session->mtu); - err = 0; - break; - - case SIOCSIFMTU: - err = -ENXIO; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - err = -EFAULT; - if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq))) - break; - - session->mtu = ifr.ifr_mtu; - - l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n", - session->name, session->mtu); - err = 0; - break; - case PPPIOCGMRU: case PPPIOCGFLAGS: err = -EFAULT; @@ -1685,8 +1655,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) tunnel->peer_tunnel_id, session->peer_session_id, state, user_data_ok); - seq_printf(m, " %d/0/%c/%c/%s %08x %u\n", - session->mtu, + seq_printf(m, " 0/0/%c/%c/%s %08x %u\n", session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", -- cgit v1.2.3 From e9697e2effad50c0081b3c72002d3975f8ab4347 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 3 Aug 2018 12:38:39 +0200 Subject: l2tp: ignore L2TP_ATTR_MTU This attribute's handling is broken. It can only be used when creating Ethernet pseudo-wires, in which case its value can be used as the initial MTU for the l2tpeth device. 
However, when handling update requests, L2TP_ATTR_MTU only modifies session->mtu. This value is never propagated to the l2tpeth device. Dump requests also return the value of session->mtu, which is not synchronised anymore with the device MTU. The same problem occurs if the device MTU is properly updated using the generic IFLA_MTU attribute. In this case, session->mtu is not updated, and L2TP_ATTR_MTU will report an invalid value again when dumping the session. It does not seem worthwhile to complexify l2tp_eth.c to synchronise session->mtu with the device MTU. Even the ip-l2tp manpage advises to use 'ip link' to initialise the MTU of l2tpeth devices (iproute2 does not handle L2TP_ATTR_MTU at all anyway). So let's just ignore it entirely. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 2 +- net/l2tp/l2tp_core.c | 1 - net/l2tp/l2tp_core.h | 2 -- net/l2tp/l2tp_debugfs.c | 3 +-- net/l2tp/l2tp_eth.c | 17 +++++++---------- net/l2tp/l2tp_netlink.c | 9 +-------- 6 files changed, 10 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 8bb8c7cfabe5..61158f5a1a5b 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -119,7 +119,7 @@ enum { L2TP_ATTR_IP_DADDR, /* u32 */ L2TP_ATTR_UDP_SPORT, /* u16 */ L2TP_ATTR_UDP_DPORT, /* u16 */ - L2TP_ATTR_MTU, /* u16 */ + L2TP_ATTR_MTU, /* u16 (not used) */ L2TP_ATTR_MRU, /* u16 (not used) */ L2TP_ATTR_STATS, /* nested */ L2TP_ATTR_IP6_SADDR, /* struct in6_addr */ diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index c61a467fd9b8..ac6a00bcec71 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1674,7 +1674,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn if (cfg) { session->pwtype = cfg->pw_type; session->debug = cfg->debug; - session->mtu = cfg->mtu; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = 
cfg->lns_mode; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 1ca39629031b..5804065dfbfb 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -64,7 +64,6 @@ struct l2tp_session_cfg { int peer_cookie_len; /* 0, 4 or 8 bytes */ int reorder_timeout; /* configured reorder timeout * (in jiffies) */ - int mtu; char *ifname; }; @@ -108,7 +107,6 @@ struct l2tp_session { int reorder_timeout; /* configured reorder timeout * (in jiffies) */ int reorder_skip; /* set if skip to next nr */ - int mtu; enum l2tp_pwtype pwtype; struct l2tp_stats stats; struct hlist_node global_hlist; /* Global hash list node */ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index aee271741f5b..9821a1458555 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -191,8 +191,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) if (session->send_seq || session->recv_seq) seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns); seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count)); - seq_printf(m, " config %d/0/%c/%c/-/%s %08x %u\n", - session->mtu, + seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n", session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? 
"LNS" : "LAC", diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index cfca5e63ae31..3728986ec885 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -234,14 +234,11 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, overhead += sizeof(struct udphdr); dev->needed_headroom += sizeof(struct udphdr); } - if (session->mtu != 0) { - dev->mtu = session->mtu; - dev->needed_headroom += session->hdr_len; - return; - } + lock_sock(tunnel->sock); l3_overhead = kernel_sock_ip_overhead(tunnel->sock); release_sock(tunnel->sock); + if (l3_overhead == 0) { /* L3 Overhead couldn't be identified, this could be * because tunnel->sock was NULL or the socket's @@ -255,12 +252,12 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, */ overhead += session->hdr_len + ETH_HLEN + l3_overhead; - /* If PMTU discovery was enabled, use discovered MTU on L2TP device */ - mtu = l2tp_tunnel_dst_mtu(tunnel); - if (mtu) + mtu = l2tp_tunnel_dst_mtu(tunnel) - overhead; + if (mtu < dev->min_mtu || mtu > dev->max_mtu) + dev->mtu = ETH_DATA_LEN - overhead; + else dev->mtu = mtu; - session->mtu = dev->mtu - overhead; - dev->mtu = session->mtu; + dev->needed_headroom += session->hdr_len; } diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index a7c409215336..2e1e92651545 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -608,9 +608,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); - if (info->attrs[L2TP_ATTR_MTU]) - cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]); - #ifdef CONFIG_MODULES if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { genl_unlock(); @@ -698,9 +695,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); - if 
(info->attrs[L2TP_ATTR_MTU]) - session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]); - ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); @@ -730,8 +724,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || - nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) || - nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu)) + nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; if ((session->ifname[0] && -- cgit v1.2.3 From 033eab53fff7acc0f5718dee6fda641734b94416 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 2 Aug 2018 21:18:31 +0200 Subject: netfilter: nft_tproxy: Add missing config check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A config check was missing from the code when using nf_defrag_ipv6_enable with NFT_TPROXY != n and NF_DEFRAG_IPV6 = n and this caused the following error: ../net/netfilter/nft_tproxy.c: In function 'nft_tproxy_init': ../net/netfilter/nft_tproxy.c:237:3: error: implicit declaration of function +'nf_defrag_ipv6_enable' [-Werror=implicit-function-declaration] err = nf_defrag_ipv6_enable(ctx->net); This patch adds a check for NF_TABLES_IPV6 when NF_DEFRAG_IPV6 is selected by Kconfig. 
Reported-by: Randy Dunlap Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Signed-off-by: Máté Eckl Acked-by: Randy Dunlap Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tproxy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index c6845f7baa08..eff99dffc842 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -234,9 +234,11 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; +#endif break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From af308b94a2a4a5a27bec9028354c4df444a7c8ba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Aug 2018 20:51:39 +0200 Subject: netfilter: nf_tables: add tunnel support This patch implements the tunnel object type that can be used to configure tunnels via metadata template through the existing lightweight API from the ingress path. 
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 69 ++++- net/core/dst.c | 1 + net/netfilter/Kconfig | 6 + net/netfilter/Makefile | 1 + net/netfilter/nft_tunnel.c | 458 +++++++++++++++++++++++++++++++ 5 files changed, 534 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/nft_tunnel.c (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f112ea52dc1a..3ee1198eeac1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1416,7 +1416,8 @@ enum nft_ct_helper_attributes { #define NFT_OBJECT_CT_HELPER 3 #define NFT_OBJECT_LIMIT 4 #define NFT_OBJECT_CONNLIMIT 5 -#define __NFT_OBJECT_MAX 6 +#define NFT_OBJECT_TUNNEL 6 +#define __NFT_OBJECT_MAX 7 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** @@ -1580,4 +1581,70 @@ enum nft_ng_types { }; #define NFT_NG_MAX (__NFT_NG_MAX - 1) +enum nft_tunnel_key_ip_attributes { + NFTA_TUNNEL_KEY_IP_UNSPEC, + NFTA_TUNNEL_KEY_IP_SRC, + NFTA_TUNNEL_KEY_IP_DST, + __NFTA_TUNNEL_KEY_IP_MAX +}; +#define NFTA_TUNNEL_KEY_IP_MAX (__NFTA_TUNNEL_KEY_IP_MAX - 1) + +enum nft_tunnel_ip6_attributes { + NFTA_TUNNEL_KEY_IP6_UNSPEC, + NFTA_TUNNEL_KEY_IP6_SRC, + NFTA_TUNNEL_KEY_IP6_DST, + NFTA_TUNNEL_KEY_IP6_FLOWLABEL, + __NFTA_TUNNEL_KEY_IP6_MAX +}; +#define NFTA_TUNNEL_KEY_IP6_MAX (__NFTA_TUNNEL_KEY_IP6_MAX - 1) + +enum nft_tunnel_opts_attributes { + NFTA_TUNNEL_KEY_OPTS_UNSPEC, + NFTA_TUNNEL_KEY_OPTS_VXLAN, + NFTA_TUNNEL_KEY_OPTS_ERSPAN, + __NFTA_TUNNEL_KEY_OPTS_MAX +}; +#define NFTA_TUNNEL_KEY_OPTS_MAX (__NFTA_TUNNEL_KEY_OPTS_MAX - 1) + +enum nft_tunnel_opts_vxlan_attributes { + NFTA_TUNNEL_KEY_VXLAN_UNSPEC, + NFTA_TUNNEL_KEY_VXLAN_GBP, + __NFTA_TUNNEL_KEY_VXLAN_MAX +}; +#define NFTA_TUNNEL_KEY_VXLAN_MAX (__NFTA_TUNNEL_KEY_VXLAN_MAX - 1) + +enum nft_tunnel_opts_erspan_attributes { + NFTA_TUNNEL_KEY_ERSPAN_UNSPEC, + NFTA_TUNNEL_KEY_ERSPAN_VERSION, + NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, + 
NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, + NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, + __NFTA_TUNNEL_KEY_ERSPAN_MAX +}; +#define NFTA_TUNNEL_KEY_ERSPAN_MAX (__NFTA_TUNNEL_KEY_ERSPAN_MAX - 1) + +enum nft_tunnel_flags { + NFT_TUNNEL_F_ZERO_CSUM_TX = (1 << 0), + NFT_TUNNEL_F_DONT_FRAGMENT = (1 << 1), + NFT_TUNNEL_F_SEQ_NUMBER = (1 << 2), +}; +#define NFT_TUNNEL_F_MASK (NFT_TUNNEL_F_ZERO_CSUM_TX | \ + NFT_TUNNEL_F_DONT_FRAGMENT | \ + NFT_TUNNEL_F_SEQ_NUMBER) + +enum nft_tunnel_key_attributes { + NFTA_TUNNEL_KEY_UNSPEC, + NFTA_TUNNEL_KEY_ID, + NFTA_TUNNEL_KEY_IP, + NFTA_TUNNEL_KEY_IP6, + NFTA_TUNNEL_KEY_FLAGS, + NFTA_TUNNEL_KEY_TOS, + NFTA_TUNNEL_KEY_TTL, + NFTA_TUNNEL_KEY_SPORT, + NFTA_TUNNEL_KEY_DPORT, + NFTA_TUNNEL_KEY_OPTS, + __NFTA_TUNNEL_KEY_MAX +}; +#define NFTA_TUNNEL_KEY_MAX (__NFTA_TUNNEL_KEY_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/core/dst.c b/net/core/dst.c index 2d9b37f8944a..81ccf20e2826 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -307,6 +307,7 @@ void metadata_dst_free(struct metadata_dst *md_dst) #endif kfree(md_dst); } +EXPORT_SYMBOL_GPL(metadata_dst_free); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 55e399d5af10..654588088676 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -559,6 +559,12 @@ config NFT_NAT This option adds the "nat" expression that you can use to perform typical Network Address Translation (NAT) packet transformations. +config NFT_TUNNEL + tristate "Netfilter nf_tables tunnel module" + help + This option adds the "tunnel" expression that you can use to set + tunneling policies. 
+ config NFT_OBJREF tristate "Netfilter nf_tables stateful object reference module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index cf61615cc529..16895e045b66 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o +obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c new file mode 100644 index 000000000000..715613d99c20 --- /dev/null +++ b/net/netfilter/nft_tunnel.c @@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_tunnel_opts { + union { + struct vxlan_metadata vxlan; + struct erspan_metadata erspan; + } u; + u32 len; + u32 flags; +}; + +struct nft_tunnel_obj { + struct metadata_dst *md; + struct nft_tunnel_opts opts; +}; + +static const struct nla_policy nft_tunnel_ip_policy[NFTA_TUNNEL_KEY_IP_MAX + 1] = { + [NFTA_TUNNEL_KEY_IP_SRC] = { .type = NLA_U32 }, + [NFTA_TUNNEL_KEY_IP_DST] = { .type = NLA_U32 }, +}; + +static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct ip_tunnel_info *info) +{ + struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr, + nft_tunnel_ip_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_TUNNEL_KEY_IP_DST]) + return -EINVAL; + + if (tb[NFTA_TUNNEL_KEY_IP_SRC]) + info->key.u.ipv4.src = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_SRC]); + if (tb[NFTA_TUNNEL_KEY_IP_DST]) + info->key.u.ipv4.dst = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_DST]); + + return 0; +} + +static const struct nla_policy 
nft_tunnel_ip6_policy[NFTA_TUNNEL_KEY_IP6_MAX + 1] = { + [NFTA_TUNNEL_KEY_IP6_SRC] = { .len = sizeof(struct in6_addr), }, + [NFTA_TUNNEL_KEY_IP6_DST] = { .len = sizeof(struct in6_addr), }, + [NFTA_TUNNEL_KEY_IP6_FLOWLABEL] = { .type = NLA_U32, } +}; + +static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct ip_tunnel_info *info) +{ + struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr, + nft_tunnel_ip6_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_TUNNEL_KEY_IP6_DST]) + return -EINVAL; + + if (tb[NFTA_TUNNEL_KEY_IP6_SRC]) { + memcpy(&info->key.u.ipv6.src, + nla_data(tb[NFTA_TUNNEL_KEY_IP6_SRC]), + sizeof(struct in6_addr)); + } + if (tb[NFTA_TUNNEL_KEY_IP6_DST]) { + memcpy(&info->key.u.ipv6.dst, + nla_data(tb[NFTA_TUNNEL_KEY_IP6_DST]), + sizeof(struct in6_addr)); + } + if (tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]) + info->key.label = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]); + + info->mode |= IP_TUNNEL_INFO_IPV6; + + return 0; +} + +static const struct nla_policy nft_tunnel_opts_vxlan_policy[NFTA_TUNNEL_KEY_VXLAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_VXLAN_GBP] = { .type = NLA_U32 }, +}; + +static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, + struct nft_tunnel_opts *opts) +{ + struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr, + nft_tunnel_opts_vxlan_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_TUNNEL_KEY_VXLAN_GBP]) + return -EINVAL; + + opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); + + opts->len = sizeof(struct vxlan_metadata); + opts->flags = TUNNEL_VXLAN_OPT; + + return 0; +} + +static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, + [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, + 
[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, +}; + +static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, + struct nft_tunnel_opts *opts) +{ + struct nlattr *tb[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1]; + uint8_t hwid, dir; + int err, version; + + err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr, + nft_tunnel_opts_erspan_policy, NULL); + if (err < 0) + return err; + + version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])); + switch (version) { + case ERSPAN_VERSION: + if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]) + return -EINVAL; + + opts->u.erspan.u.index = + nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]); + break; + case ERSPAN_VERSION2: + if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] || + !tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]) + return -EINVAL; + + hwid = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]); + dir = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]); + + set_hwid(&opts->u.erspan.u.md2, hwid); + opts->u.erspan.u.md2.dir = dir; + break; + default: + return -EOPNOTSUPP; + } + opts->u.erspan.version = version; + + opts->len = sizeof(struct erspan_metadata); + opts->flags = TUNNEL_ERSPAN_OPT; + + return 0; +} + +static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = { + [NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, }, + [NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, }, +}; + +static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct ip_tunnel_info *info, + struct nft_tunnel_opts *opts) +{ + struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr, + nft_tunnel_opts_policy, NULL); + if (err < 0) + return err; + + if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) { + err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN], + opts); + } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) { + err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN], + opts); + } else { + return -EOPNOTSUPP; + } + + 
return err; +} + +static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = { + [NFTA_TUNNEL_KEY_IP] = { .type = NLA_NESTED, }, + [NFTA_TUNNEL_KEY_IP6] = { .type = NLA_NESTED, }, + [NFTA_TUNNEL_KEY_ID] = { .type = NLA_U32, }, + [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, }, + [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, }, + [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, }, + [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, }, +}; + +static int nft_tunnel_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_tunnel_obj *priv = nft_obj_data(obj); + struct ip_tunnel_info info; + struct metadata_dst *md; + int err; + + if (!tb[NFTA_TUNNEL_KEY_ID]) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + info.mode = IP_TUNNEL_INFO_TX; + info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); + info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + + if (tb[NFTA_TUNNEL_KEY_IP]) { + err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); + if (err < 0) + return err; + } else if (tb[NFTA_TUNNEL_KEY_IP6]) { + err = nft_tunnel_obj_ip6_init(ctx, tb[NFTA_TUNNEL_KEY_IP6], &info); + if (err < 0) + return err; + } else { + return -EINVAL; + } + + if (tb[NFTA_TUNNEL_KEY_SPORT]) { + info.key.tp_src = + ntohs(nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT])); + } + if (tb[NFTA_TUNNEL_KEY_DPORT]) { + info.key.tp_dst = + ntohs(nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT])); + } + + if (tb[NFTA_TUNNEL_KEY_FLAGS]) { + u32 tun_flags; + + tun_flags = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_FLAGS])); + if (tun_flags & ~NFT_TUNNEL_F_MASK) + return -EOPNOTSUPP; + + if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) + info.key.tun_flags &= ~TUNNEL_CSUM; + if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) + info.key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) + info.key.tun_flags |= TUNNEL_SEQ; + } + if (tb[NFTA_TUNNEL_KEY_TOS]) + info.key.tos = 
nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); + if (tb[NFTA_TUNNEL_KEY_TTL]) + info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]); + else + info.key.ttl = U8_MAX; + + if (tb[NFTA_TUNNEL_KEY_OPTS]) { + err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS], + &info, &priv->opts); + if (err < 0) + return err; + } + + md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL); + if (!md) + return -ENOMEM; + + memcpy(&md->u.tun_info, &info, sizeof(info)); + ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len, + priv->opts.flags); + priv->md = md; + + return 0; +} + +static inline void nft_tunnel_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_tunnel_obj *priv = nft_obj_data(obj); + struct sk_buff *skb = pkt->skb; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) priv->md); + skb_dst_set(skb, (struct dst_entry *) priv->md); +} + +static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) +{ + struct nlattr *nest; + + if (info->mode & IP_TUNNEL_INFO_IPV6) { + nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6); + if (!nest) + return -1; + + if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 || + nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 || + nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label)) + return -1; + + nla_nest_end(skb, nest); + } else { + nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP); + if (!nest) + return -1; + + if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 || + nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0) + return -1; + + nla_nest_end(skb, nest); + } + + return 0; +} + +static int nft_tunnel_opts_dump(struct sk_buff *skb, + struct nft_tunnel_obj *priv) +{ + struct nft_tunnel_opts *opts = &priv->opts; + struct nlattr *nest; + + nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS); + if (!nest) + return -1; + + if (opts->flags & 
TUNNEL_VXLAN_OPT) { + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, + htonl(opts->u.vxlan.gbp))) + return -1; + } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + switch (opts->u.erspan.version) { + case ERSPAN_VERSION: + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, + opts->u.erspan.u.index)) + return -1; + break; + case ERSPAN_VERSION2: + if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, + get_hwid(&opts->u.erspan.u.md2)) || + nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, + opts->u.erspan.u.md2.dir)) + return -1; + break; + } + } + nla_nest_end(skb, nest); + + return 0; +} + +static int nft_tunnel_ports_dump(struct sk_buff *skb, + struct ip_tunnel_info *info) +{ + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 || + nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0) + return -1; + + return 0; +} + +static int nft_tunnel_flags_dump(struct sk_buff *skb, + struct ip_tunnel_info *info) +{ + u32 flags = 0; + + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + flags |= NFT_TUNNEL_F_DONT_FRAGMENT; + if (!(info->key.tun_flags & TUNNEL_CSUM)) + flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; + if (info->key.tun_flags & TUNNEL_SEQ) + flags |= NFT_TUNNEL_F_SEQ_NUMBER; + + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) + return -1; + + return 0; +} + +static int nft_tunnel_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + struct nft_tunnel_obj *priv = nft_obj_data(obj); + struct ip_tunnel_info *info = &priv->md->u.tun_info; + + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ID, + tunnel_id_to_key32(info->key.tun_id)) || + nft_tunnel_ip_dump(skb, info) < 0 || + nft_tunnel_ports_dump(skb, info) < 0 || + nft_tunnel_flags_dump(skb, info) < 0 || + nla_put_u8(skb, NFTA_TUNNEL_KEY_TOS, info->key.tos) || + nla_put_u8(skb, NFTA_TUNNEL_KEY_TTL, info->key.ttl) || + nft_tunnel_opts_dump(skb, priv) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static void 
nft_tunnel_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_tunnel_obj *priv = nft_obj_data(obj); + + metadata_dst_free(priv->md); +} + +static struct nft_object_type nft_tunnel_obj_type; +static const struct nft_object_ops nft_tunnel_obj_ops = { + .type = &nft_tunnel_obj_type, + .size = sizeof(struct nft_tunnel_obj), + .eval = nft_tunnel_obj_eval, + .init = nft_tunnel_obj_init, + .destroy = nft_tunnel_obj_destroy, + .dump = nft_tunnel_obj_dump, +}; + +static struct nft_object_type nft_tunnel_obj_type __read_mostly = { + .type = NFT_OBJECT_TUNNEL, + .ops = &nft_tunnel_obj_ops, + .maxattr = NFTA_TUNNEL_KEY_MAX, + .policy = nft_tunnel_key_policy, + .owner = THIS_MODULE, +}; + +static int __init nft_tunnel_module_init(void) +{ + return nft_register_obj(&nft_tunnel_obj_type); +} + +static void __exit nft_tunnel_module_exit(void) +{ + nft_unregister_obj(&nft_tunnel_obj_type); +} + +module_init(nft_tunnel_module_init); +module_exit(nft_tunnel_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL); -- cgit v1.2.3 From aaecfdb5c5dd8bac2dfd112166844a9f2d5711f0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Aug 2018 20:51:46 +0200 Subject: netfilter: nf_tables: match on tunnel metadata This patch allows us to match on the tunnel metadata that is available of the packet. We can use this to validate if the packet comes from/goes to tunnel and the corresponding tunnel ID. 
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 15 +++++ net/netfilter/nft_tunnel.c | 112 ++++++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3ee1198eeac1..357862d948de 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1647,4 +1647,19 @@ enum nft_tunnel_key_attributes { }; #define NFTA_TUNNEL_KEY_MAX (__NFTA_TUNNEL_KEY_MAX - 1) +enum nft_tunnel_keys { + NFT_TUNNEL_PATH, + NFT_TUNNEL_ID, + __NFT_TUNNEL_MAX +}; +#define NFT_TUNNEL_MAX (__NFT_TUNNEL_MAX - 1) + +enum nft_tunnel_attributes { + NFTA_TUNNEL_UNSPEC, + NFTA_TUNNEL_KEY, + NFTA_TUNNEL_DREG, + __NFTA_TUNNEL_MAX +}; +#define NFTA_TUNNEL_MAX (__NFTA_TUNNEL_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 715613d99c20..9332d7933dd5 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -12,6 +12,104 @@ #include #include +struct nft_tunnel { + enum nft_tunnel_keys key:8; + enum nft_registers dreg:8; +}; + +static void nft_tunnel_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_tunnel *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + struct ip_tunnel_info *tun_info; + + tun_info = skb_tunnel_info(pkt->skb); + + switch (priv->key) { + case NFT_TUNNEL_PATH: + nft_reg_store8(dest, !!tun_info); + break; + case NFT_TUNNEL_ID: + if (!tun_info) { + regs->verdict.code = NFT_BREAK; + return; + } + *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id)); + break; + default: + WARN_ON(1); + regs->verdict.code = NFT_BREAK; + } +} + +static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { + [NFTA_TUNNEL_KEY] = { .type = NLA_U32 }, + [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, +}; + +static int 
nft_tunnel_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_tunnel *priv = nft_expr_priv(expr); + u32 len; + + if (!tb[NFTA_TUNNEL_KEY] && + !tb[NFTA_TUNNEL_DREG]) + return -EINVAL; + + priv->key = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY])); + switch (priv->key) { + case NFT_TUNNEL_PATH: + len = sizeof(u8); + break; + case NFT_TUNNEL_ID: + len = sizeof(u32); + break; + default: + return -EOPNOTSUPP; + } + + priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]); + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +static int nft_tunnel_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_tunnel *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_TUNNEL_KEY, htonl(priv->key))) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_tunnel_type; +static const struct nft_expr_ops nft_tunnel_get_ops = { + .type = &nft_tunnel_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_tunnel)), + .eval = nft_tunnel_get_eval, + .init = nft_tunnel_get_init, + .dump = nft_tunnel_get_dump, +}; + +static struct nft_expr_type nft_tunnel_type __read_mostly = { + .name = "tunnel", + .ops = &nft_tunnel_get_ops, + .policy = nft_tunnel_policy, + .maxattr = NFTA_TUNNEL_MAX, + .owner = THIS_MODULE, +}; + struct nft_tunnel_opts { union { struct vxlan_metadata vxlan; @@ -442,12 +540,23 @@ static struct nft_object_type nft_tunnel_obj_type __read_mostly = { static int __init nft_tunnel_module_init(void) { - return nft_register_obj(&nft_tunnel_obj_type); + int err; + + err = nft_register_expr(&nft_tunnel_type); + if (err < 0) + return err; + + err = nft_register_obj(&nft_tunnel_obj_type); + if (err < 0) + nft_unregister_expr(&nft_tunnel_type); + + return err; } static void __exit nft_tunnel_module_exit(void) { 
nft_unregister_obj(&nft_tunnel_obj_type); + nft_unregister_expr(&nft_tunnel_type); } module_init(nft_tunnel_module_init); @@ -455,4 +564,5 @@ module_exit(nft_tunnel_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_EXPR("tunnel"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL); -- cgit v1.2.3 From 94276fa8a2a4c08ccb2e9d55e88b95dc972ccea3 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Fri, 3 Aug 2018 13:36:13 +0200 Subject: netfilter: bridge: Expose nf_tables bridge hook priorities through uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Netfilter exposes standard hook priorities in case of ipv4, ipv6 and arp but not in case of bridge. This patch exposes the hook priority values of the bridge family (which are different from the formerly mentioned) via uapi so that they can be used by user-space applications just like the others. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 11 ----------- include/uapi/linux/netfilter_bridge.h | 11 +++++++++++ net/bridge/br_netfilter_hooks.c | 1 + net/bridge/netfilter/ebtable_filter.c | 1 + net/bridge/netfilter/ebtable_nat.c | 1 + 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index b671fdfd212b..fa0686500970 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -5,17 +5,6 @@ #include #include -enum nf_br_hook_priorities { - NF_BR_PRI_FIRST = INT_MIN, - NF_BR_PRI_NAT_DST_BRIDGED = -300, - NF_BR_PRI_FILTER_BRIDGED = -200, - NF_BR_PRI_BRNF = 0, - NF_BR_PRI_NAT_DST_OTHER = 100, - NF_BR_PRI_FILTER_OTHER = 200, - NF_BR_PRI_NAT_SRC = 300, - NF_BR_PRI_LAST = INT_MAX, -}; - #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/include/uapi/linux/netfilter_bridge.h 
b/include/uapi/linux/netfilter_bridge.h index 12fb77633f83..156ccd089df1 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -26,4 +26,15 @@ #define NF_BR_BROUTING 5 #define NF_BR_NUMHOOKS 6 +enum nf_br_hook_priorities { + NF_BR_PRI_FIRST = INT_MIN, + NF_BR_PRI_NAT_DST_BRIDGED = -300, + NF_BR_PRI_FILTER_BRIDGED = -200, + NF_BR_PRI_BRNF = 0, + NF_BR_PRI_NAT_DST_OTHER = 100, + NF_BR_PRI_FILTER_OTHER = 200, + NF_BR_PRI_NAT_SRC = 300, + NF_BR_PRI_LAST = INT_MAX, +}; + #endif /* _UAPI__LINUX_BRIDGE_NETFILTER_H */ diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9b16eaf33819..6e0dc6bcd32a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index c41da5fac84f..550324c516ee 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -9,6 +9,7 @@ */ #include +#include #include #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 08df7406ecb3..c0fb3ca518af 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -9,6 +9,7 @@ */ #include +#include #include #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ -- cgit v1.2.3 From 445509eb9b00278b31c92f16b05260176a41c27f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 3 Aug 2018 13:35:36 +0200 Subject: netfilter: nf_tables: simplify NLM_F_CREATE handling * From nf_tables_newchain(), codepath provides context that allows us to infer if we are updating a chain (in that case, no module autoload is required) or adding a new one (then, module autoload is indeed needed). * We only need it in one single spot in nf_tables_newrule(). 
* Not needed for nf_tables_newset() at all. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index debc1680607c..67cdd5c4f4f5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1442,7 +1442,7 @@ struct nft_chain_hook { static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, - bool create) + bool autoload) { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; @@ -1467,7 +1467,7 @@ static int nft_chain_parse_hook(struct net *net, type = chain_type[family][NFT_CHAIN_T_DEFAULT]; if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], - family, create); + family, autoload); if (IS_ERR(type)) return PTR_ERR(type); } @@ -1534,7 +1534,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha } static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy, bool create) + u8 policy) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1552,7 +1552,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_chain_hook hook; struct nf_hook_ops *ops; - err = nft_chain_parse_hook(net, nla, &hook, family, create); + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1643,8 +1643,7 @@ err1: return err; } -static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, - bool create) +static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1661,7 +1660,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; err = 
nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, - create); + false); if (err < 0) return err; @@ -1761,9 +1760,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; - bool create; - - create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; lockdep_assert_held(&net->nft.commit_mutex); @@ -1828,10 +1824,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updchain(&ctx, genmask, policy, create); + return nf_tables_updchain(&ctx, genmask, policy); } - return nf_tables_addchain(&ctx, family, genmask, policy, create); + return nf_tables_addchain(&ctx, family, genmask, policy); } static int nf_tables_delchain(struct net *net, struct sock *nlsk, @@ -2529,13 +2525,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, struct nlattr *tmp; unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; - bool create; u64 handle, pos_handle; lockdep_assert_held(&net->nft.commit_mutex); - create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); @@ -2565,7 +2558,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, else return -EOPNOTSUPP; } else { - if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) + if (!(nlh->nlmsg_flags & NLM_F_CREATE) || + nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); @@ -3361,7 +3355,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_ctx ctx; char *name; unsigned int size; - bool create; u64 timeout; u32 ktype, dtype, flags, policy, gc_int, objtype; struct nft_set_desc desc; @@ -3462,8 +3455,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } - create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); -- cgit v1.2.3 From 7bdfcea875ad42b6fd00413882fbc657c751f13a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Aug 2018 17:56:12 +0200 Subject: netfilter: kconfig: remove ct zone/label dependencies connection tracking zones currently depend on the xtables CT target. The reasoning was that it makes no sense to support zones if they can't be configured (which needed CT target). Nowadays zones can also be used by OVS and configured via nftables, so remove the dependency. connection tracking labels are handled via hidden dependency that gets auto-selected by the connlabel match. Make it a visible knob, as labels can be attached via ctnetlink or via nftables rules (nft_ct expression) too. This allows to use conntrack labels and zones with nftables-only build. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 654588088676..71709c104081 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -106,7 +106,6 @@ config NF_CONNTRACK_SECMARK config NF_CONNTRACK_ZONES bool 'Connection tracking zones' depends on NETFILTER_ADVANCED - depends on NETFILTER_XT_TARGET_CT help This option enables support for connection tracking zones. Normally, each connection needs to have a unique system wide @@ -158,10 +157,11 @@ config NF_CONNTRACK_TIMESTAMP If unsure, say `N'. config NF_CONNTRACK_LABELS - bool + bool "Connection tracking labels" help This option enables support for assigning user-defined flag bits - to connection tracking entries. It selected by the connlabel match. + to connection tracking entries. It can be used with xtables connlabel + match and the nftables ct expression. 
config NF_CT_PROTO_DCCP bool 'DCCP protocol connection tracking support' -- cgit v1.2.3 From 020f6cc5f75511c5974cfd454f224365bc0c2df4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Aug 2018 18:40:21 +0200 Subject: netfilter: conntrack: avoid use-after free on rmmod When the conntrack module is removed, we call nf_ct_iterate_destroy via nf_ct_l4proto_unregister(). Problem is that nf_conntrack_proto_fini() gets called after the conntrack hash table has already been freed. Just remove the l4proto unregister call, it's unnecessary as the nf_ct_protos[] array gets free'd right after anyway. v2: add comment wrt. missing unreg call. Fixes: a0ae2562c6c4b2 ("netfilter: conntrack: remove l3proto abstraction") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 803607a90102..30070732ee50 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -940,14 +940,13 @@ void nf_conntrack_proto_fini(void) { unsigned int i; - nf_ct_l4proto_unregister(builtin_l4proto, - ARRAY_SIZE(builtin_l4proto)); nf_unregister_sockopt(&so_getorigdst); #if IS_ENABLED(CONFIG_IPV6) nf_unregister_sockopt(&so_getorigdst6); #endif - - /* free l3proto protocol tables */ + /* No need to call nf_ct_l4proto_unregister(), the register + * tables are free'd here anyway.
+ */ for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) kfree(nf_ct_protos[i]); } -- cgit v1.2.3 From eb9950eb31f56e57582a61c92073336d04a26542 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Aug 2018 17:06:56 +0100 Subject: rxrpc: Push iov_iter up from rxrpc_kernel_recv_data() to caller Push iov_iter up from rxrpc_kernel_recv_data() to its caller to allow non-contiguous iovs to be passed down, thereby permitting file reading to be simplified in the AFS filesystem in a future patch. Signed-off-by: David Howells Signed-off-by: David S. Miller --- fs/afs/rxrpc.c | 28 +++++++++++++++++----------- include/net/af_rxrpc.h | 2 +- net/rxrpc/recvmsg.c | 33 +++++++++++---------------------- 3 files changed, 29 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a1b18082991b..19db5f672a9d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -346,7 +346,6 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; - size_t offset; s64 tx_total_len; int ret; @@ -433,10 +432,10 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { - offset = 0; - rxrpc_kernel_recv_data(call->net->socket, rxcall, NULL, - 0, &offset, false, &call->abort_code, - &call->service_id); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0); + rxrpc_kernel_recv_data(call->net->socket, rxcall, + &msg.msg_iter, false, + &call->abort_code, &call->service_id); ac->abort_code = call->abort_code; ac->responded = true; } @@ -467,13 +466,14 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { - size_t offset = 0; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, - call->rxcall, - NULL, 0, &offset, false, + call->rxcall, &iter, false, &remote_abort, &call->service_id); - 
trace_afs_recv_data(call, 0, offset, false, ret); + trace_afs_recv_data(call, 0, 0, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; @@ -894,6 +894,8 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, bool want_more) { struct afs_net *net = call->net; + struct iov_iter iter; + struct kvec iov; enum afs_call_state state; u32 remote_abort = 0; int ret; @@ -903,10 +905,14 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, ASSERTCMP(call->offset, <=, count); - ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, - buf, count, &call->offset, + iov.iov_base = buf + call->offset; + iov.iov_len = count - call->offset; + iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset); + + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter, want_more, &remote_abort, &call->service_id); + call->offset += (count - call->offset) - iov_iter_count(&iter); trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 8ae8ee004258..f53edb3754bc 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -61,7 +61,7 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, - void *, size_t, size_t *, bool, u32 *, u16 *); + struct iov_iter *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a57ea96c84ea..816b19a78809 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -611,9 +611,7 @@ wait_error: * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info * @sock: The socket that the call exists on * @call: The call to send data through - * @buf: The buffer to 
receive into - * @size: The size of the buffer, including data already read - * @_offset: The running offset into the buffer. + * @iter: The buffer to receive into * @want_more: True if more data is expected to be read * @_abort: Where the abort code is stored if -ECONNABORTED is returned * @_service: Where to store the actual service ID (may be upgraded) @@ -626,39 +624,30 @@ wait_error: * Note that we may return -EAGAIN to drain empty packets at the end of the * data, even if we've already copied over the requested data. * - * This function adds the amount it transfers to *_offset, so this should be - * precleared as appropriate. Note that the amount remaining in the buffer is - * taken to be size - *_offset. - * * *_abort should also be initialised to 0. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, - void *buf, size_t size, size_t *_offset, + struct iov_iter *iter, bool want_more, u32 *_abort, u16 *_service) { - struct iov_iter iter; - struct kvec iov; + size_t offset = 0; int ret; - _enter("{%d,%s},%zu/%zu,%d", + _enter("{%d,%s},%zu,%d", call->debug_id, rxrpc_call_states[call->state], - *_offset, size, want_more); + iov_iter_count(iter), want_more); - ASSERTCMP(*_offset, <=, size); ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); - iov.iov_base = buf + *_offset; - iov.iov_len = size - *_offset; - iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); - mutex_lock(&call->user_mutex); switch (READ_ONCE(call->state)) { case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, - _offset); + ret = rxrpc_recvmsg_data(sock, call, NULL, iter, + iov_iter_count(iter), 0, + &offset); if (ret < 0) goto out; @@ -667,7 +656,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, * full buffer or have been given -EAGAIN. 
*/ if (ret == 1) { - if (*_offset < size) + if (iov_iter_count(iter) > 0) goto short_data; if (!want_more) goto read_phase_complete; @@ -704,7 +693,7 @@ out: if (_service) *_service = call->service_id; mutex_unlock(&call->user_mutex); - _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); + _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort); return ret; short_data: @@ -720,7 +709,7 @@ call_complete: ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { ret = 1; - if (size > 0) + if (iov_iter_count(iter) > 0) ret = -ECONNRESET; } goto out; -- cgit v1.2.3 From 483f3fdcc70b3c3a1f314235ab0066f3dbd4cfbe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Aug 2018 00:31:48 +0200 Subject: netfilter: nft_tunnel: fix sparse errors [...] net/netfilter/nft_tunnel.c:117:25: expected unsigned int [unsigned] [usertype] flags net/netfilter/nft_tunnel.c:117:25: got restricted __be16 [usertype] [...] net/netfilter/nft_tunnel.c:246:33: expected restricted __be16 [addressable] [assigned] [usertype] tp_dst net/netfilter/nft_tunnel.c:246:33: got int Fixes: af308b94a2a4 ("netfilter: nf_tables: add tunnel support") Reported-by: kbuild test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 9332d7933dd5..3a15f219e4e7 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -116,7 +116,7 @@ struct nft_tunnel_opts { struct erspan_metadata erspan; } u; u32 len; - u32 flags; + __be16 flags; }; struct nft_tunnel_obj { @@ -337,12 +337,10 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, } if (tb[NFTA_TUNNEL_KEY_SPORT]) { - info.key.tp_src = - ntohs(nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT])); + info.key.tp_src = nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT]); } if (tb[NFTA_TUNNEL_KEY_DPORT]) { - info.key.tp_dst = - ntohs(nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT])); + 
info.key.tp_dst = nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT]); } if (tb[NFTA_TUNNEL_KEY_FLAGS]) { -- cgit v1.2.3 From 51f7e95187f127d5eadf50541943813ff57f12ba Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 3 Aug 2018 17:24:53 -0400 Subject: af_unix: ensure POLLOUT on remote close() for connected dgram socket Applications use -ECONNREFUSED as returned from write() in order to determine that a socket should be closed. However, when using connected dgram unix sockets in a poll/write loop, a final POLLOUT event can be missed when the remote end closes. Thus, the poll is stuck forever: thread 1 (client) thread 2 (server) connect() to server write() returns -EAGAIN unix_dgram_poll() -> unix_recvq_full() is true close() ->unix_release_sock() ->wake_up_interruptible_all() unix_dgram_poll() (due to the wake_up_interruptible_all) -> unix_recvq_full() still is true ->free all skbs Now thread 1 is stuck and will not receive anymore wakeups. In this case, when thread 1 gets the -EAGAIN, it has not queued any skbs otherwise the 'free all skbs' step would in fact cause a wakeup and a POLLOUT return. So the race here is probably fairly rare because it means there are no skbs that thread 1 queued and that thread 1 schedules before the 'free all skbs' step. This issue was reported as a hang when /dev/log is closed. The fix is to signal POLLOUT if the socket is marked as SOCK_DEAD, which means a subsequent write() will get -ECONNREFUSED. Reported-by: Ian Lance Taylor Cc: David Rientjes Cc: Rainer Weikusat Cc: Eric Dumazet Signed-off-by: Jason Baron Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1772a0e32665..d1edfa3cad61 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -430,7 +430,12 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) connected = unix_dgram_peer_wake_connect(sk, other); - if (unix_recvq_full(other)) + /* If other is SOCK_DEAD, we want to make sure we signal + * POLLOUT, such that a subsequent write() can get a + * -ECONNREFUSED. Otherwise, if we haven't queued any skbs + * to other and its full, we will hang waiting for POLLOUT. + */ + if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) -- cgit v1.2.3 From a01512b14d4faa9f6f7501201d7033216d2e563a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 3 Aug 2018 16:28:48 +0800 Subject: tcp: remove unneeded variable 'err' Variable 'err' is unmodified after initialization, so simply clean it up and return 0. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 31fa1c080f28..b8af2fec5ad5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2531,7 +2531,6 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int err = 0; int old_state = sk->sk_state; if (old_state != TCP_CLOSE) @@ -2612,7 +2611,7 @@ int tcp_disconnect(struct sock *sk, int flags) } sk->sk_error_report(sk); - return err; + return 0; } EXPORT_SYMBOL(tcp_disconnect); -- cgit v1.2.3 From 07d53ae4fbdf7458f4d51249aa24d75c76fe52a8 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 4 Aug 2018 19:41:41 +0800 Subject: net: Remove some unneeded semicolon These semicolons are not needed. Just remove them.
Signed-off-by: zhong jiang Signed-off-by: David S. Miller --- net/core/utils.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/packet/af_packet.c | 4 ++-- net/sunrpc/auth_gss/auth_gss.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index d47863b07a60..2a597ac7808e 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -397,7 +397,7 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, break; default: pr_err("unexpected address family %d\n", af); - }; + } return ret; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index dd21782e2f12..62eefea48973 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -134,7 +134,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) } else { atomic_set(&ipvs->dropentry, 0); ipvs->sysctl_drop_entry = 1; - }; + } break; case 3: atomic_set(&ipvs->dropentry, 1); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e3e00d3a972e..345e38058ae5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1582,7 +1582,7 @@ static int fanout_set_data(struct packet_sock *po, char __user *data, return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; - }; + } } static void fanout_release_data(struct packet_fanout *f) @@ -1591,7 +1591,7 @@ static void fanout_release_data(struct packet_fanout *f) case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); - }; + } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index be8f103d22fd..0fc397fae42b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -517,7 +517,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); if (err) goto err_put_pipe_version; - }; + } kref_get(&gss_auth->kref); return 
gss_msg; err_put_pipe_version: -- cgit v1.2.3 From 5f379ef51bc967567bbddacdcdecb772d4d7c3b3 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Thu, 2 Aug 2018 13:56:58 +0200 Subject: ipv6: icmp: Updating pmtu for link local route When a ICMPV6_PKT_TOOBIG is received from a link local address the pmtu will be updated on a route with an arbitrary interface index. Subsequent packets sent back to the same link local address may therefore end up not considering the updated pmtu. Current behavior breaks TAHI v6LC4.1.4 Reduce PMTU On-link. Referring to RFC 1981: Section 3: "Note that Path MTU Discovery must be performed even in cases where a node "thinks" a destination is attached to the same link as itself. In a situation such as when a neighboring router acts as proxy [ND] for some destination, the destination can to appear to be directly connected but is in fact more than one hop away." Using the interface index from the incoming ICMPV6_PKT_TOOBIG when updating the pmtu. Signed-off-by: Georg Kohmann Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 00d159d431dc..7f6b1f81c200 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) - ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); -- cgit v1.2.3 From cfb4099fb4c101dad283a163c9525240ef4a1a99 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Thu, 2 Aug 2018 20:43:10 +0530 Subject: net/tls: Mark the end in scatterlist table Function zerocopy_from_iter() unmarks the 'end' in input sgtable while adding new entries in it. 
The last entry in sgtable remained unmarked. This results in a KASAN error report when using APIs like sg_nents(). Before returning, the function needs to mark the 'end' in the last entry it adds. Signed-off-by: Vakul Garg Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ff3a6904a722..83d67df33f0c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -311,6 +311,9 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } } + /* Mark the end in the last sg entry if newly added */ + if (num_elem > *pages_used) + sg_mark_end(&to[num_elem - 1]); out: if (rc) iov_iter_revert(from, size - *size_used); -- cgit v1.2.3 From 7969e5c40dfd04799d4341f1b7cd266b6e47f227 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:37 +0000 Subject: ip: discard IPv4 datagrams with overlapping segments. This behavior is required in IPv6, and there is little need to tolerate overlapping fragments in IPv4. This change simplifies the code and eliminates potential DDoS attack vectors. Tested: ran ip_defrag selftest (not yet available upstream). Suggested-by: David S. Miller Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Acked-by: Stephen Hemminger Signed-off-by: David S.
Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/ip_fragment.c | 75 ++++++++++++----------------------------------- net/ipv4/proc.c | 1 + 3 files changed, 21 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index e5ebc83827ab..f80135e5feaa 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -56,6 +56,7 @@ enum IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */ IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */ IPSTATS_MIB_CEPKTS, /* InCEPkts */ + IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */ __IPSTATS_MIB_MAX }; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d14d741fb05e..960bf5eab59f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -277,6 +277,7 @@ static int ip_frag_reinit(struct ipq *qp) /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct sk_buff *prev, *next; struct net_device *dev; unsigned int fragsize; @@ -357,65 +358,23 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } found: - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * We do the same here for IPv4. */ - if (prev) { - int i = (prev->ip_defrag_offset + prev->len) - offset; - if (i > 0) { - offset += i; - err = -EINVAL; - if (end <= offset) - goto err; - err = -ENOMEM; - if (!pskb_pull(skb, i)) - goto err; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } + /* Is there an overlap with the previous fragment? 
*/ + if (prev && + (prev->ip_defrag_offset + prev->len) > offset) + goto discard_qp; - err = -ENOMEM; - - while (next && next->ip_defrag_offset < end) { - int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ - - if (i < next->len) { - int delta = -next->truesize; - - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - if (!pskb_pull(next, i)) - goto err; - delta += next->truesize; - if (delta) - add_frag_mem_limit(qp->q.net, delta); - next->ip_defrag_offset += i; - qp->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragment is completely overridden with - * new one drop it. - */ - next = next->next; - - if (prev) - prev->next = next; - else - qp->q.fragments = next; - - qp->q.meat -= free_it->len; - sub_frag_mem_limit(qp->q.net, free_it->truesize); - kfree_skb(free_it); - } - } + /* Is there an overlap with the next fragment? 
*/ + if (next && next->ip_defrag_offset < end) + goto discard_qp; /* Note : skb->ip_defrag_offset and skb->dev share the same location */ dev = skb->dev; @@ -463,6 +422,10 @@ found: skb_dst_drop(skb); return -EINPROGRESS; +discard_qp: + inet_frag_kill(&qp->q); + err = -EINVAL; + __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); err: kfree_skb(skb); return err; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b46e4cf9a55a..70289682a670 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), + SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), SNMP_MIB_SENTINEL }; -- cgit v1.2.3 From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:38 +0000 Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs. Tested: see the next patch is the series. Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fd3cb1b247df..47848367c816 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2585,7 +2585,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -void skb_rbtree_purge(struct rb_root *root); +unsigned int skb_rbtree_purge(struct rb_root *root); void *netdev_alloc_frag(unsigned int fragsz); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 51b0a9126e12..8d574a88125d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2858,23 +2858,27 @@ EXPORT_SYMBOL(skb_queue_purge); /** * skb_rbtree_purge - empty a skb rbtree * @root: root of the rbtree to empty + * Return value: the sum of truesizes of all purged skbs. * * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from * the list and one reference dropped. This function does not take * any lock. Synchronization should be handled by the caller (e.g., TCP * out-of-order queue is protected by the socket lock). */ -void skb_rbtree_purge(struct rb_root *root) +unsigned int skb_rbtree_purge(struct rb_root *root) { struct rb_node *p = rb_first(root); + unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); + sum += skb->truesize; kfree_skb(skb); } + return sum; } /** -- cgit v1.2.3 From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:39 +0000 Subject: ip: use rb trees for IP frag queue. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. 
Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn Reported-by: Juha-Matti Tilli Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +- include/net/inet_frag.h | 3 +- net/ipv4/inet_fragment.c | 16 +-- net/ipv4/ip_fragment.c | 182 ++++++++++++++++++-------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + net/ipv6/reassembly.c | 1 + 6 files changed, 121 insertions(+), 91 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47848367c816..7ebdf158a795 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,16 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; - int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; }; - struct sock *sk; + + union { + struct sock *sk; + int ip_defrag_offset; + }; union { ktime_t tstamp; diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index f4272a29dc44..b86d14528188 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -75,7 +75,8 @@ struct inet_frag_queue { struct timer_list timer; spinlock_t lock; refcount_t refcnt; - struct sk_buff *fragments; + struct sk_buff *fragments; /* Used in IPv6. 
*/ + struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; ktime_t stamp; int len; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index ccd140e4082d..6d258a5669e7 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = q->fragments; nf = q->net; f = nf->f; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; + if (fp) { + do { + struct sk_buff *xp = fp->next; + + sum_truesize += fp->truesize; + kfree_skb(fp); + fp = xp; + } while (fp); + } else { + sum_truesize = skb_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 960bf5eab59f..0e8f8de77e71 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; - struct sk_buff *head; + struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; @@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - - head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + if (!qp->q.flags & INET_FRAG_FIRST_IN) goto out; + /* sk_buff::dev and sk_buff::rbnode are unionized. So we + * pull the head out of the tree in order to be able to + * deal with head->dev. 
+ */ + if (qp->q.fragments) { + head = qp->q.fragments; + qp->q.fragments = head->next; + } else { + head = skb_rb_first(&qp->q.rb_fragments); + if (!head) + goto out; + rb_erase(&head->rbnode, &qp->q.rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == qp->q.fragments_tail) + qp->q.fragments_tail = NULL; + + sub_frag_mem_limit(qp->q.net, head->truesize); + head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - skb_get(head); spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - kfree_skb(head); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); + if (head) + kfree_skb(head); ipq_put(qp); } @@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp) end = atomic_inc_return(&peer->rid); qp->rid = end; - rc = qp->q.fragments && (end - start) > max; + rc = qp->q.fragments_tail && (end - start) > max; if (rc) { struct net *net; @@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { - struct sk_buff *fp; unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { @@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - fp = qp->q.fragments; - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); + sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->iif = 0; qp->ecn = 0; @@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - 
struct sk_buff *prev, *next; + struct rb_node **rbn, *parent; + struct sk_buff *skb1; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (err) goto err; - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = qp->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = qp->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); -found: /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * - * We do the same here for IPv4. + * We do the same here for IPv4 (and increment an snmp counter). */ - /* Is there an overlap with the previous fragment? */ - if (prev && - (prev->ip_defrag_offset + prev->len) > offset) - goto discard_qp; - - /* Is there an overlap with the next fragment? */ - if (next && next->ip_defrag_offset < end) - goto discard_qp; + /* Find out where to put this fragment. */ + skb1 = qp->q.fragments_tail; + if (!skb1) { + /* This is the first fragment we've received. */ + rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); + qp->q.fragments_tail = skb; + } else if ((skb1->ip_defrag_offset + skb1->len) < end) { + /* This is the common/special case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < (skb1->ip_defrag_offset + skb1->len)) + goto discard_qp; + /* Insert after skb1. 
*/ + rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); + qp->q.fragments_tail = skb; + } else { + /* Binary search. Note that skb can become the first fragment, but + * not the last (covered above). */ + rbn = &qp->q.rb_fragments.rb_node; + do { + parent = *rbn; + skb1 = rb_to_skb(parent); + if (end <= skb1->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= skb1->ip_defrag_offset + skb1->len) + rbn = &parent->rb_right; + else /* Found an overlap with skb1. */ + goto discard_qp; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. */ + rb_link_node(&skb->rbnode, parent, rbn); + } + rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ - dev = skb->dev; if (dev) qp->iif = dev->ifindex; - /* Makes sure compiler wont do silly aliasing games */ - barrier(); skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - qp->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - qp->q.fragments = skb; - qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -414,7 +425,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, prev, dev); + err = ip_frag_reasm(qp, skb, dev); skb->_skb_refdst = orefdst; return err; } @@ -431,15 +442,15 @@ err: return err; } - /* Build a new IP datagram from all its fragments. */ - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = qp->q.fragments; + struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); + struct sk_buff **nextp; /* To build frag_list. 
*/ + struct rb_node *rbn; int len; int ihlen; int err; @@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_fail; } /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); if (!fp) goto out_nomem; - - fp->next = head->next; - if (!fp->next) + rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); + if (qp->q.fragments_tail == skb) qp->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, qp->q.fragments); - head->next = qp->q.fragments->next; - - consume_skb(qp->q.fragments); - qp->q.fragments = head; + skb_morph(skb, head); + rb_replace_node(&head->rbnode, &skb->rbnode, + &qp->q.rb_fragments); + consume_skb(head); + head = skb; } - WARN_ON(!head); WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ @@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; - clone->next = head->next; - head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; + skb->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { + /* Traverse the tree in order, to build frag_list. 
*/ + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &qp->q.rb_fragments); + while (rbn) { + struct rb_node *rbnext = rb_next(rbn); + fp = rb_to_skb(rbn); + rb_erase(rbn, &qp->q.rb_fragments); + rbn = rbnext; + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) @@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } sub_frag_mem_limit(qp->q.net, head->truesize); + *nextp = NULL; head->next = NULL; + head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); @@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0610bdab721c..38d69ef516d5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic head->csum); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return true; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 6edd2ac8ae4b..b4e558ab39fa 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return 1; -- cgit v1.2.3 From 0ed4229b08c13c84a3c301a08defdc9e7f4467e6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Aug 2018 02:22:20 +0200 Subject: ipv6: defrag: drop non-last frags smaller than min mtu 
don't bother with pathological cases, they only waste cycles. IPv6 requires a minimum MTU of 1280 so we should never see fragments smaller than this (except last frag). v3: don't use awkward "-offset + len" v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68). There were concerns that there could be even smaller frags generated by intermediate nodes, e.g. on radio networks. Cc: Peter Oskolkov Cc: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++++ net/ipv6/reassembly.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 38d69ef516d5..2a14d8b65924 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -558,6 +558,10 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && + fhdr->frag_off & htons(IP6_MF)) + return -EINVAL; + skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b4e558ab39fa..5c5b4f79296e 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -456,6 +456,10 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } + if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && + fhdr->frag_off & htons(IP6_MF)) + goto fail_hdr; + iif = skb->dev ? 
skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { -- cgit v1.2.3 From a6bcfc89694ed8cb482a82cdc8b93aae63a8b691 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 3 Aug 2018 15:45:21 +0800 Subject: net: check extack._msg before print dev_set_mtu_ext is able to fail with a valid mtu value, at that condition, extack._msg is not set and random since it is in stack, then kernel will crash when print it. Fixes: 7a4c53bee3324a ("net: report invalid mtu value via netlink extack") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 36e994519488..f68122f0ab02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7583,8 +7583,9 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) struct netlink_ext_ack extack; int err; + memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); - if (err) + if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } -- cgit v1.2.3 From 1cbc36a53b60d43daa686280385b1ddbe51d5809 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Aug 2018 22:27:55 +0300 Subject: net: sched: cls_flower: Fix an error code in fl_tmplt_create() We forgot to set the error code on this path, so we return NULL instead of an error pointer. In the current code kzalloc() won't fail for small allocations so this doesn't really affect runtime. Fixes: b95ec7eb3b4d ("net: sched: cls_flower: implement chain templates") Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e8bd08ba998a..a3b69bb6f4b0 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1250,8 +1250,10 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, goto errout_tb; tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL); - if (!tmplt) + if (!tmplt) { + err = -ENOMEM; goto errout_tb; + } tmplt->chain = chain; err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); if (err) -- cgit v1.2.3 From ac74f87c789af40936a80131c4759f3e72579c3a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 14 Jul 2018 12:52:10 -0400 Subject: net: 6lowpan: fix reserved space for single frames This patch adds handling to take care of tailroom and headroom for single 6lowpan frames. We need to be sure we have a skb with the right head and tailroom for single frames. This patch does it by using skb_copy_expand() if the upper layer did not allocate enough head and tailroom.
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195059 Reported-by: David Palma Reported-by: Rabi Narayan Sahoo Cc: stable@vger.kernel.org Signed-off-by: Alexander Aring Signed-off-by: Stefan Schmidt --- net/ieee802154/6lowpan/tx.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index e6ff5128e61a..ca53efa17be1 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -265,9 +265,24 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - return NET_XMIT_DROP; + if (unlikely(skb_headroom(skb) < ldev->needed_headroom || + skb_tailroom(skb) < ldev->needed_tailroom)) { + struct sk_buff *nskb; + + nskb = skb_copy_expand(skb, ldev->needed_headroom, + ldev->needed_tailroom, GFP_ATOMIC); + if (likely(nskb)) { + consume_skb(skb); + skb = nskb; + } else { + kfree_skb(skb); + return NET_XMIT_DROP; + } + } else { + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + } ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset); if (ret < 0) { -- cgit v1.2.3 From f9c52831133050c6b82aa8b6831c92da2bbf2a0b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 2 Jul 2018 16:32:03 -0400 Subject: net: mac802154: tx: expand tailroom if necessary This patch is necessary in case of AF_PACKET or another socket interface I am aware of that didn't allocate the necessary room.
Reported-by: David Palma Reported-by: Rabi Narayan Sahoo Cc: stable@vger.kernel.org Signed-off-by: Alexander Aring Signed-off-by: Stefan Schmidt --- net/mac802154/tx.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 7e253455f9dd..bcd1a5e6ebf4 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -63,8 +63,21 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) int ret; if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) { - u16 crc = crc_ccitt(0, skb->data, skb->len); + struct sk_buff *nskb; + u16 crc; + + if (unlikely(skb_tailroom(skb) < IEEE802154_FCS_LEN)) { + nskb = skb_copy_expand(skb, 0, IEEE802154_FCS_LEN, + GFP_ATOMIC); + if (likely(nskb)) { + consume_skb(skb); + skb = nskb; + } else { + goto err_tx; + } + } + crc = crc_ccitt(0, skb->data, skb->len); put_unaligned_le16(crc, skb_put(skb, 2)); } -- cgit v1.2.3 From 4e54acb2022099e41231271b5a8ec58ca8ac2d86 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 31 Jul 2018 16:45:07 +0100 Subject: net: ieee802154: 6lowpan: remove redundant pointers 'fq' and 'net' Pointers fq and net are being assigned but are never used hence they are redundant and can be removed. 
Cleans up clang warnings: warning: variable 'fq' set but not used [-Wunused-but-set-variable] warning: variable 'net' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Stefan Schmidt --- net/ieee802154/6lowpan/reassembly.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index ec7a5da56129..e7857a8ac86d 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -40,9 +40,6 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { const struct frag_lowpan_compare_key *key = a; - struct lowpan_frag_queue *fq; - - fq = container_of(q, struct lowpan_frag_queue, q); BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); memcpy(&q->key, key, sizeof(*key)); @@ -52,10 +49,8 @@ static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags); spin_lock(&fq->q.lock); -- cgit v1.2.3 From ad3e0b2f3c9483fa79bb4148464dba52ce33ae46 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 6 Aug 2018 19:08:51 +0800 Subject: Bluetooth: remove redundant variables 'adv_set' and 'cp' Variables 'adv_set' and 'cp' are being assigned but are never used hence they are redundant and can be removed. 
Cleans up clang warnings: net/bluetooth/hci_event.c:1135:29: warning: variable 'adv_set' set but not used [-Wunused-but-set-variable] net/bluetooth/mgmt.c:3359:39: warning: variable 'cp' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Signed-off-by: Johan Hedberg --- net/bluetooth/hci_event.c | 3 --- net/bluetooth/mgmt.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 754714c8d752..8078587572fe 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1132,7 +1132,6 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_cp_le_set_ext_adv_enable *cp; - struct hci_cp_ext_adv_set *adv_set; __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1144,8 +1143,6 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, if (!cp) return; - adv_set = (void *) cp->data; - hci_dev_lock(hdev); if (cp->enable) { diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 231602f7cb66..3bdc8f3ca259 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3356,7 +3356,6 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) static void set_default_phy_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { - struct mgmt_cp_set_phy_confguration *cp; struct mgmt_pending_cmd *cmd; BT_DBG("status 0x%02x", status); @@ -3367,8 +3366,6 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status, if (!cmd) goto unlock; - cp = cmd->param; - if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, -- cgit v1.2.3 From 9c2e955c48363a6a000a684aa49be7f4ac1120ad Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Mon, 6 Aug 2018 11:07:23 +0800 Subject: net/bridge/br_multicast: remove redundant variable "err" The err variable is not modified after initialization, so remove it and make the function return void.
Signed-off-by: zhong jiang Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 920665dd92db..20ed7adcf1cc 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1423,10 +1423,10 @@ static void br_multicast_query_received(struct net_bridge *br, br_multicast_mark_router(br, port); } -static int br_ip4_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, - struct sk_buff *skb, - u16 vid) +static void br_ip4_multicast_query(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb, + u16 vid) { const struct iphdr *iph = ip_hdr(skb); struct igmphdr *ih = igmp_hdr(skb); @@ -1439,7 +1439,6 @@ static int br_ip4_multicast_query(struct net_bridge *br, unsigned long now = jiffies; unsigned int offset = skb_transport_offset(skb); __be32 group; - int err = 0; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || @@ -1498,7 +1497,6 @@ static int br_ip4_multicast_query(struct net_bridge *br, out: spin_unlock(&br->multicast_lock); - return err; } #if IS_ENABLED(CONFIG_IPV6) @@ -1828,7 +1826,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb_trimmed, vid); + br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid, src); -- cgit v1.2.3 From 9dae34978d83df06fc59aff5cf0d88ce41b80643 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 6 Aug 2018 11:57:02 +0800 Subject: net: avoid unnecessary sock_flag() check when enable timestamp The sock_flag() check is already inside sock_enable_timestamp(), so checking it in the caller is unnecessary.
void sock_enable_timestamp(struct sock *sk, int flag) { if (!sock_flag(sk, flag)) { ... } } Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- net/compat.c | 6 ++---- net/core/sock.c | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 7242cce5631b..3b2105f6549d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -466,8 +466,7 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) ctv = (struct compat_timeval __user *) userstamp; err = -ENOENT; - if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk, SOCK_TIMESTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); tv = ktime_to_timeval(sk->sk_stamp); if (tv.tv_sec == -1) return err; @@ -494,8 +493,7 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta ctv = (struct compat_timespec __user *) userstamp; err = -ENOENT; - if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk, SOCK_TIMESTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec(sk->sk_stamp); if (ts.tv_sec == -1) return err; diff --git a/net/core/sock.c b/net/core/sock.c index e31233f5ba39..3730eb855095 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2900,8 +2900,8 @@ EXPORT_SYMBOL(lock_sock_fast); int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) { struct timeval tv; - if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk, SOCK_TIMESTAMP); + + sock_enable_timestamp(sk, SOCK_TIMESTAMP); tv = ktime_to_timeval(sk->sk_stamp); if (tv.tv_sec == -1) return -ENOENT; @@ -2916,8 +2916,8 @@ EXPORT_SYMBOL(sock_get_timestamp); int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) { struct timespec ts; - if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk, SOCK_TIMESTAMP); + + sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec(sk->sk_stamp); if (ts.tv_sec == -1) return -ENOENT; -- cgit v1.2.3 From 
70837ffe3085c9a91488b52ca13ac84424da1042 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Aug 2018 22:17:35 +0300 Subject: ipv4: frags: precedence bug in ip_expire() We accidentally removed the parentheses here, but they are required because '!' has higher precedence than '&'. Fixes: fa0f527358bd ("ip: use rb trees for IP frag queue.") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 0e8f8de77e71..7cb7ed761d8c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -154,7 +154,7 @@ static void ip_expire(struct timer_list *t) __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!qp->q.flags & INET_FRAG_FIRST_IN) + if (!(qp->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we -- cgit v1.2.3 From 35a8a3bd1c2e29bb6baec501c6f56abaaa10a48a Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 7 Aug 2018 11:43:02 +0200 Subject: netfilter: nft_osf: use NFT_OSF_MAXGENRELEN instead of IFNAMSIZ As no "genre" on pf.os exceed 16 bytes of length, we reduce NFT_OSF_MAXGENRELEN parameter to 16 bytes and use it instead of IFNAMSIZ. 
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nft_osf.c | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 357862d948de..94657c701f22 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -8,6 +8,7 @@ #define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_OBJ_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_USERDATA_MAXLEN 256 +#define NFT_OSF_MAXGENRELEN 16 /** * enum nft_registers - nf_tables registers diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 9b2f3de7be4f..5af74b37f423 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -4,8 +4,6 @@ #include #include -#define OSF_GENRE_SIZE 32 - struct nft_osf { enum nft_registers dreg:8; }; @@ -37,9 +35,9 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, os_name = nf_osf_find(skb, nf_osf_fingers); if (!os_name) - strncpy((char *)dest, "unknown", IFNAMSIZ); + strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); else - strncpy((char *)dest, os_name, IFNAMSIZ); + strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN); } static int nft_osf_init(const struct nft_ctx *ctx, @@ -51,7 +49,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFTA_DATA_VALUE, OSF_GENRE_SIZE); + NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); if (err < 0) return err; -- cgit v1.2.3 From 4e665afbd7bee29b44b5d22821b56207f8459e39 Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Tue, 7 Aug 2018 17:14:10 +0200 Subject: netfilter: cttimeout: move ctnl_untimeout to nf_conntrack As, ctnl_untimeout is required by nft_ct, so move ctnl_timeout from nfnetlink_cttimeout to nf_conntrack_timeout and rename as nf_ct_timeout. 
Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 1 + net/netfilter/nf_conntrack_timeout.c | 17 +++++++++++++++++ net/netfilter/nfnetlink_cttimeout.c | 20 ++------------------ 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 80ceb3d0291d..7a21bc0f00eb 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -83,6 +83,7 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); +void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout); #else static inline int nf_conntrack_timeout_init(void) { diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 46aee65f339b..401c2cce4a61 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -31,6 +31,23 @@ EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); +static int untimeout(struct nf_conn *ct, void *timeout) +{ + struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); + + if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + + /* We are not intended to delete this conntrack. 
*/ + return 0; +} + +void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout) +{ + nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); +} +EXPORT_SYMBOL_GPL(nf_ct_untimeout); + static const struct nf_ct_ext_type timeout_extend = { .len = sizeof(struct nf_conn_timeout), .align = __alignof__(struct nf_conn_timeout), diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 4199e5300575..df53aef2d642 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -297,22 +297,6 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, return ret; } -static int untimeout(struct nf_conn *ct, void *timeout) -{ - struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); - - if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) - RCU_INIT_POINTER(timeout_ext->timeout, NULL); - - /* We are not intended to delete this conntrack. */ - return 0; -} - -static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) -{ - nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); -} - /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { @@ -325,7 +309,7 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) /* We are protected by nfnl mutex. 
*/ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); - ctnl_untimeout(net, timeout); + nf_ct_untimeout(net, timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; @@ -573,7 +557,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) struct ctnl_timeout *cur, *tmp; nf_ct_unconfirmed_destroy(net); - ctnl_untimeout(net, NULL); + nf_ct_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { list_del_rcu(&cur->head); -- cgit v1.2.3 From 6c1fd7dc489d9bf64196f5b0fa33e059f64460c8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Aug 2018 17:14:15 +0200 Subject: netfilter: cttimeout: decouple timeout policy from nfnetlink_cttimeout object The timeout policy is currently embedded into the nfnetlink_cttimeout object, move the policy into an independent object. This allows us to reuse part of the existing conntrack timeout extension from nf_tables without adding dependencies with the nfnetlink_cttimeout object layout. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 22 ++++++++++------- net/netfilter/nf_conntrack_timeout.c | 6 ++--- net/netfilter/nfnetlink_cttimeout.c | 37 ++++++++++++++++------------ net/netfilter/xt_CT.c | 4 +-- 4 files changed, 39 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 7a21bc0f00eb..d5f62cc6c2ae 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -11,24 +11,28 @@ #define CTNL_TIMEOUT_NAME_MAX 32 +struct nf_ct_timeout { + __u16 l3num; + const struct nf_conntrack_l4proto *l4proto; + char data[0]; +}; + struct ctnl_timeout { struct list_head head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; - __u16 l3num; - const struct nf_conntrack_l4proto *l4proto; - char data[0]; + struct nf_ct_timeout timeout; }; struct nf_conn_timeout { - struct 
ctnl_timeout __rcu *timeout; + struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(struct nf_conn_timeout *t) { - struct ctnl_timeout *timeout; + struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) @@ -49,7 +53,7 @@ struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, - struct ctnl_timeout *timeout, + struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT @@ -83,7 +87,7 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); -void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout); +void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); #else static inline int nf_conntrack_timeout_init(void) { @@ -97,8 +101,8 @@ static inline void nf_conntrack_timeout_fini(void) #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -extern struct ctnl_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); -extern void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout); +extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); +extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout); #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 401c2cce4a61..91fbd183da2d 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -24,11 +24,11 @@ #include #include -struct ctnl_timeout * +struct nf_ct_timeout * (*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); -void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; +void 
(*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); static int untimeout(struct nf_conn *ct, void *timeout) @@ -42,7 +42,7 @@ static int untimeout(struct nf_conn *ct, void *timeout) return 0; } -void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout) +void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout) { nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index df53aef2d642..d46a236cdf31 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -113,13 +113,13 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, /* You cannot replace one timeout policy by another of * different kind, sorry. */ - if (matching->l3num != l3num || - matching->l4proto->l4proto != l4num) + if (matching->timeout.l3num != l3num || + matching->timeout.l4proto->l4proto != l4num) return -EINVAL; - return ctnl_timeout_parse_policy(&matching->data, - matching->l4proto, net, - cda[CTA_TIMEOUT_DATA]); + return ctnl_timeout_parse_policy(&matching->timeout.data, + matching->timeout.l4proto, + net, cda[CTA_TIMEOUT_DATA]); } return -EBUSY; @@ -140,14 +140,14 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, goto err_proto_put; } - ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net, + ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); - timeout->l3num = l3num; - timeout->l4proto = l4proto; + timeout->timeout.l3num = l3num; + timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); @@ -166,7 +166,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; unsigned int flags = 
portid ? NLM_F_MULTI : 0; - const struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -179,8 +179,9 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nfmsg->res_id = 0; if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || - nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || - nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || + nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, + htons(timeout->timeout.l3num)) || + nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; @@ -194,7 +195,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, + &timeout->timeout.data); if (ret < 0) goto nla_put_failure; @@ -308,8 +310,8 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. 
*/ list_del_rcu(&timeout->head); - nf_ct_l4proto_put(timeout->l4proto); - nf_ct_untimeout(net, timeout); + nf_ct_l4proto_put(timeout->timeout.l4proto); + nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; @@ -510,8 +512,11 @@ err: return matching; } -static void ctnl_timeout_put(struct ctnl_timeout *timeout) +static void ctnl_timeout_put(struct nf_ct_timeout *t) { + struct ctnl_timeout *timeout = + container_of(t, struct ctnl_timeout, timeout); + if (refcount_dec_and_test(&timeout->refcnt)) kfree_rcu(timeout, rcu_head); @@ -561,7 +566,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { list_del_rcu(&cur->head); - nf_ct_l4proto_put(cur->l4proto); + nf_ct_l4proto_put(cur->timeout.l4proto); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 7ba454e9e3fa..89457efd2e00 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -104,7 +104,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout) +static void __xt_ct_tg_timeout_put(struct nf_ct_timeout *timeout) { typeof(nf_ct_timeout_put_hook) timeout_put; @@ -121,7 +121,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT typeof(nf_ct_timeout_find_get_hook) timeout_find_get; const struct nf_conntrack_l4proto *l4proto; - struct ctnl_timeout *timeout; + struct nf_ct_timeout *timeout; struct nf_conn_timeout *timeout_ext; const char *errmsg = NULL; int ret = 0; -- cgit v1.2.3 From 7e0b2b57f01d183e1c84114f1f2287737358d748 Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Tue, 7 Aug 2018 17:14:23 +0200 Subject: netfilter: nft_ct: add ct timeout support This patch allows to add, list and delete connection tracking timeout policies via nft objref 
infrastructure and assigning these timeouts via nft rule. %./libnftnl/examples/nft-ct-timeout-add ip raw cttime tcp Ruleset: table ip raw { ct timeout cttime { protocol tcp; policy = {established: 111, close: 13 } } chain output { type filter hook output priority -300; policy accept; ct timeout set "cttime" } } %./libnftnl/examples/nft-rule-ct-timeout-add ip raw output cttime %conntrack -E [NEW] tcp 6 111 ESTABLISHED src=172.16.19.128 dst=172.16.19.1 sport=22 dport=41360 [UNREPLIED] src=172.16.19.1 dst=172.16.19.128 sport=41360 dport=22 %nft delete rule ip raw output handle %./libnftnl/examples/nft-ct-timeout-del ip raw cttime Joint work with Pablo Neira. Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 14 ++- net/netfilter/nft_ct.c | 204 ++++++++++++++++++++++++++++++- 2 files changed, 216 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 94657c701f22..e23290ffdc77 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -958,6 +958,7 @@ enum nft_socket_keys { * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address) * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) + * @NFT_CT_TIMEOUT: connection tracking timeout policy assigned to conntrack */ enum nft_ct_keys { NFT_CT_STATE, @@ -983,6 +984,7 @@ enum nft_ct_keys { NFT_CT_DST_IP, NFT_CT_SRC_IP6, NFT_CT_DST_IP6, + NFT_CT_TIMEOUT, __NFT_CT_MAX }; #define NFT_CT_MAX (__NFT_CT_MAX - 1) @@ -1411,6 +1413,15 @@ enum nft_ct_helper_attributes { }; #define NFTA_CT_HELPER_MAX (__NFTA_CT_HELPER_MAX - 1) +enum nft_ct_timeout_timeout_attributes { + NFTA_CT_TIMEOUT_UNSPEC, + NFTA_CT_TIMEOUT_L3PROTO, + NFTA_CT_TIMEOUT_L4PROTO, + NFTA_CT_TIMEOUT_DATA, + __NFTA_CT_TIMEOUT_MAX, +}; +#define NFTA_CT_TIMEOUT_MAX
(__NFTA_CT_TIMEOUT_MAX - 1) + #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 @@ -1418,7 +1429,8 @@ enum nft_ct_helper_attributes { #define NFT_OBJECT_LIMIT 4 #define NFT_OBJECT_CONNLIMIT 5 #define NFT_OBJECT_TUNNEL 6 -#define __NFT_OBJECT_MAX 7 +#define NFT_OBJECT_CT_TIMEOUT 7 +#define __NFT_OBJECT_MAX 8 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 3bc82ee5464d..4788458a0931 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include struct nft_ct { enum nft_ct_keys key:8; @@ -765,6 +767,194 @@ static struct nft_expr_type nft_notrack_type __read_mostly = { .owner = THIS_MODULE, }; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static int +nft_ct_timeout_parse_policy(void *timeouts, + const struct nf_conntrack_l4proto *l4proto, + struct net *net, const struct nlattr *attr) +{ + struct nlattr **tb; + int ret = 0; + + if (!l4proto->ctnl_timeout.nlattr_to_obj) + return 0; + + tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), + GFP_KERNEL); + + if (!tb) + return -ENOMEM; + + ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, + attr, l4proto->ctnl_timeout.nla_policy, + NULL); + if (ret < 0) + goto err; + + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + +err: + kfree(tb); + return ret; +} + +struct nft_ct_timeout_obj { + struct nf_conn *tmpl; + u8 l4proto; +}; + +static void nft_ct_timeout_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); + struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); + struct sk_buff *skb = pkt->skb; + + if (ct || + priv->l4proto != pkt->tprot) + return; + + nf_ct_set(skb, priv->tmpl, IP_CT_NEW); +} + +static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + 
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + struct nft_ct_timeout_obj *priv = nft_obj_data(obj); + const struct nf_conntrack_l4proto *l4proto; + struct nf_conn_timeout *timeout_ext; + struct nf_ct_timeout *timeout; + int l3num = ctx->family; + struct nf_conn *tmpl; + __u8 l4num; + int ret; + + if (!tb[NFTA_CT_TIMEOUT_L3PROTO] || + !tb[NFTA_CT_TIMEOUT_L4PROTO] || + !tb[NFTA_CT_TIMEOUT_DATA]) + return -EINVAL; + + l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); + priv->l4proto = l4num; + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + if (l4proto->l4proto != l4num) { + ret = -EOPNOTSUPP; + goto err_proto_put; + } + + timeout = kzalloc(sizeof(struct nf_ct_timeout) + + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); + if (timeout == NULL) { + ret = -ENOMEM; + goto err_proto_put; + } + + ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net, + tb[NFTA_CT_TIMEOUT_DATA]); + if (ret < 0) + goto err_free_timeout; + + timeout->l3num = l3num; + timeout->l4proto = l4proto; + tmpl = nf_ct_tmpl_alloc(ctx->net, zone, GFP_ATOMIC); + if (!tmpl) { + ret = -ENOMEM; + goto err_free_timeout; + } + + timeout_ext = nf_ct_timeout_ext_add(tmpl, timeout, GFP_ATOMIC); + if (!timeout_ext) { + ret = -ENOMEM; + goto err_free_tmpl; + } + + ret = nf_ct_netns_get(ctx->net, ctx->family); + if (ret < 0) + goto err_free_tmpl; + + priv->tmpl = tmpl; + + return 0; + +err_free_tmpl: + nf_ct_tmpl_free(tmpl); +err_free_timeout: + kfree(timeout); +err_proto_put: + nf_ct_l4proto_put(l4proto); + return ret; +} + +static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_ct_timeout_obj *priv = nft_obj_data(obj); + struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl); + struct nf_ct_timeout *timeout; + + timeout = rcu_dereference_raw(t->timeout); + nf_ct_untimeout(ctx->net, timeout); + nf_ct_l4proto_put(timeout->l4proto); + nf_ct_netns_put(ctx->net, ctx->family); 
+ nf_ct_tmpl_free(priv->tmpl); +} + +static int nft_ct_timeout_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); + const struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl); + const struct nf_ct_timeout *timeout = rcu_dereference_raw(t->timeout); + struct nlattr *nest_params; + int ret; + + if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || + nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num))) + return -1; + + nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_params) + return -1; + + ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); + if (ret < 0) + return -1; + nla_nest_end(skb, nest_params); + return 0; +} + +static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = { + [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 }, + [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 }, + [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED }, +}; + +static struct nft_object_type nft_ct_timeout_obj_type; + +static const struct nft_object_ops nft_ct_timeout_obj_ops = { + .type = &nft_ct_timeout_obj_type, + .size = sizeof(struct nft_ct_timeout_obj), + .eval = nft_ct_timeout_obj_eval, + .init = nft_ct_timeout_obj_init, + .destroy = nft_ct_timeout_obj_destroy, + .dump = nft_ct_timeout_obj_dump, +}; + +static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_TIMEOUT, + .ops = &nft_ct_timeout_obj_ops, + .maxattr = NFTA_CT_TIMEOUT_MAX, + .policy = nft_ct_timeout_policy, + .owner = THIS_MODULE, +}; +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) @@ -949,9 +1139,17 @@ static int __init nft_ct_module_init(void) err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; - +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + err = 
nft_register_obj(&nft_ct_timeout_obj_type); + if (err < 0) + goto err3; +#endif return 0; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err3: + nft_unregister_obj(&nft_ct_helper_obj_type); +#endif err2: nft_unregister_expr(&nft_notrack_type); err1: @@ -961,6 +1159,9 @@ err1: static void __exit nft_ct_module_exit(void) { +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + nft_unregister_obj(&nft_ct_timeout_obj_type); +#endif nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); @@ -974,3 +1175,4 @@ MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); -- cgit v1.2.3 From f699edb12a25a3dc8ecf72fe0a9b2fa42bd6a5da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Aug 2018 17:14:27 +0200 Subject: netfilter: nft_ct: enable conntrack for helpers Enable conntrack if the user defines a helper to be used from the ruleset policy. 
Fixes: 1a64edf54f55 ("netfilter: nft_ct: add helper set support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 4788458a0931..4855d4ce1c8f 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -963,6 +963,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; int family = ctx->family; + int err; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; @@ -1013,7 +1014,18 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, priv->helper4 = help4; priv->helper6 = help6; + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err < 0) + goto err_put_helper; + return 0; + +err_put_helper: + if (priv->helper4) + nf_conntrack_helper_put(priv->helper4); + if (priv->helper6) + nf_conntrack_helper_put(priv->helper6); + return err; } static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, @@ -1025,6 +1037,8 @@ static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); + + nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_helper_obj_eval(struct nft_object *obj, -- cgit v1.2.3 From 92e2c4053623f21d61a683f7ef7bd61c8300ac7d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 7 Aug 2018 17:36:00 +0200 Subject: flow_dissector: allow dissection of tunnel options from metadata Allow the existing 'dissection' of tunnel metadata to 'dissect' options already present in tunnel metadata. This dissection is controlled by a new dissector key, FLOW_DISSECTOR_KEY_ENC_OPTS. This dissection only occurs when skb_flow_dissect_tunnel_info() is called, currently only the Flower classifier makes that call. So there should be no impact on other users of the flow dissector. 
This is in preparation for allowing the flower classifier to match on Geneve options. Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 17 +++++++++++++++++ net/core/flow_dissector.c | 19 ++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 2a17f041f7a1..6a4586dcdede 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -57,6 +57,21 @@ struct flow_dissector_key_mpls { mpls_label:20; }; +#define FLOW_DIS_TUN_OPTS_MAX 255 +/** + * struct flow_dissector_key_enc_opts: + * @data: tunnel option data + * @len: length of tunnel option data + * @dst_opt_type: tunnel option type + */ +struct flow_dissector_key_enc_opts { + u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired + * here but seems difficult to #include + */ + u8 len; + __be16 dst_opt_type; +}; + struct flow_dissector_key_keyid { __be32 keyid; }; @@ -208,6 +223,8 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ + FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */ + FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 08a5184f4b34..ce9eeeb7c024 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -154,7 +154,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_IP)) + FLOW_DISSECTOR_KEY_ENC_IP) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); @@ -224,6 +226,21 @@ skb_flow_dissect_tunnel_info(const 
struct sk_buff *skb, ip->tos = key->tos; ip->ttl = key->ttl; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { + struct flow_dissector_key_enc_opts *enc_opt; + + enc_opt = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS, + target_container); + + if (info->options_len) { + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + enc_opt->dst_opt_type = info->key.tun_flags & + TUNNEL_OPTIONS_PRESENT; + } + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); -- cgit v1.2.3 From 0a6e77784f490912d81b92cfd48424541c04691e Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 7 Aug 2018 17:36:01 +0200 Subject: net/sched: allow flower to match tunnel options Allow matching on options in Geneve tunnel headers. This makes use of existing tunnel metadata support. The options can be described in the form CLASS:TYPE:DATA/CLASS_MASK:TYPE_MASK:DATA_MASK, where CLASS is represented as a 16bit hexadecimal value, TYPE as an 8bit hexadecimal value and DATA as a variable length hexadecimal value. e.g. # ip link add name geneve0 type geneve dstport 0 external # tc qdisc add dev geneve0 ingress # tc filter add dev geneve0 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ geneve_opts 0102:80:1122334421314151/ffff:ff:ffffffffffffffff \ ip_proto udp \ action mirred egress redirect dev eth1 This patch adds support for matching Geneve options in the order supplied by the user. This leads to an efficient implementation in the software datapath (and in our opinion hardware datapaths that offload this feature). It is also compatible with Geneve options matching provided by the Open vSwitch kernel datapath which is relevant here as the Flower classifier may be used as a mechanism to program flows into hardware as a form of Open vSwitch datapath offload (sometimes referred to as OVS-TC). 
The netlink Kernel/Userspace API may be extended, for example by adding a flag, if other matching options are desired, for example matching given options in any order. This would require an implementation in the TC software datapath. And be done in a way that drivers that facilitate offload of the Flower classifier can reject or accept such flows based on hardware datapath capabilities. This approach was discussed and agreed on at Netconf 2017 in Seoul. Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 26 +++++ net/sched/cls_flower.c | 244 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 269 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 48e5b5d49a34..be382fb0592d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -480,11 +480,37 @@ enum { TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + __TCA_FLOWER_MAX, }; #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a3b69bb6f4b0..9da244235170 100644 --- 
a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ struct fl_flow_key { struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; + struct flow_dissector_key_enc_opts enc_opts; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -482,6 +484,21 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, + .len = 128 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -603,6 +620,145 @@ static void fl_set_key_ip(struct nlattr **tb, bool encap, fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)); } +static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1]; + struct nlattr *class = NULL, *type = NULL, *data = NULL; + struct geneve_opt *opt; + int err, data_len = 0; + + if (option_len > sizeof(struct geneve_opt)) + data_len = option_len - sizeof(struct geneve_opt); + + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; + memset(opt, 0xff, option_len); + opt->length = 
data_len / 4; + opt->r1 = 0; + opt->r2 = 0; + opt->r3 = 0; + + /* If no mask has been prodived we assume an exact match. */ + if (!depth) + return sizeof(struct geneve_opt) + data_len; + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GENEVE) { + NL_SET_ERR_MSG(extack, "Non-geneve option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); + if (err < 0) + return err; + + /* We are not allowed to omit any of CLASS, TYPE or DATA + * fields from the key. + */ + if (!option_len && + (!tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] || + !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] || + !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA])) { + NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data"); + return -EINVAL; + } + + /* Omitting any of CLASS, TYPE or DATA fields is allowed + * for the mask. + */ + if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]) { + int new_len = key->enc_opts.len; + + data = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]; + data_len = nla_len(data); + if (data_len < 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); + return -ERANGE; + } + if (data_len % 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); + return -ERANGE; + } + + new_len += sizeof(struct geneve_opt) + data_len; + BUILD_BUG_ON(FLOW_DIS_TUN_OPTS_MAX != IP_TUNNEL_OPTS_MAX); + if (new_len > FLOW_DIS_TUN_OPTS_MAX) { + NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size"); + return -ERANGE; + } + opt->length = data_len / 4; + memcpy(opt->opt_data, nla_data(data), data_len); + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]) { + class = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(class); + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]) { + type = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(type); + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int 
fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; + int option_len, key_depth, msk_depth = 0; + + nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); + + if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { + nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + } + + nla_for_each_attr(nla_opt_key, nla_enc_key, + nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) { + switch (nla_type(nla_opt_key)) { + case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + option_len = fl_set_geneve_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + option_len = fl_set_geneve_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; + default: + NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); + return -EINVAL; + } + } + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -799,6 +955,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { + ret = fl_set_enc_opt(tb, key, mask, extack); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, 
&mask->control.flags); @@ -894,6 +1056,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); skb_flow_dissector_init(dissector, keys, cnt); } @@ -1414,6 +1578,83 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } +static int fl_dump_key_geneve_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct geneve_opt *opt; + struct nlattr *nest; + int opt_off = 0; + + nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); + if (!nest) + goto nla_put_failure; + + while (enc_opts->len > opt_off) { + opt = (struct geneve_opt *)&enc_opts->data[opt_off]; + + if (nla_put_be16(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, + opt->opt_class)) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, + opt->type)) + goto nla_put_failure; + if (nla_put(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, + opt->length * 4, opt->opt_data)) + goto nla_put_failure; + + opt_off += sizeof(struct geneve_opt) + opt->length * 4; + } + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct nlattr *nest; + int err; + + if (!enc_opts->len) + return 0; + + nest = nla_nest_start(skb, enc_opt_type); + if (!nest) + goto nla_put_failure; + + switch (enc_opts->dst_opt_type) { + case TUNNEL_GENEVE_OPT: + err = fl_dump_key_geneve_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; + default: + goto nla_put_failure; + } + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int 
fl_dump_key_enc_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *key_opts, + struct flow_dissector_key_enc_opts *msk_opts) +{ + int err; + + err = fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS, key_opts); + if (err) + return err; + + return fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS_MASK, msk_opts); +} + static int fl_dump_key(struct sk_buff *skb, struct net *net, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -1594,7 +1835,8 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)) || - fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip)) + fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip) || + fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts)) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) -- cgit v1.2.3 From 9ca6163005e6abdf2a39eb581abc6060f677a2d7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 6 Aug 2018 11:27:10 +0300 Subject: net: sched: cls_flower: set correct offload data in fl_reoffload fl_reoffload implementation sets following members of struct tc_cls_flower_offload incorrectly: - masked key instead of mask - key instead of masked key Fix fl_reoffload to provide correct data to offload callback. Fixes: 31533cba4327 ("net: sched: cls_flower: implement offload tcf_proto_op") Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9da244235170..6fd9bdd93796 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1338,8 +1338,8 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long)f; cls_flower.dissector = &mask->dissector; - cls_flower.mask = &f->mkey; - cls_flower.key = &f->key; + cls_flower.mask = &mask->key; + cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; -- cgit v1.2.3 From 3789cabaab1a939eb56edd76bbde2c2e49f081da Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 6 Aug 2018 15:00:59 +0300 Subject: ip6_tunnel: collect_md xmit: Use ip_tunnel_key's provided src address When using an ip6tnl device in collect_md mode, the xmit methods ignore the ipv6.src field present in skb_tunnel_info's key, both for route calculation purposes (flowi6 construction) and for assigning the packet's final ipv6h->saddr. This makes it impossible to specify a desired ipv6 local address in the encapsulating header (for example, when using tc action tunnel_key). This is also not aligned with the behavior of ipip (ipv4) in collect_md mode, where the key->u.ipv4.src gets used. Fix this by assigning fl6.saddr the given key->u.ipv6.src. In case ipv6.src is not specified, ip6_tnl_xmit uses existing saddr selection code. Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") Signed-off-by: Shmulik Ladkani Reviewed-by: Eyal Birger Signed-off-by: David S.
Miller --- net/ipv6/ip6_tunnel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 00e138a44cbb..820cebe0c687 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1113,7 +1113,7 @@ route_lookup: dst = NULL; goto tx_err_link_failure; } - if (t->parms.collect_md && + if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; @@ -1255,6 +1255,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPIP; + fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; @@ -1326,6 +1327,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPV6; + fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; -- cgit v1.2.3 From 5941923da29e84bc9e2a1abb2c14fffaf8d71e2f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 7 Aug 2018 19:34:16 +0800 Subject: RDS: IB: fix 'passing zero to ERR_PTR()' warning Fix a static code checker warning: net/rds/ib_frmr.c:82 rds_ib_alloc_frmr() warn: passing zero to 'ERR_PTR' The error path for ib_alloc_mr failure should set err to PTR_ERR. Fixes: 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") Signed-off-by: YueHaibing Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_frmr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index d152e48ea371..8596eed6d9a8 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -61,6 +61,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, pool->fmr_attr.max_pages); if (IS_ERR(frmr->mr)) { pr_warn("RDS/IB: %s failed to allocate MR", __func__); + err = PTR_ERR(frmr->mr); goto out_no_cigar; } -- cgit v1.2.3 From fb3b467e067385748445c5295fdbde548b631cb2 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 7 Aug 2018 19:20:08 +0800 Subject: net:af_iucv: get rid of the unneeded variable 'err' in afiucv_pm_freeze We will not use the variable 'err' after initalization, So remove it and return 0. Signed-off-by: zhong jiang Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 92ee91e34395..a21d8ed0a325 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -150,7 +150,6 @@ static int afiucv_pm_freeze(struct device *dev) { struct iucv_sock *iucv; struct sock *sk; - int err = 0; #ifdef CONFIG_PM_DEBUG printk(KERN_WARNING "afiucv_pm_freeze\n"); @@ -175,7 +174,7 @@ static int afiucv_pm_freeze(struct device *dev) skb_queue_purge(&iucv->backlog_skb_q); } read_unlock(&iucv_sk_list.lock); - return err; + return 0; } /** -- cgit v1.2.3 From 5a0c6cee1767a551dacfa6e266ac4795bba0555e Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 7 Aug 2018 19:20:09 +0800 Subject: net:mod: remove unneeded variable 'ret' in init_p9 The ret is modified after initalization, so just remove it and return 0. Signed-off-by: zhong jiang Signed-off-by: David S. 
Miller --- net/9p/mod.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/mod.c b/net/9p/mod.c index eb9777f05755..253ba824a325 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -171,13 +171,11 @@ void v9fs_put_trans(struct p9_trans_module *m) */ static int __init init_p9(void) { - int ret = 0; - p9_error_init(); pr_info("Installing 9P2000 support\n"); p9_trans_fd_init(); - return ret; + return 0; } /** -- cgit v1.2.3 From e7ea2a52ffaf60a211edc0df97dcf194d1257714 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 8 Aug 2018 03:23:44 +0000 Subject: netfilter: nfnetlink_osf: fix using plain integer as NULL warning Fixes the following sparse warning: net/netfilter/nfnetlink_osf.c:274:24: warning: Using plain integer as NULL pointer Signed-off-by: Wei Yongjun Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f9dba62c450f..00db27dfd2ff 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -271,7 +271,7 @@ const char *nf_osf_find(const struct sk_buff *skb, tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); if (!tcp) - return false; + return NULL; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; -- cgit v1.2.3 From eb95f52fc72d15566fe06807c9e0cabbcd3480f1 Mon Sep 17 00:00:00 2001 From: Maria Pasechnik Date: Wed, 8 Aug 2018 11:46:30 +0300 Subject: net: ipv6_gre: Fix GRO to work on IPv6 over GRE tap IPv6 GRO over GRE tap is not working while GRO is not set over the native interface. gro_list_prepare function updates the same_flow variable of existing sessions to 1 if their mac headers match the one of the incoming packet. same_flow is used to filter out non-matching sessions and keep potential ones for aggregation. The number of bytes to compare should be the number of bytes in the mac headers. 
In gro_list_prepare this number is set to be skb->dev->hard_header_len. For GRE interfaces this hard_header_len should be as it is set in the initialization process (when GRE is created), it should not be overridden. But currently it is being overridden by the value that is actually supposed to represent the needed_headroom. Therefore, the number of bytes compared in order to decide whether the mac headers are the same is greater than the length of the headers. As it's documented in netdevice.h, hard_header_len is the maximum hardware header length, and needed_headroom is the extra headroom the hardware may need. hard_header_len is basically all the bytes received by the physical till layer 3 header of the packet received by the interface. For example, if the interface is a GRE tap then the needed_headroom should be the total length of the following headers: IP header of the physical, GRE header, mac header of GRE. It is often used to calculate the MTU of the created interface. This patch removes the override of the hard_header_len, and assigns the calculated value to needed_headroom. This way, the comparison in gro_list_prepare is really of the mac headers, and if the packets have the same mac headers the same_flow will be set to 1. Performance testing: 45% higher bandwidth. Measuring bandwidth of single-stream IPv4 TCP traffic over IPv6 GRE tap while GRO is not set on the native. NIC: ConnectX-4LX Before (GRO not working) : 7.2 Gbits/sec After (GRO working): 10.5 Gbits/sec Signed-off-by: Maria Pasechnik Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index fc7dd3a04360..18a3794b0f52 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1129,7 +1129,7 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, return; if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + + dev->needed_headroom = rt->dst.dev->hard_header_len + t_hlen; if (set_mtu) { @@ -1155,7 +1155,7 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; + tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; return t_hlen; } @@ -1825,7 +1825,7 @@ static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel) erspan_hdr_len(tunnel->parms.erspan_ver); t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; + tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; return t_hlen; } -- cgit v1.2.3 From 0bab1cdc8c4d5392a5d9037cfbdbb2cacc91ce5e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Aug 2018 19:59:32 +0800 Subject: decnet: fix using plain integer as NULL warning Fixes the following sparse warning: net/decnet/dn_route.c:407:30: warning: Using plain integer as NULL pointer net/decnet/dn_route.c:1923:22: warning: Using plain integer as NULL pointer Signed-off-by: YueHaibing Reviewed-by: Kees Cook Signed-off-by: David S. 
Miller --- net/decnet/dn_route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 3107a2e24e6b..1c002c0fb712 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -404,7 +404,7 @@ void dn_rt_cache_flush(int delay) if (delay <= 0) { spin_unlock_bh(&dn_rt_flush_lock); - dn_run_flush(0); + dn_run_flush(NULL); return; } @@ -1920,7 +1920,7 @@ void __init dn_route_init(void) void __exit dn_route_cleanup(void) { del_timer(&dn_route_timer); - dn_run_flush(0); + dn_run_flush(NULL); remove_proc_entry("decnet_cache", init_net.proc_net); dst_entries_destroy(&dn_dst_ops); -- cgit v1.2.3 From 63cc5bcc9fc467bbe61cc9ee52509294bdf04c4b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 8 Aug 2018 14:04:13 +0200 Subject: net: sched: fix block->refcnt decrement Currently the refcnt is never decremented in case the value is not 1. Fix it by adding decrement in case the refcnt is not 1. Reported-by: Vlad Buslov Fixes: f71e0ca4db18 ("net: sched: Avoid implicit chain 0 creation") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 194c2e0b2737..f922ce27ed5e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -780,6 +780,8 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, block->refcnt--; if (list_empty(&block->chain_list)) kfree(block); + } else { + block->refcnt--; } } EXPORT_SYMBOL(tcf_block_put_ext); -- cgit v1.2.3 From 4d99f6602cb552fb58db0c3b1d935bb6fa017f24 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 8 Aug 2018 20:07:35 -0700 Subject: net: allow to call netif_reset_xps_queues() under cpus_read_lock The definition of static_key_slow_inc() has cpus_read_lock in place. 
In the virtio_net driver, XPS queues are initialized after setting the queue:cpu affinity in virtnet_set_affinity() which is already protected within cpus_read_lock. Lockdep prints a warning when we are trying to acquire cpus_read_lock when it is already held. This patch adds an ability to call __netif_set_xps_queue under cpus_read_lock(). Acked-by: Jason Wang ============================================ WARNING: possible recursive locking detected 4.18.0-rc3-next-20180703+ #1 Not tainted -------------------------------------------- swapper/0/1 is trying to acquire lock: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: static_key_slow_inc+0xe/0x20 but task is already holding lock: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: init_vqs+0x513/0x5a0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(cpu_hotplug_lock.rw_sem); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by swapper/0/1: #0: 00000000244bc7da (&dev->mutex){....}, at: __driver_attach+0x5a/0x110 #1: 00000000cf973d46 (cpu_hotplug_lock.rw_sem){++++}, at: init_vqs+0x513/0x5a0 #2: 000000005cd8463f (xps_map_mutex){+.+.}, at: __netif_set_xps_queue+0x8d/0xc60 v2: move cpus_read_lock() out of __netif_set_xps_queue() Cc: "Nambiar, Amritha" Cc: "Michael S. Tsirkin" Cc: Jason Wang Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue") Signed-off-by: Andrei Vagin Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 4 +++- net/core/dev.c | 20 +++++++++++++++----- net/core/net-sysfs.c | 4 ++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 62311dde6e71..39a7f4452587 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1903,9 +1903,11 @@ static void virtnet_set_affinity(struct virtnet_info *vi) i = 0; for_each_online_cpu(cpu) { + const unsigned long *mask = cpumask_bits(cpumask_of(cpu)); + virtqueue_set_affinity(vi->rq[i].vq, cpu); virtqueue_set_affinity(vi->sq[i].vq, cpu); - netif_set_xps_queue(vi->dev, cpumask_of(cpu), i); + __netif_set_xps_queue(vi->dev, mask, i, false); i++; } diff --git a/net/core/dev.c b/net/core/dev.c index f68122f0ab02..325fc5088370 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2176,6 +2176,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (!static_key_false(&xps_needed)) return; + cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) { @@ -2199,10 +2200,11 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, out_no_maps: if (static_key_enabled(&xps_rxqs_needed)) - static_key_slow_dec(&xps_rxqs_needed); + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - static_key_slow_dec(&xps_needed); + static_key_slow_dec_cpuslocked(&xps_needed); mutex_unlock(&xps_map_mutex); + cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) @@ -2250,6 +2252,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, return new_map; } +/* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, bool is_rxqs_map) { @@ -2317,9 +2320,9 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!new_dev_maps) goto out_no_new_maps; - static_key_slow_inc(&xps_needed); + 
static_key_slow_inc_cpuslocked(&xps_needed); if (is_rxqs_map) - static_key_slow_inc(&xps_rxqs_needed); + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { @@ -2448,11 +2451,18 @@ error: kfree(new_dev_maps); return -ENOMEM; } +EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { - return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + int ret; + + cpus_read_lock(); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + cpus_read_unlock(); + + return ret; } EXPORT_SYMBOL(netif_set_xps_queue); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0a95bcf64cdc..bd67c4d0fcfd 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -1400,7 +1401,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); + cpus_read_unlock(); + kfree(mask); return err ? : len; } -- cgit v1.2.3 From eb91e4d4db06adef06e7f50c02813c13c6ca5a5b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 10 Aug 2018 11:28:02 +0200 Subject: Revert "xdp: add NULL pointer check in __xdp_return()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 36e0f12bbfd3016f495904b35e41c5711707509f. The reverted commit adds a WARN to check against NULL entries in the mem_id_ht rhashtable. Any kernel path implementing the XDP (generic or driver) fast path is required to make a paired xdp_rxq_info_reg/xdp_rxq_info_unreg call for proper function. In addition, a driver using a different allocation scheme than the default MEM_TYPE_PAGE_SHARED is required to additionally call xdp_rxq_info_reg_mem_model. 
For MEM_TYPE_ZERO_COPY, an xdp_rxq_info_reg_mem_model call ensures that the mem_id_ht rhashtable has a properly inserted allocator id. If not, this would be a driver bug. A NULL pointer kernel OOPS is preferred to the WARN. Suggested-by: Jesper Dangaard Brouer Signed-off-by: Björn Töpel Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/xdp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/xdp.c b/net/core/xdp.c index c013b836006b..57285383ed00 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -348,8 +348,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, rcu_read_lock(); /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - if (!WARN_ON_ONCE(!xa)) - xa->zc_alloc->free(xa->zc_alloc, handle); + xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ -- cgit v1.2.3 From b0768a86585d4d951a30ff565f19598dbbd67897 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:09 +0900 Subject: net: Export skb_headers_offset_update This is needed for veth XDP which does skb_copy_expand()-like operation. v2: - Drop skb_copy_header part because it has already been exported now. 
Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ebdf158a795..e93b157f526c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); +void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8d574a88125d..c996c09d095f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1291,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); -static void skb_headers_offset_update(struct sk_buff *skb, int off) +void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -1305,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_network_header += off; skb->inner_mac_header += off; } +EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { -- cgit v1.2.3 From 0b19cc0a8694d4295383f88bc3441765875a57bc Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:15 +0900 Subject: bpf: Make redirect_info accessible from modules We are going to add kern_flags field in redirect_info for kernel internal use. In order to avoid function call to access the flags, make redirect_info accessible from modules. Also as it is now non-static, add prefix bpf_ to redirect_info. 
v6: - Fix sparse warning around EXPORT_SYMBOL. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 10 ++++++++++ net/core/filter.c | 29 +++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..4717af8b95e6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -537,6 +537,16 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) diff --git a/net/core/filter.c b/net/core/filter.c index 587bbfbd7db3..2de7dd9f2a57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2082,19 +2082,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2107,7 +2100,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = 
dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3200,7 +3193,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3245,7 +3238,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3285,7 +3278,7 @@ err: int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3317,7 +3310,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3368,7 +3361,7 @@ err: int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3399,7 +3392,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri 
= this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3423,7 +3416,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, unsigned long, map_owner) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; -- cgit v1.2.3 From 2539650fadbf63a431e76535a9de7bff6ea5e409 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:16 +0900 Subject: xdp: Helpers for disabling napi_direct of xdp_return_frame We need some mechanism to disable napi_direct on calling xdp_return_frame_rx_napi() from some context. When veth gets support of XDP_REDIRECT, it will redirects packets which are redirected from other devices. On redirection veth will reuse xdp_mem_info of the redirection source device to make return_frame work. But in this case .ndo_xdp_xmit() called from veth redirection uses xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit() is not called directly from the rxq which owns the xdp_mem_info. This approach introduces a flag in bpf_redirect_info to indicate that napi_direct should be disabled even when _rx_napi variant is used as well as helper functions to use it. A NAPI handler who wants to use this flag needs to call xdp_set_return_frame_no_direct() before processing packets, and call xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before exiting NAPI. v4: - Use bpf_redirect_info for storing the flag instead of xdp_mem_info to avoid per-frame copy cost. 
Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 25 +++++++++++++++++++++++++ net/core/xdp.c | 6 ++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4717af8b95e6..2b072dab32c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,10 +543,14 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; unsigned long map_owner; + u32 kern_flags; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 57285383ed00..3dd99e1c04f5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = 
rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: -- cgit v1.2.3 From aa12af77aae05008b3e637b85944dcd512f75eba Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Tue, 7 Aug 2018 13:16:35 +0530 Subject: Bluetooth: Add definitions for LE set address resolution Add the definitions for LE address resolution enable HCI commands. When the LE address resolution enable gets changed via HCI commands make sure that flag gets updated. Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 3 +++ net/bluetooth/hci_event.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4619a79b1bbb..cdd9f1fe7cfa 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -269,6 +269,7 @@ enum { HCI_VENDOR_DIAG, HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, + HCI_LL_RPA_RESOLUTION, __HCI_NUM_FLAGS, }; @@ -1524,6 +1525,8 @@ struct hci_rp_le_read_resolv_list_size { __u8 size; } __packed; +#define HCI_OP_LE_SET_ADDR_RESOLV_ENABLE 0x202d + #define HCI_OP_LE_READ_MAX_DATA_LEN 0x202f struct hci_rp_le_read_max_data_len { __u8 status; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 8078587572fe..f12555f23a49 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1480,6 +1480,30 @@ static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, hdev->le_resolv_list_size = rp->size; } +static void hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 *sent, status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, 
HCI_OP_LE_SET_ADDR_RESOLV_ENABLE); + if (!sent) + return; + + hci_dev_lock(hdev); + + if (*sent) + hci_dev_set_flag(hdev, HCI_LL_RPA_RESOLUTION); + else + hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION); + + hci_dev_unlock(hdev); +} + static void hci_cc_le_read_max_data_len(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3263,6 +3287,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_read_resolv_list_size(hdev, skb); break; + case HCI_OP_LE_SET_ADDR_RESOLV_ENABLE: + hci_cc_le_set_addr_resolution_enable(hdev, skb); + break; + case HCI_OP_LE_READ_MAX_DATA_LEN: hci_cc_le_read_max_data_len(hdev, skb); break; -- cgit v1.2.3 From 0d86caff06363151df21603eb1f4e3207ea91bd2 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 10 Aug 2018 17:45:11 +0200 Subject: net/smc: send response to test link signal With SMC-D z/OS sends a test link signal every 10 seconds. Linux is supposed to answer, otherwise the SMC-D connection breaks. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_ism.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'net') diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index cfade7fdcc6d..e36f21ce7252 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -184,6 +184,37 @@ struct smc_ism_event_work { struct smcd_event event; }; +#define ISM_EVENT_REQUEST 0x0001 +#define ISM_EVENT_RESPONSE 0x0002 +#define ISM_EVENT_REQUEST_IR 0x00000001 +#define ISM_EVENT_CODE_TESTLINK 0x83 + +static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) +{ + union { + u64 info; + struct { + u32 uid; + unsigned short vlanid; + u16 code; + }; + } ev_info; + + switch (wrk->event.code) { + case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ + ev_info.info = wrk->event.info; + if (ev_info.code == ISM_EVENT_REQUEST) { + ev_info.code = ISM_EVENT_RESPONSE; + wrk->smcd->ops->signal_event(wrk->smcd, + wrk->event.tok, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } + break; + } +} + /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { @@ -196,6 +227,9 @@ static void smc_ism_event_work(struct work_struct *work) break; case ISM_EVENT_DMB: break; + case ISM_EVENT_SWR: /* Software defined event */ + smcd_handle_sw_event(wrk); + break; } kfree(wrk); } -- cgit v1.2.3 From 40a1227ea845a37ab197dd1caffb60b047fa36b1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:21 -0700 Subject: tcp: Avoid TCP syncookie rejected by SO_REUSEPORT socket Although the actual cookie check "__cookie_v[46]_check()" does not involve sk specific info, it checks whether the sk has recent synq overflow event in "tcp_synq_no_recent_overflow()". The tcp_sk(sk)->rx_opt.ts_recent_stamp is updated every second when it has sent out a syncookie (through "tcp_synq_overflow()"). The above per sk "recent synq overflow event timestamp" works well for non SO_REUSEPORT use case. 
However, it may cause random connection request reject/discard when SO_REUSEPORT is used with syncookie because it fails the "tcp_synq_no_recent_overflow()" test. When SO_REUSEPORT is used, it usually has multiple listening socks serving TCP connection requests destined to the same local IP:PORT. There are cases where the TCP-ACK-COOKIE may not be received by the same sk that sent out the syncookie. For example, if reuse->socks[] began with {sk0, sk1}, 1) sk1 sent out syncookies and tcp_sk(sk1)->rx_opt.ts_recent_stamp was updated. 2) the reuse->socks[] became {sk1, sk2} later. e.g. sk0 was first closed and then sk2 was added. Here, sk2 does not have ts_recent_stamp set. There are other orderings that will trigger a similar situation below but the idea is the same. 3) When the TCP-ACK-COOKIE comes back, sk2 was selected. "tcp_synq_no_recent_overflow(sk2)" returns true. In this case, all syncookies sent by sk1 will be handled (and rejected) by sk2 while sk1 is still alive. The userspace may create and remove listening SO_REUSEPORT sockets as it sees fit. E.g. Adding new thread (and SO_REUSEPORT sock) to handle incoming requests, old process stopping and new process starting...etc. With or without SO_ATTACH_REUSEPORT_[CB]BPF, the sockets leaving and joining a reuseport group makes picking the same sk to check the syncookie very difficult (if not impossible). The later patches will allow bpf prog more flexibility in deciding where a sk should be located in a bpf map and selecting a particular SO_REUSEPORT sock as it sees fit. e.g. Without closing any sock, replace the whole bpf reuseport_array in one map_update() by using map-in-map. Getting the syncookie check working smoothly across socks in the same "reuse->socks[]" is important. 
A partial solution is to set the newly added sk's ts_recent_stamp to the max ts_recent_stamp of a reuseport group but that would require iterating through reuse->socks[] OR pessimistically set it to "now - TCP_SYNCOOKIE_VALID" when a sk is joining a reuseport group. However, neither of them will solve the existing sk getting moved around the reuse->socks[] and that sk may not have ts_recent_stamp updated, unlikely under continuous synflood but not impossible. This patch opts to treat the reuseport group as a whole when considering the last synq overflow timestamp since they are serving the same IP:PORT from the userspace (and BPF program) perspective. "synq_overflow_ts" is added to "struct sock_reuseport". The tcp_synq_overflow() and tcp_synq_no_recent_overflow() will update/check reuse->synq_overflow_ts if the sk is in a reuseport group. Similar to the reuseport decision in __inet_lookup_listener(), both sk->sk_reuseport and sk->sk_reuseport_cb are tested for SO_REUSEPORT usage. Updates to "synq_overflow_ts" happen roughly once every second. A synflood test was done with 16 rx-queues and 16 reuseport sockets. No meaningful performance change is observed. The rate both before and after the change is ~9Mpps in IPv4. Cc: Eric Dumazet Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/sock_reuseport.h | 4 ++++ include/net/tcp.h | 30 ++++++++++++++++++++++++++++-- net/core/sock_reuseport.c | 1 + 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 0054b3a9b923..6bef7a0052f2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -12,6 +12,10 @@ struct sock_reuseport { u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ + /* The last synq overflow event timestamp of this + * reuse->socks[] group.
+ */ + unsigned int synq_overflow_ts; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; diff --git a/include/net/tcp.h b/include/net/tcp.h index d769dc20359b..d196901c9dba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -473,9 +474,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + if (time_after32(now, last_overflow + HZ)) + WRITE_ONCE(reuse->synq_overflow_ts, now); + return; + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } @@ -483,9 +497,21 @@ static inline void tcp_synq_overflow(const struct sock *sk) /* syncookies: no recent synqueue overflow on this listening socket? 
*/ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + return time_after32(now, last_overflow + + TCP_SYNCOOKIE_VALID); + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 064acb04be0f..3f188fad0162 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -81,6 +81,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); for (i = 0; i < reuse->num_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, -- cgit v1.2.3 From 736b46027eb4a4c602d3b8b93d2f48c9facbd915 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:22 -0700 Subject: net: Add ID (if needed) to sock_reuseport and expose reuseport_lock A later patch will introduce a BPF_MAP_TYPE_REUSEPORT_ARRAY which allows a SO_REUSEPORT sk to be added to a bpf map. When a sk is removed from reuse->socks[], it also needs to be removed from the bpf map. Also, when adding a sk to a bpf map, the bpf map needs to ensure it is indeed in a reuse->socks[]. Hence, reuseport_lock is needed by the bpf map to ensure its map_update_elem() and map_delete_elem() operations are in-sync with the reuse->socks[]. The BPF_MAP_TYPE_REUSEPORT_ARRAY map will only acquire the reuseport_lock after ensuring the adding sk is already in a reuseport group (i.e. reuse->socks[]). The map_lookup_elem() will be lockless. 
This patch also adds an ID to sock_reuseport. A later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT which allows a bpf prog to select a sk from a bpf map. It is inflexible to statically enforce a bpf map can only contain the sk belonging to a particular reuse->socks[] (i.e. same IP:PORT) during the bpf verification time. For example, think about the map-in-map situation where the inner map can be dynamically changed in runtime and the outer map may have inner maps belonging to different reuseport groups. Hence, when the bpf prog (in the new BPF_PROG_TYPE_SK_REUSEPORT type) selects a sk, this selected sk has to be checked to ensure it belongs to the requesting reuseport group (i.e. the group serving that IP:PORT). The "sk->sk_reuseport_cb" pointer cannot be used for this checking purpose because the pointer value will change after reuseport_grow(). Instead of saving all checking conditions like the ones preceding the call to "reuseport_add_sock()" and comparing them every time a bpf_prog is run, a 32bits ID is introduced to survive the reuseport_grow(). The ID is only acquired if any of the reuse->socks[] is added to the newly introduced "BPF_MAP_TYPE_REUSEPORT_ARRAY" map. If "BPF_MAP_TYPE_REUSEPORT_ARRAY" is not used, the changes in this patch are a no-op. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/sock_reuseport.h | 6 ++++++ net/core/sock_reuseport.c | 27 ++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 6bef7a0052f2..e1a7681856f7 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -5,8 +5,11 @@ #include #include #include +#include #include +extern spinlock_t reuseport_lock; + struct sock_reuseport { struct rcu_head rcu; @@ -16,6 +19,8 @@ struct sock_reuseport { * reuse->socks[] group.
*/ unsigned int synq_overflow_ts; + /* ID stays the same even after the size of socks[] grows. */ + unsigned int reuseport_id; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; @@ -29,5 +34,6 @@ extern struct sock *reuseport_select_sock(struct sock *sk, int hdr_len); extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); +int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3f188fad0162..cf2e4d305af9 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -8,11 +8,33 @@ #include #include +#include #include #define INIT_SOCKS 128 -static DEFINE_SPINLOCK(reuseport_lock); +DEFINE_SPINLOCK(reuseport_lock); + +#define REUSEPORT_MIN_ID 1 +static DEFINE_IDA(reuseport_ida); + +int reuseport_get_id(struct sock_reuseport *reuse) +{ + int id; + + if (reuse->reuseport_id) + return reuse->reuseport_id; + + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, + /* Called under reuseport_lock */ + GFP_ATOMIC); + if (id < 0) + return id; + + reuse->reuseport_id = id; + + return reuse->reuseport_id; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -78,6 +100,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -102,6 +125,8 @@ static void reuseport_free_rcu(struct rcu_head *head) reuse = container_of(head, struct sock_reuseport, rcu); if (reuse->prog) bpf_prog_destroy(reuse->prog); + if (reuse->reuseport_id) + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } -- cgit v1.2.3 From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 
00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:24 -0700 Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decisions. In this case, decide which SO_REUSEPORT sk to serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control and visibility on where a SO_REUSEPORT sk should be located in a bpf map. The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sk serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection info contract/API stays within the userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. The old process can finish draining the pending requests (e.g. by "accept()") before closing the old-fds. [Note that deleting a fd from a bpf map does not necessarily mean the fd is closed] During map_update_elem(), only SO_REUSEPORT sk (i.e. which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that is "bind()" for UDP or "bind()+listen()" for TCP.
These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage in the BPF side. When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"), it will notify the bpf map to remove it from the map also. It is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->sock[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in-sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the adding sk passes some strict tests, and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64bits but it goes against the logical userspace expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also). It may catch users by surprise if we enforce value_size=8 while userspace still passes a 32bits fd during update. Supporting different value_size between lookup and update seems unintuitive also. We also need to consider what if other existing fd based maps want to return 64bits value from syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, and assuming the user will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will only return 64bits value from sock_gen_cookie() when the user consciously chooses value_size=8 (as a signal that lookup is desired) which then requires a 64bits value in both lookup and update.
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 28 ++++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/arraymap.c | 2 +- kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + net/core/sock_reuseport.c | 8 + 8 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/reuseport_array.c (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..db11662faea6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto 
bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..14fd6c02d258 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd5758dc35d3..40f584bc7da0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e8906cbad81f..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030c77..f6ca3e712831 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include +#include +#include +#include + +struct reuseport_array { + struct bpf_map map; + struct sock 
__rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. + */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + 
struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. 
+ */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. 
listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. + */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. 
+ */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5af4e9e2722d..57f4d076141b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index cf2e4d305af9..8235f2439816 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -186,6 +186,14 @@ void reuseport_detach_sock(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* At least one of the sk in this reuseport group is added to + * a bpf map. Notify the bpf side. 
The bpf map logic will + * remove the sk if it is indeed added to a bpf map. + */ + if (reuse->reuseport_id) + bpf_sk_reuseport_detach(sk); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); for (i = 0; i < reuse->num_socks; i++) { -- cgit v1.2.3 From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not clear if the lower layer is only ipv4 and ipv6 in the future and will it not touch the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above reason, if sk->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 bytes cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and setting the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also inline with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). 
In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantics similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bound to a local INANY address which cannot be learned from skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is being added to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked in run time in "sk_select_reuseport()". Doing the check in verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. reuseport_id does not match). The bpf prog can decide if it wants to do SK_DROP at its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it does, it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch.
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 3 + include/linux/filter.h | 15 +++ include/net/addrconf.h | 1 + include/net/sock_reuseport.h | 6 +- include/uapi/linux/bpf.h | 36 +++++- kernel/bpf/verifier.c | 9 ++ net/core/filter.c | 269 +++++++++++++++++++++++++++++++++++++++- net/core/sock_reuseport.c | 20 ++- net/ipv4/inet_connection_sock.c | 9 ++ net/ipv4/inet_hashtables.c | 5 +- net/ipv4/udp.c | 5 +- 11 files changed, 365 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 14fd6c02d258..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b072dab32c0..70e9d57677fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. 
Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5f43f7a70fe6..6def0351bcc3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index e1a7681856f7..73b569556be6 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -21,12 +21,14 @@ struct sock_reuseport { unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. 
*/ unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40f584bc7da0..3102a2a23c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -151,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2114,6 +2115,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2197,7 +2206,8 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2414,6 +2424,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. 
+ */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 2de7dd9f2a57..142595b4e0d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; if 
(sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -7013,3 +7013,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. 
+ */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type 
= ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define 
SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. 
+ */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 8235f2439816..d260167f5f77 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. 
+ */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -136,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3647167c8fa3..370e24463fb7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -567,10 
+567,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 060e841dde40..038dd7909051 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** -- cgit v1.2.3 From 8217ca653ec601246832d562207bc24bdf652d2f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:26 -0700 Subject: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in the earlier patch. "bpf_run_sk_reuseport()" will return -ECONNREFUSED when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP. The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to handle this case and return NULL immediately instead of continuing the sk search from its hashtable. It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach BPF_PROG_TYPE_SK_REUSEPORT. The "sk_reuseport_attach_bpf()" will check if the attaching bpf prog is in the new SK_REUSEPORT or the existing SOCKET_FILTER type and then check different things accordingly. One level of "__reuseport_attach_prog()" call is removed. 
The "sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed back to "reuseport_attach_prog()" in sock_reuseport.c. sock_reuseport.c seems to have more knowledge on those test requirements than filter.c. In "reuseport_attach_prog()", after new_prog is attached to reuse->prog, the old_prog (if any) is also directly freed instead of returning the old_prog to the caller and asking the caller to free. The sysctl_optmem_max check is moved back to the "sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()". As of other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only bounded by the usual "bpf_prog_charge_memlock()" during load time instead of bounded by both bpf_prog_charge_memlock and sysctl_optmem_max. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock_reuseport.h | 3 +- net/core/filter.c | 87 ++++++++++++++++++++++++++------------------ net/core/sock_reuseport.c | 36 +++++++++++++----- net/ipv4/inet_hashtables.c | 14 ++++--- net/ipv4/udp.c | 4 ++ net/ipv6/inet6_hashtables.c | 14 ++++--- net/ipv6/udp.c | 4 ++ 8 files changed, 106 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e9d57677fe..5d565c50bcb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -753,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 73b569556be6..8a5f70c7cdf2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ 
-34,8 +34,7 @@ extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); -extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, - struct bpf_prog *prog); +extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 142595b4e0d1..22906b31d43f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) -{ - struct bpf_prog *old_prog; - int err; - - if (bpf_prog_size(prog->len) > sysctl_optmem_max) - return -ENOMEM; - - if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk, false); - if (err) - return err; - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { - /* The socket wasn't bound with SO_REUSEPORT */ - return -EINVAL; - } - - old_prog = reuseport_attach_prog(sk, prog); - if (old_prog) - bpf_prog_destroy(old_prog); - - return 0; -} - static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) __bpf_prog_release(prog); - return err; - } - return 0; + return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) @@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog = __get_bpf(ufd, sk); + struct bpf_prog *prog; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, 
BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - bpf_prog_put(prog); - return err; + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } } - return 0; + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); } struct bpf_scratchpad { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index d260167f5f77..ba5cba56f574 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #define INIT_SOCKS 128 @@ -133,8 +134,7 @@ static void reuseport_free_rcu(struct rcu_head *head) struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); if (reuse->reuseport_id) ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); @@ -219,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk) } EXPORT_SYMBOL(reuseport_detach_sock); -static struct sock 
*run_bpf(struct sock_reuseport *reuse, u16 socks, - struct bpf_prog *prog, struct sk_buff *skb, - int hdr_len) +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) { struct sk_buff *nskb = NULL; u32 index; @@ -282,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk, /* paired with smp_wmb() in reuseport_add_sock() */ smp_rmb(); - if (prog && skb) - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); +select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; @@ -296,12 +302,21 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); -struct bpf_prog * -reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; + if (sk_unhashed(sk) && sk->sk_reuseport) { + int err = reuseport_alloc(sk, false); + + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); @@ -310,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); - return old_prog; + sk_reuseport_prog_free(old_prog); + return 0; } EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 370e24463fb7..f5c9ef2586de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net, 
saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each_rcu(sk, &ilb->head) { @@ -352,12 +353,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 038dd7909051..f4e35b2ff8b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -499,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -513,6 +515,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 595ad408dba0..3d7c7460a0c5 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with in6addr_any */ @@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: 
sk_for_each(sk, &ilb->head) { @@ -214,12 +215,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6b96956a8ed..83f4c77c79d8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -249,6 +251,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } -- cgit v1.2.3 From 98ed1e642c451b3ed9f4c5785b291a3fc9e82166 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 9 Aug 2018 12:00:49 +0100 Subject: rxrpc: remove redundant static int 'zero' The static int 'zero' is defined but is never used hence it is redundant and can be removed. The use of this variable was removed with commit a158bdd3247b ("rxrpc: Fix call timeouts"). Cleans up clang warning: warning: 'zero' defined but not used [-Wunused-const-variable=] Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- net/rxrpc/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 4a7af7aff37d..d75bd15151e6 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -15,7 +15,6 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; -static const unsigned int zero = 0; static const unsigned int one = 1; static const unsigned int four = 4; static const unsigned int thirtytwo = 32; -- cgit v1.2.3 From 466466dc6c28ca9dc401f10e235b9cde9a7c9162 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Aug 2018 09:38:09 -0700 Subject: tcp: mandate a one-time immediate ACK Add a new flag to indicate a one-time immediate ACK. This flag is occasionally set under specific TCP protocol states in addition to the more common quickack mechanism for interactive applications. In several cases in the TCP code we want to force an immediate ACK but do not want to call tcp_enter_quickack_mode() because we do not want to forget the icsk_ack.pingpong or icsk_ack.ato state. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp_input.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0a6c9e0f2b5a..fa43b82607d9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -167,7 +167,8 @@ enum inet_csk_ack_state_t { ICSK_ACK_SCHED = 1, ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, - ICSK_ACK_PUSHED2 = 8 + ICSK_ACK_PUSHED2 = 8, + ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ }; void inet_csk_init_xmit_timers(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 715d541b52dd..b8849588c440 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5179,7 +5179,9 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ - tcp_in_quickack_mode(sk)) { + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { send_now: tcp_send_ack(sk); return; -- cgit v1.2.3 From d2ccd7bc8acdcb9166c07a0255fb85bf877edb1f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Aug 2018 09:38:10 -0700 Subject: tcp: avoid resetting ACK timer in DCTCP The recent fix of acking immediately in DCTCP on CE status change has an undesirable side-effect: it also resets TCP ack timer and disables pingpong mode (interactive session). But the CE status change has nothing to do with them. This patch addresses that by using the new one-time immediate ACK flag instead of calling tcp_enter_quickack_mode(). Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_dctcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 8b637f9f23a2..ca61e2a659e7 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -136,7 +136,7 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) */ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) __tcp_send_ack(sk, ca->prior_rcv_nxt); - tcp_enter_quickack_mode(sk, 1); + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -157,7 +157,7 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) */ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) __tcp_send_ack(sk, ca->prior_rcv_nxt); - tcp_enter_quickack_mode(sk, 1); + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } ca->prior_rcv_nxt = tp->rcv_nxt; -- cgit v1.2.3 From 15bdd5686c2c61373680b9015e95abf31778e4fd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Aug 2018 09:38:11 -0700 Subject: tcp: always ACK immediately on hole repairs RFC 5681 sec 4.2: To provide feedback to senders recovering from losses, the receiver SHOULD send an immediate ACK when it receives a data segment that fills in all or part of a gap in the sequence space. When a gap is partially filled, __tcp_ack_snd_check already checks the out-of-order queue and correctly send an immediate ACK. However when a gap is fully filled, the previous implementation only resets pingpong mode which does not guarantee an immediate ACK because the quick ACK counter may be zero. This patch addresses this issue by marking the one-time immediate ACK flag instead. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b8849588c440..9a09ff3afef2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4735,11 +4735,11 @@ queue_and_out: if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); - /* RFC2581. 4.2. SHOULD send immediate ACK, when + /* RFC5681. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) - inet_csk(sk)->icsk_ack.pingpong = 0; + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } if (tp->rx_opt.num_sacks) -- cgit v1.2.3 From fd2123a3d7527d4c7092633d55e877c0cc1d84a3 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Aug 2018 09:38:12 -0700 Subject: tcp: avoid resetting ACK timer upon receiving packet with ECN CWR flag Previously commit 9aee40006190 ("tcp: ack immediately when a cwr packet arrives") calls tcp_enter_quickack_mode to force sending two immediate ACKs upon receiving a packet w/ CWR flag. The side effect is it'll also reset the delayed ACK timer and interactive session tracking. This patch removes that side effect by using the new ACK_NOW flag to force an immediate ACK. Packetdrill to demonstrate: 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < [ect0] SEW 0:0(0) win 32792 +0 > SE. 0:0(0) ack 1 +.1 < [ect0] . 1:1(0) ack 1 win 257 +0 accept(3, ..., ...) = 4 +0 < [ect0] . 1:1001(1000) ack 1 win 257 +0 > [ect01] . 1:1(0) ack 1001 +0 write(4, ..., 1) = 1 +0 > [ect01] P. 1:2(1) ack 1001 +0 < [ect0] . 1001:2001(1000) ack 2 win 257 +0 write(4, ..., 1) = 1 +0 > [ect01] P. 2:3(1) ack 2001 +0 < [ect0] . 2001:3001(1000) ack 3 win 257 +0 < [ect0] . 3001:4001(1000) ack 3 win 257 // Ack delayed ... +.01 < [ce] P. 4001:4501(500) ack 3 win 257 +0 > [ect01] . 
3:3(0) ack 4001 +0 > [ect01] E. 3:3(0) ack 4501 +.001 read(4, ..., 4500) = 4500 +0 write(4, ..., 1) = 1 +0 > [ect01] PE. 3:4(1) ack 4501 win 100 +.01 < [ect0] W. 4501:5501(1000) ack 4 win 257 // No delayed ACK on CWR flag +0 > [ect01] . 4:4(0) ack 5501 +.31 < [ect0] . 5501:6501(1000) ack 4 win 257 +0 > [ect01] . 4:4(0) ack 6501 Fixes: 9aee40006190 ("tcp: ack immediately when a cwr packet arrives") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9a09ff3afef2..4c2dd9f863f7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -245,16 +245,16 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) { - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ - tcp_enter_quickack_mode((struct sock *)tp, 2); + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } @@ -4703,7 +4703,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - tcp_ecn_accept_cwr(tp, skb); + tcp_ecn_accept_cwr(sk, skb); tp->rx_opt.dsack = 0; -- cgit v1.2.3 From d6a61ec936676dbe25a6eb76e1229787dc2fbba8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:21:55 +0200 Subject: l2tp: define l2tp_tunnel_uses_xfrm() Use helper function to figure out if a tunnel is using ipsec. Also, avoid accessing ->sk_policy directly since it's RCU protected. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.h | 19 +++++++++++++++++++ net/l2tp/l2tp_netlink.c | 7 +------ net/l2tp/l2tp_ppp.c | 5 +---- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 5804065dfbfb..04a9488c54b4 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -15,6 +15,10 @@ #include #include +#ifdef CONFIG_XFRM +#include +#endif + /* Just some random numbers */ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D @@ -284,6 +288,21 @@ static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel) return mtu; } +#ifdef CONFIG_XFRM +static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) +{ + struct sock *sk = tunnel->sock; + + return sk && (rcu_access_pointer(sk->sk_policy[0]) || + rcu_access_pointer(sk->sk_policy[1])); +} +#else +static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) +{ + return false; +} +#endif + #define l2tp_printk(ptr, type, func, fmt, ...) 
\ do { \ if (((ptr)->debug) & (type)) \ diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 2e1e92651545..357503e5acd5 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -710,9 +710,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl void *hdr; struct nlattr *nest; struct l2tp_tunnel *tunnel = session->tunnel; - struct sock *sk = NULL; - - sk = tunnel->sock; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) @@ -738,10 +735,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || -#ifdef CONFIG_XFRM - (((sk) && (sk->sk_policy[0] || sk->sk_policy[1])) && + (l2tp_tunnel_uses_xfrm(tunnel) && nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) || -#endif (session->reorder_timeout && nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout, L2TP_ATTR_PAD))) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 6e2c8e7595e0..c33ef9a3f3b5 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -95,7 +95,6 @@ #include #include #include -#include #include #include @@ -1153,9 +1152,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, l2tp_session_dec_refcount(session); break; } -#ifdef CONFIG_XFRM - stats.using_ipsec = (sk->sk_policy[0] || sk->sk_policy[1]) ? 1 : 0; -#endif + stats.using_ipsec = l2tp_tunnel_uses_xfrm(tunnel); pppol2tp_copy_stats(&stats, &tunnel->stats); if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) { err = -EFAULT; -- cgit v1.2.3 From 01e28b921b19cb99a09dda89ab0e5dc49bf4ab38 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:21:57 +0200 Subject: l2tp: split l2tp_session_get() l2tp_session_get() is used for two different purposes. 
If 'tunnel' is NULL, the session is searched globally in the supplied network namespace. Otherwise it is searched exclusively in the tunnel context. Callers always know the context in which they need to search the session. But some of them do provide both a namespace and a tunnel, making the semantic of the call unclear. This patch defines l2tp_tunnel_get_session() for lookups done in a tunnel and restricts l2tp_session_get() to namespace searches. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 50 ++++++++++++++++++++++++------------------------- net/l2tp/l2tp_core.h | 6 +++--- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/l2tp/l2tp_netlink.c | 4 ++-- net/l2tp/l2tp_ppp.c | 8 ++++---- 6 files changed, 36 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ac6a00bcec71..2bd701a58aa6 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -203,44 +203,44 @@ struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) } EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth); -/* Lookup a session. A new reference is held on the returned session. 
*/ -struct l2tp_session *l2tp_session_get(const struct net *net, - struct l2tp_tunnel *tunnel, - u32 session_id) +struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, + u32 session_id) { struct hlist_head *session_list; struct l2tp_session *session; - if (!tunnel) { - struct l2tp_net *pn = l2tp_pernet(net); - - session_list = l2tp_session_id_hash_2(pn, session_id); + session_list = l2tp_session_id_hash(tunnel, session_id); - rcu_read_lock_bh(); - hlist_for_each_entry_rcu(session, session_list, global_hlist) { - if (session->session_id == session_id) { - l2tp_session_inc_refcount(session); - rcu_read_unlock_bh(); + read_lock_bh(&tunnel->hlist_lock); + hlist_for_each_entry(session, session_list, hlist) + if (session->session_id == session_id) { + l2tp_session_inc_refcount(session); + read_unlock_bh(&tunnel->hlist_lock); - return session; - } + return session; } - rcu_read_unlock_bh(); + read_unlock_bh(&tunnel->hlist_lock); - return NULL; - } + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_get_session); - session_list = l2tp_session_id_hash(tunnel, session_id); - read_lock_bh(&tunnel->hlist_lock); - hlist_for_each_entry(session, session_list, hlist) { +struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id) +{ + struct hlist_head *session_list; + struct l2tp_session *session; + + session_list = l2tp_session_id_hash_2(l2tp_pernet(net), session_id); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(session, session_list, global_hlist) if (session->session_id == session_id) { l2tp_session_inc_refcount(session); - read_unlock_bh(&tunnel->hlist_lock); + rcu_read_unlock_bh(); return session; } - } - read_unlock_bh(&tunnel->hlist_lock); + rcu_read_unlock_bh(); return NULL; } @@ -872,7 +872,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) } /* Find the session context */ - session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id); + session = l2tp_tunnel_get_session(tunnel, session_id); if 
(!session || !session->recv_skb) { if (session) l2tp_session_dec_refcount(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 04a9488c54b4..8480a0af973e 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -196,12 +196,12 @@ static inline void *l2tp_session_priv(struct l2tp_session *session) struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); +struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, + u32 session_id); void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); -struct l2tp_session *l2tp_session_get(const struct net *net, - struct l2tp_tunnel *tunnel, - u32 session_id); +struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 0bc39cc20a3f..35f6f86d4dcc 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -144,7 +144,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_get(net, NULL, session_id); + session = l2tp_session_get(net, session_id); if (!session) goto discard; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 42f828cf62fb..237f1a4a0b0c 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -157,7 +157,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. 
*/ - session = l2tp_session_get(net, NULL, session_id); + session = l2tp_session_get(net, session_id); if (!session) goto discard; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 357503e5acd5..edbd5d1fbcde 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -66,7 +66,7 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (tunnel) { - session = l2tp_session_get(net, tunnel, session_id); + session = l2tp_tunnel_get_session(tunnel, session_id); l2tp_tunnel_dec_refcount(tunnel); } } @@ -627,7 +627,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf &cfg); if (ret >= 0) { - session = l2tp_session_get(net, tunnel, session_id); + session = l2tp_tunnel_get_session(tunnel, session_id); if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index c33ef9a3f3b5..cd43d02484e4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -757,7 +757,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = info.peer_tunnel_id; - session = l2tp_session_get(sock_net(sk), tunnel, info.session_id); + session = l2tp_tunnel_get_session(tunnel, info.session_id); if (session) { drop_refcnt = true; @@ -1134,10 +1134,10 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, } if (stats.session_id != 0) { /* resend to session ioctl handler */ - struct l2tp_session *session = - l2tp_session_get(sock_net(sk), tunnel, - stats.session_id); + struct l2tp_session *session; + session = l2tp_tunnel_get_session(tunnel, + stats.session_id); if (!session) { err = -EBADR; break; -- cgit v1.2.3 From bdd0292f96e43de46283ea0efdef8d13b4ffe895 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:21:58 
+0200 Subject: l2tp: simplify pppol2tp_ioctl() * Drop test on 'sk': sock->sk cannot be NULL, or pppox_ioctl() could not have called us. * Drop test on 'SOCK_DEAD' state: if this flag was set, the socket would be in the process of being released and no ioctl could be running anymore. * Drop test on 'PPPOX_*' state: we depend on ->sk_user_data to get the session structure. If it is non-NULL, then the socket is connected. Testing for PPPOX_* is redundant. * Retrieve session using ->sk_user_data directly, instead of going through pppol2tp_sock_to_session(). This avoids grabbing a useless reference on the socket. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index cd43d02484e4..e3ed8d473d91 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1179,28 +1179,12 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; - int err; - - if (!sk) - return 0; - - err = -EBADF; - if (sock_flag(sk, SOCK_DEAD) != 0) - goto end; - - err = -ENOTCONN; - if ((sk->sk_user_data == NULL) || - (!(sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)))) - goto end; - /* Get session context from the socket */ - err = -EBADF; - session = pppol2tp_sock_to_session(sk); - if (session == NULL) - goto end; + session = sock->sk->sk_user_data; + if (!session) + return -ENOTCONN; /* Special case: if session's session_id is zero, treat ioctl as a * tunnel ioctl @@ -1208,16 +1192,11 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, if ((session->session_id == 0) && (session->peer_session_id == 0)) { tunnel = session->tunnel; - err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg); - goto end_put_sess; - } - err = 
pppol2tp_session_ioctl(session, cmd, arg); + return pppol2tp_tunnel_ioctl(tunnel, cmd, arg); + } -end_put_sess: - sock_put(sk); -end: - return err; + return pppol2tp_session_ioctl(session, cmd, arg); } /***************************************************************************** -- cgit v1.2.3 From 79e6760e64d1b69a20af4d97ead291159d4c11c2 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:21:58 +0200 Subject: l2tp: handle PPPIOC[GS]MRU and PPPIOC[GS]FLAGS in pppol2tp_ioctl() Let pppol2tp_ioctl() handle ioctl commands directly. It still relies on pppol2tp_{session,tunnel}_ioctl() for PPPIOCGL2TPSTATS. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 73 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e3ed8d473d91..f4ec6b2a093e 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1045,7 +1045,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, { int err = 0; struct sock *sk; - int val = (int) arg; struct l2tp_tunnel *tunnel = session->tunnel; struct pppol2tp_ioc_stats stats; @@ -1058,22 +1057,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, return -EBADR; switch (cmd) { - case PPPIOCGMRU: - case PPPIOCGFLAGS: - err = -EFAULT; - if (put_user(0, (int __user *)arg)) - break; - err = 0; - break; - - case PPPIOCSMRU: - case PPPIOCSFLAGS: - err = -EFAULT; - if (get_user(val, (int __user *)arg)) - break; - err = 0; - break; - case PPPIOCGL2TPSTATS: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) @@ -1180,23 +1163,55 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct l2tp_session *session; - struct l2tp_tunnel *tunnel; + int val; + + switch (cmd) { + case PPPIOCGMRU: + case PPPIOCGFLAGS: + session = sock->sk->sk_user_data; + if (!session) + return -ENOTCONN; - session = 
sock->sk->sk_user_data; - if (!session) - return -ENOTCONN; + /* Not defined for tunnels */ + if (!session->session_id && !session->peer_session_id) + return -ENOSYS; - /* Special case: if session's session_id is zero, treat ioctl as a - * tunnel ioctl - */ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { - tunnel = session->tunnel; + if (put_user(0, (int __user *)arg)) + return -EFAULT; + break; + + case PPPIOCSMRU: + case PPPIOCSFLAGS: + session = sock->sk->sk_user_data; + if (!session) + return -ENOTCONN; - return pppol2tp_tunnel_ioctl(tunnel, cmd, arg); + /* Not defined for tunnels */ + if (!session->session_id && !session->peer_session_id) + return -ENOSYS; + + if (get_user(val, (int __user *)arg)) + return -EFAULT; + break; + + case PPPIOCGL2TPSTATS: + session = sock->sk->sk_user_data; + if (!session) + return -ENOTCONN; + + /* Session 0 represents the parent tunnel */ + if (!session->session_id && !session->peer_session_id) + return pppol2tp_tunnel_ioctl(session->tunnel, cmd, + arg); + else + return pppol2tp_session_ioctl(session, cmd, arg); + break; + + default: + return -ENOSYS; } - return pppol2tp_session_ioctl(session, cmd, arg); + return 0; } /***************************************************************************** -- cgit v1.2.3 From 528534f0deda05c668756313a22974429a9df05a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:22:00 +0200 Subject: l2tp: remove pppol2tp_tunnel_ioctl() Handle PPPIOCGL2TPSTATS in pppol2tp_ioctl() if the socket represents a tunnel. This one is a bit special because the caller may use the tunnel socket to retrieve statistics of one of its sessions. If the session_id is set, the corresponding session's statistics are returned, instead of those of the tunnel. This is handled by the new pppol2tp_tunnel_copy_stats() helper function. Set ->tunnel_id and ->using_ipsec out of the conditional, so that it can be used by the 'else' branch in the following patch. 
We cannot do that for ->session_id, because tunnel sockets have to report the value that was originally passed in 'stats.session_id', while session sockets have to report their own session_id. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 132 +++++++++++++++++++++------------------------------- 1 file changed, 53 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f4ec6b2a093e..2afd3ab8a551 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1038,6 +1038,36 @@ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, dest->rx_errors = atomic_long_read(&stats->rx_errors); } +static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, + struct l2tp_tunnel *tunnel) +{ + struct l2tp_session *session; + + if (!stats->session_id) { + memset(stats, 0, sizeof(*stats)); + pppol2tp_copy_stats(stats, &tunnel->stats); + return 0; + } + + /* If session_id is set, search the corresponding session in the + * context of this tunnel and record the session's statistics. + */ + session = l2tp_tunnel_get_session(tunnel, stats->session_id); + if (!session) + return -EBADR; + + if (session->pwtype != L2TP_PWTYPE_PPP) { + l2tp_session_dec_refcount(session); + return -EBADR; + } + + memset(stats, 0, sizeof(*stats)); + pppol2tp_copy_stats(stats, &session->stats); + l2tp_session_dec_refcount(session); + + return 0; +} + /* Session ioctl helper. */ static int pppol2tp_session_ioctl(struct l2tp_session *session, @@ -1084,84 +1114,10 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session, return err; } -/* Tunnel ioctl helper. - * - * Note the special handling for PPPIOCGL2TPSTATS below. If the ioctl data - * specifies a session_id, the session ioctl handler is called. This allows an - * application to retrieve session stats via a tunnel socket. 
- */ -static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, - unsigned int cmd, unsigned long arg) -{ - int err = 0; - struct sock *sk; - struct pppol2tp_ioc_stats stats; - - l2tp_dbg(tunnel, L2TP_MSG_CONTROL, - "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n", - tunnel->name, cmd, arg); - - sk = tunnel->sock; - sock_hold(sk); - - switch (cmd) { - case PPPIOCGL2TPSTATS: - err = -ENXIO; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - if (copy_from_user(&stats, (void __user *) arg, - sizeof(stats))) { - err = -EFAULT; - break; - } - if (stats.session_id != 0) { - /* resend to session ioctl handler */ - struct l2tp_session *session; - - session = l2tp_tunnel_get_session(tunnel, - stats.session_id); - if (!session) { - err = -EBADR; - break; - } - if (session->pwtype != L2TP_PWTYPE_PPP) { - l2tp_session_dec_refcount(session); - err = -EBADR; - break; - } - - err = pppol2tp_session_ioctl(session, cmd, arg); - l2tp_session_dec_refcount(session); - break; - } - stats.using_ipsec = l2tp_tunnel_uses_xfrm(tunnel); - pppol2tp_copy_stats(&stats, &tunnel->stats); - if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) { - err = -EFAULT; - break; - } - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n", - tunnel->name); - err = 0; - break; - - default: - err = -ENOSYS; - break; - } - - sock_put(sk); - - return err; -} - -/* Main ioctl() handler. - * Dispatch to tunnel or session helpers depending on the socket. 
- */ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + struct pppol2tp_ioc_stats stats; struct l2tp_session *session; int val; @@ -1200,11 +1156,29 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, return -ENOTCONN; /* Session 0 represents the parent tunnel */ - if (!session->session_id && !session->peer_session_id) - return pppol2tp_tunnel_ioctl(session->tunnel, cmd, - arg); - else + if (!session->session_id && !session->peer_session_id) { + u32 session_id; + int err; + + if (copy_from_user(&stats, (void __user *)arg, + sizeof(stats))) + return -EFAULT; + + session_id = stats.session_id; + err = pppol2tp_tunnel_copy_stats(&stats, + session->tunnel); + if (err < 0) + return err; + + stats.session_id = session_id; + } else { return pppol2tp_session_ioctl(session, cmd, arg); + } + stats.tunnel_id = session->tunnel->tunnel_id; + stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel); + + if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) + return -EFAULT; break; default: -- cgit v1.2.3 From b0e29063dcb3bf14f515f95e748b60e4bab45e7c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:22:01 +0200 Subject: l2tp: remove pppol2tp_session_ioctl() pppol2tp_ioctl() has everything in place for handling PPPIOCGL2TPSTATS on session sockets. We just need to copy the stats and set ->session_id. As a side effect of sharing session and tunnel code, ->using_ipsec is properly set even when the request was made using a session socket. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/uapi/linux/ppp-ioctl.h | 2 +- net/l2tp/l2tp_ppp.c | 50 +++--------------------------------------- 2 files changed, 4 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/ppp-ioctl.h b/include/uapi/linux/ppp-ioctl.h index 784c2e3e572e..88b5f9990320 100644 --- a/include/uapi/linux/ppp-ioctl.h +++ b/include/uapi/linux/ppp-ioctl.h @@ -68,7 +68,7 @@ struct ppp_option_data { struct pppol2tp_ioc_stats { __u16 tunnel_id; /* redundant */ __u16 session_id; /* if zero, get tunnel stats */ - __u32 using_ipsec:1; /* valid only for session_id == 0 */ + __u32 using_ipsec:1; __aligned_u64 tx_packets; __aligned_u64 tx_bytes; __aligned_u64 tx_errors; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 2afd3ab8a551..bdfbd3ed7e14 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1068,52 +1068,6 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, return 0; } -/* Session ioctl helper. - */ -static int pppol2tp_session_ioctl(struct l2tp_session *session, - unsigned int cmd, unsigned long arg) -{ - int err = 0; - struct sock *sk; - struct l2tp_tunnel *tunnel = session->tunnel; - struct pppol2tp_ioc_stats stats; - - l2tp_dbg(session, L2TP_MSG_CONTROL, - "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n", - session->name, cmd, arg); - - sk = pppol2tp_session_get_sock(session); - if (!sk) - return -EBADR; - - switch (cmd) { - case PPPIOCGL2TPSTATS: - err = -ENXIO; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - memset(&stats, 0, sizeof(stats)); - stats.tunnel_id = tunnel->tunnel_id; - stats.session_id = session->session_id; - pppol2tp_copy_stats(&stats, &session->stats); - if (copy_to_user((void __user *) arg, &stats, - sizeof(stats))) - break; - l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n", - session->name); - err = 0; - break; - - default: - err = -ENOSYS; - break; - } - - sock_put(sk); - - return err; -} - static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, 
unsigned long arg) { @@ -1172,7 +1126,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, stats.session_id = session_id; } else { - return pppol2tp_session_ioctl(session, cmd, arg); + memset(&stats, 0, sizeof(stats)); + pppol2tp_copy_stats(&stats, &session->stats); + stats.session_id = session->session_id; } stats.tunnel_id = session->tunnel->tunnel_id; stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel); -- cgit v1.2.3 From 7390ed8a405013d0a7e1f4dc8ac495e0ac04996f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:22:02 +0200 Subject: l2tp: zero out stats in pppol2tp_copy_stats() Integrate memset(0) in pppol2tp_copy_stats() to avoid calling it manually every time. While there, constify 'stats'. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index bdfbd3ed7e14..e2eea60bf875 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1026,8 +1026,10 @@ end: ****************************************************************************/ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, - struct l2tp_stats *stats) + const struct l2tp_stats *stats) { + memset(dest, 0, sizeof(*dest)); + dest->tx_packets = atomic_long_read(&stats->tx_packets); dest->tx_bytes = atomic_long_read(&stats->tx_bytes); dest->tx_errors = atomic_long_read(&stats->tx_errors); @@ -1044,7 +1046,6 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, struct l2tp_session *session; if (!stats->session_id) { - memset(stats, 0, sizeof(*stats)); pppol2tp_copy_stats(stats, &tunnel->stats); return 0; } @@ -1061,7 +1062,6 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, return -EBADR; } - memset(stats, 0, sizeof(*stats)); pppol2tp_copy_stats(stats, &session->stats); l2tp_session_dec_refcount(session); @@ -1126,7 +1126,6 @@ static 
int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, stats.session_id = session_id; } else { - memset(&stats, 0, sizeof(stats)); pppol2tp_copy_stats(&stats, &session->stats); stats.session_id = session->session_id; } -- cgit v1.2.3 From 4f5f85e9a70e13c8919e26609914253d18fbf858 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 10 Aug 2018 13:22:03 +0200 Subject: l2tp: let pppol2tp_ioctl() fallback to dev_ioctl() Return -ENOIOCTLCMD for unknown ioctl commands. This lets dev_ioctl() handle generic socket ioctls like SIOCGIFNAME or SIOCGIFINDEX. PF_PPPOX/PX_PROTO_OL2TP was one of the few socket types not honouring this mechanism. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e2eea60bf875..62f2d3f1e431 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1137,7 +1137,7 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, break; default: - return -ENOSYS; + return -ENOIOCTLCMD; } return 0; -- cgit v1.2.3 From 05364ca03cfd419caecb292fede20eb39667eaae Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Fri, 10 Aug 2018 20:11:42 +0300 Subject: net/sctp: Make wrappers for accessing in/out streams This patch introduces wrappers for accessing in/out streams indirectly. This will make it possible to replace physically contiguous memory arrays of streams with flexible arrays (or maybe any other appropriate mechanism) which allocate memory on a per-page basis. Signed-off-by: Oleg Babin Signed-off-by: Konstantin Khorenko Signed-off-by: David S.
Miller --- include/net/sctp/structs.h | 35 +++++++++++++++++------- net/sctp/chunk.c | 6 ++-- net/sctp/outqueue.c | 11 ++++---- net/sctp/socket.c | 4 +-- net/sctp/stream.c | 65 +++++++++++++++++++++++--------------------- net/sctp/stream_interleave.c | 20 +++++++------- net/sctp/stream_sched.c | 13 +++++---- net/sctp/stream_sched_prio.c | 22 +++++++-------- net/sctp/stream_sched_rr.c | 8 +++--- 9 files changed, 103 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ab869e0d8326..6b2b8df8a1d2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -398,37 +398,35 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); /* What is the current SSN number for this stream? */ #define sctp_ssn_peek(stream, type, sid) \ - ((stream)->type[sid].ssn) + (sctp_stream_##type((stream), (sid))->ssn) /* Return the next SSN number for this stream. */ #define sctp_ssn_next(stream, type, sid) \ - ((stream)->type[sid].ssn++) + (sctp_stream_##type((stream), (sid))->ssn++) /* Skip over this ssn and all below. */ #define sctp_ssn_skip(stream, type, sid, ssn) \ - ((stream)->type[sid].ssn = ssn + 1) + (sctp_stream_##type((stream), (sid))->ssn = ssn + 1) /* What is the current MID number for this stream? */ #define sctp_mid_peek(stream, type, sid) \ - ((stream)->type[sid].mid) + (sctp_stream_##type((stream), (sid))->mid) /* Return the next MID number for this stream. */ #define sctp_mid_next(stream, type, sid) \ - ((stream)->type[sid].mid++) + (sctp_stream_##type((stream), (sid))->mid++) /* Skip over this mid and all below. */ #define sctp_mid_skip(stream, type, sid, mid) \ - ((stream)->type[sid].mid = mid + 1) - -#define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid]) + (sctp_stream_##type((stream), (sid))->mid = mid + 1) /* What is the current MID_uo number for this stream? 
*/ #define sctp_mid_uo_peek(stream, type, sid) \ - ((stream)->type[sid].mid_uo) + (sctp_stream_##type((stream), (sid))->mid_uo) /* Return the next MID_uo number for this stream. */ #define sctp_mid_uo_next(stream, type, sid) \ - ((stream)->type[sid].mid_uo++) + (sctp_stream_##type((stream), (sid))->mid_uo++) /* * Pointers to address related SCTP functions. @@ -1463,6 +1461,23 @@ struct sctp_stream { struct sctp_stream_interleave *si; }; +static inline struct sctp_stream_out *sctp_stream_out( + const struct sctp_stream *stream, + __u16 sid) +{ + return ((struct sctp_stream_out *)(stream->out)) + sid; +} + +static inline struct sctp_stream_in *sctp_stream_in( + const struct sctp_stream *stream, + __u16 sid) +{ + return ((struct sctp_stream_in *)(stream->in)) + sid; +} + +#define SCTP_SO(s, i) sctp_stream_out((s), (i)) +#define SCTP_SI(s, i) sctp_stream_in((s), (i)) + #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPEN 0x01 diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index bfb9f812e2ef..ce8087846f05 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -325,7 +325,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = - &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; + SCTP_SO(&chunk->asoc->stream, + chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; @@ -339,7 +340,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = - &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; + SCTP_SO(&chunk->asoc->stream, + chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 
d68aa33485a9..d74d00b29942 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -80,7 +80,7 @@ static inline void sctp_outq_head_data(struct sctp_outq *q, q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); - oute = q->asoc->stream.out[stream].ext; + oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add(&ch->stream_list, &oute->outq); } @@ -101,7 +101,7 @@ static inline void sctp_outq_tail_data(struct sctp_outq *q, q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); - oute = q->asoc->stream.out[stream].ext; + oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add_tail(&ch->stream_list, &oute->outq); } @@ -372,7 +372,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); - streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; + streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; @@ -416,7 +416,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { struct sctp_stream_out *streamout = - &asoc->stream.out[chk->sinfo.sinfo_stream]; + SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } @@ -1082,6 +1082,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); + __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state; /* Has this chunk expired? 
*/ if (sctp_chunk_abandoned(chunk)) { @@ -1091,7 +1092,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, continue; } - if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { + if (stream_state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(ctx->q, chunk); break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 502c0d7cb105..e96b15a66aba 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1911,7 +1911,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, goto err; } - if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) { err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); if (err) goto err; @@ -7154,7 +7154,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamoute = asoc->stream.out[params.sprstat_sid].ext; + streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext; if (!streamoute) { /* Not allocated yet, means all stats are 0 */ params.sprstat_abandoned_unsent = 0; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f1f1d1b232ba..7ca6fe4e7882 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -162,7 +162,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; sched->init(stream); @@ -193,7 +193,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; - stream->out[sid].ext = soute; + SCTP_SO(stream, sid)->ext = soute; return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } @@ -205,7 +205,7 @@ void sctp_stream_free(struct sctp_stream *stream) sched->free(stream); for (i = 0; i < stream->outcnt; i++) - kfree(stream->out[i].ext); + kfree(SCTP_SO(stream, i)->ext); 
kfree(stream->out); kfree(stream->in); } @@ -215,12 +215,12 @@ void sctp_stream_clear(struct sctp_stream *stream) int i; for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + SCTP_SO(stream, i)->mid = 0; + SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) @@ -273,8 +273,8 @@ static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); - if (stream->out[sid].ext && - !list_empty(&stream->out[sid].ext->outq)) + if (SCTP_SO(stream, sid)->ext && + !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } @@ -361,11 +361,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc, if (out) { if (str_nums) for (i = 0; i < str_nums; i++) - stream->out[str_list[i]].state = + SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_CLOSED; + SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; @@ -380,11 +380,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc, if (str_nums) for (i = 0; i < str_nums; i++) - stream->out[str_list[i]].state = + SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } @@ -418,7 +418,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_CLOSED; + SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); @@ -429,7 +429,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) asoc->strreset_chunk = NULL; for (i 
= 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } @@ -609,10 +609,10 @@ struct sctp_chunk *sctp_process_strreset_outreq( } for (i = 0; i < nums; i++) - stream->in[ntohs(str_p[i])].mid = 0; + SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; } else { for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; } result = SCTP_STRRESET_PERFORMED; @@ -683,11 +683,11 @@ struct sctp_chunk *sctp_process_strreset_inreq( if (nums) for (i = 0; i < nums; i++) - stream->out[ntohs(str_p[i])].state = + SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_CLOSED; + SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; @@ -786,11 +786,11 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * incoming and outgoing streams. */ for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + SCTP_SO(stream, i)->mid = 0; + SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; @@ -979,15 +979,18 @@ struct sctp_chunk *sctp_process_strreset_resp( sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { + struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { - stream->out[ntohs(str_p[i])].mid = 0; - stream->out[ntohs(str_p[i])].mid_uo = 0; + sout = SCTP_SO(stream, ntohs(str_p[i])); + sout->mid = 0; + sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + sout = SCTP_SO(stream, i); + sout->mid = 0; + sout->mid_uo = 0; } } @@ -995,7 +998,7 @@ struct sctp_chunk *sctp_process_strreset_resp( } for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp 
= sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); @@ -1050,15 +1053,15 @@ struct sctp_chunk *sctp_process_strreset_resp( asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + SCTP_SO(stream, i)->mid = 0; + SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); @@ -1072,7 +1075,7 @@ struct sctp_chunk *sctp_process_strreset_resp( if (result == SCTP_STRRESET_PERFORMED) for (i = number; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; else stream->outcnt = number; diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index d3764c181299..0a78cdf86463 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -197,7 +197,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial( __u32 next_fsn = 0; int is_last = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -278,7 +278,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( __u32 pd_len = 0; __u32 mid = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -368,7 +368,7 @@ static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, sctp_intl_store_reasm(ulpq, event); - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && 
event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); @@ -575,7 +575,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( __u32 next_fsn = 0; int is_last = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -659,7 +659,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( __u32 pd_len = 0; __u32 mid = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -750,7 +750,7 @@ static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, sctp_intl_store_reasm_uo(ulpq, event); - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); @@ -774,7 +774,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); - csin = sctp_stream_in(ulpq->asoc, cevent->stream); + csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; @@ -875,7 +875,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); - csin = sctp_stream_in(ulpq->asoc, cevent->stream); + csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; @@ -1053,7 +1053,7 @@ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { - struct sctp_stream_in *sin = &stream->in[sid]; + struct sctp_stream_in 
*sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { @@ -1247,7 +1247,7 @@ static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { - struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid); + struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index f5fcd425232a..a6c04a94b08f 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -161,7 +161,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, /* Give the next scheduler a clean slate. */ for (i = 0; i < asoc->stream.outcnt; i++) { - void *p = asoc->stream.out[i].ext; + void *p = SCTP_SO(&asoc->stream, i)->ext; if (!p) continue; @@ -175,7 +175,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { - if (!asoc->stream.out[i].ext) + if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); @@ -217,7 +217,7 @@ int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, if (sid >= asoc->stream.outcnt) return -EINVAL; - if (!asoc->stream.out[sid].ext) { + if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); @@ -234,7 +234,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, if (sid >= asoc->stream.outcnt) return -EINVAL; - if (!asoc->stream.out[sid].ext) + if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); @@ -252,7 +252,7 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) * priority stream comes in. 
*/ sid = sctp_chunk_stream_no(ch); - sout = &q->asoc->stream.out[sid]; + sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } @@ -272,8 +272,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; - INIT_LIST_HEAD(&stream->out[sid].ext->outq); + INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 7997d35dd0fd..2245083a98f2 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -75,10 +75,10 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head( /* No luck. So we search on all streams now. */ for (i = 0; i < stream->outcnt; i++) { - if (!stream->out[i].ext) + if (!SCTP_SO(stream, i)->ext) continue; - p = stream->out[i].ext->prio_head; + p = SCTP_SO(stream, i)->ext->prio_head; if (!p) /* Means all other streams won't be initialized * as well. @@ -165,7 +165,7 @@ static void sctp_sched_prio_sched(struct sctp_stream *stream, static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, __u16 prio, gfp_t gfp) { - struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out *sout = SCTP_SO(stream, sid); struct sctp_stream_out_ext *soute = sout->ext; struct sctp_stream_priorities *prio_head, *old; bool reschedule = false; @@ -186,7 +186,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, return 0; for (i = 0; i < stream->outcnt; i++) { - soute = stream->out[i].ext; + soute = SCTP_SO(stream, i)->ext; if (soute && soute->prio_head == old) /* It's still in use, nothing else to do here. 
*/ return 0; @@ -201,7 +201,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { - *value = stream->out[sid].ext->prio_head->prio; + *value = SCTP_SO(stream, sid)->ext->prio_head->prio; return 0; } @@ -215,7 +215,7 @@ static int sctp_sched_prio_init(struct sctp_stream *stream) static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { - INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list); return sctp_sched_prio_set(stream, sid, 0, gfp); } @@ -233,9 +233,9 @@ static void sctp_sched_prio_free(struct sctp_stream *stream) */ sctp_sched_prio_unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { - if (!stream->out[i].ext) + if (!SCTP_SO(stream, i)->ext) continue; - prio = stream->out[i].ext->prio_head; + prio = SCTP_SO(stream, i)->ext->prio_head; if (prio && list_empty(&prio->prio_sched)) list_add(&prio->prio_sched, &list); } @@ -255,7 +255,7 @@ static void sctp_sched_prio_enqueue(struct sctp_outq *q, ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; - sctp_sched_prio_sched(stream, stream->out[sid].ext); + sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) @@ -297,7 +297,7 @@ static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, * this priority. 
*/ sid = sctp_chunk_stream_no(ch); - soute = q->asoc->stream.out[sid].ext; + soute = SCTP_SO(&q->asoc->stream, sid)->ext; prio = soute->prio_head; sctp_sched_prio_next_stream(prio); @@ -317,7 +317,7 @@ static void sctp_sched_prio_sched_all(struct sctp_stream *stream) __u16 sid; sid = sctp_chunk_stream_no(ch); - sout = &stream->out[sid]; + sout = SCTP_SO(stream, sid); if (sout->ext) sctp_sched_prio_sched(stream, sout->ext); } diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index 1155692448f1..52ba743fa7a7 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -100,7 +100,7 @@ static int sctp_sched_rr_init(struct sctp_stream *stream) static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { - INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->rr_list); return 0; } @@ -120,7 +120,7 @@ static void sctp_sched_rr_enqueue(struct sctp_outq *q, ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; - sctp_sched_rr_sched(stream, stream->out[sid].ext); + sctp_sched_rr_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) @@ -154,7 +154,7 @@ static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, /* Last chunk on that msg, move to the next stream */ sid = sctp_chunk_stream_no(ch); - soute = q->asoc->stream.out[sid].ext; + soute = SCTP_SO(&q->asoc->stream, sid)->ext; sctp_sched_rr_next_stream(&q->asoc->stream); @@ -173,7 +173,7 @@ static void sctp_sched_rr_sched_all(struct sctp_stream *stream) __u16 sid; sid = sctp_chunk_stream_no(ch); - soute = stream->out[sid].ext; + soute = SCTP_SO(stream, sid)->ext; if (soute) sctp_sched_rr_sched(stream, soute); } -- cgit v1.2.3 From 0d493b4d0be352b5e361e4fa0bc3efe952d8b10e Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Fri, 10 Aug 2018 20:11:43 +0300 Subject: net/sctp: Replace in/out 
stream arrays with flex_array This patch replaces physically contiguous memory arrays allocated using kmalloc_array() with flexible arrays. This makes it possible to avoid memory allocation failures on systems under memory stress. Signed-off-by: Oleg Babin Signed-off-by: Konstantin Khorenko Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 9 ++--- net/sctp/stream.c | 88 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6b2b8df8a1d2..28a7c8e44636 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -57,6 +57,7 @@ #include /* This gets us atomic counters. */ #include /* We need sk_buff_head. */ #include /* We need tq_struct. */ +#include /* We need flex_array. */ #include /* We need sctp* header structs. */ #include /* We need auth specific structs */ #include /* For inet_skb_parm */ @@ -1438,8 +1439,8 @@ struct sctp_stream_in { }; struct sctp_stream { - struct sctp_stream_out *out; - struct sctp_stream_in *in; + struct flex_array *out; + struct flex_array *in; __u16 outcnt; __u16 incnt; /* Current stream being sent, if any */ @@ -1465,14 +1466,14 @@ static inline struct sctp_stream_out *sctp_stream_out( const struct sctp_stream *stream, __u16 sid) { - return ((struct sctp_stream_out *)(stream->out)) + sid; + return flex_array_get(stream->out, sid); } static inline struct sctp_stream_in *sctp_stream_in( const struct sctp_stream *stream, __u16 sid) { - return ((struct sctp_stream_in *)(stream->in)) + sid; + return flex_array_get(stream->in, sid); } #define SCTP_SO(s, i) sctp_stream_out((s), (i)) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 7ca6fe4e7882..ffb940d3b57c 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -37,6 +37,53 @@ #include #include +static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count, + gfp_t gfp) +{ + struct flex_array *result; + int err; +
+ result = flex_array_alloc(elem_size, elem_count, gfp); + if (result) { + err = flex_array_prealloc(result, 0, elem_count, gfp); + if (err) { + flex_array_free(result); + result = NULL; + } + } + + return result; +} + +static void fa_free(struct flex_array *fa) +{ + if (fa) + flex_array_free(fa); +} + +static void fa_copy(struct flex_array *fa, struct flex_array *from, + size_t index, size_t count) +{ + void *elem; + + while (count--) { + elem = flex_array_get(from, index); + flex_array_put(fa, index, elem, 0); + index++; + } +} + +static void fa_zero(struct flex_array *fa, size_t index, size_t count) +{ + void *elem; + + while (count--) { + elem = flex_array_get(fa, index); + memset(elem, 0, fa->element_size); + index++; + } +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. @@ -78,34 +125,33 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, * sctp_stream_update will swap ->out pointers. 
*/ for (i = 0; i < outcnt; i++) { - kfree(new->out[i].ext); - new->out[i].ext = stream->out[i].ext; - stream->out[i].ext = NULL; + kfree(SCTP_SO(new, i)->ext); + SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; + SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) - kfree(stream->out[i].ext); + kfree(SCTP_SO(stream, i)->ext); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { - struct sctp_stream_out *out; + struct flex_array *out; + size_t elem_size = sizeof(struct sctp_stream_out); - out = kmalloc_array(outcnt, sizeof(*out), gfp); + out = fa_alloc(elem_size, outcnt, gfp); if (!out) return -ENOMEM; if (stream->out) { - memcpy(out, stream->out, min(outcnt, stream->outcnt) * - sizeof(*out)); - kfree(stream->out); + fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt)); + fa_free(stream->out); } if (outcnt > stream->outcnt) - memset(out + stream->outcnt, 0, - (outcnt - stream->outcnt) * sizeof(*out)); + fa_zero(out, stream->outcnt, (outcnt - stream->outcnt)); stream->out = out; @@ -115,22 +161,20 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { - struct sctp_stream_in *in; - - in = kmalloc_array(incnt, sizeof(*stream->in), gfp); + struct flex_array *in; + size_t elem_size = sizeof(struct sctp_stream_in); + in = fa_alloc(elem_size, incnt, gfp); if (!in) return -ENOMEM; if (stream->in) { - memcpy(in, stream->in, min(incnt, stream->incnt) * - sizeof(*in)); - kfree(stream->in); + fa_copy(in, stream->in, 0, min(incnt, stream->incnt)); + fa_free(stream->in); } if (incnt > stream->incnt) - memset(in + stream->incnt, 0, - (incnt - stream->incnt) * sizeof(*in)); + fa_zero(in, stream->incnt, (incnt - stream->incnt)); stream->in = in; @@ -174,7 +218,7 @@ in: ret = sctp_stream_alloc_in(stream, incnt, gfp); if (ret) { sched->free(stream); - kfree(stream->out); + fa_free(stream->out); stream->out = 
NULL; stream->outcnt = 0; goto out; @@ -206,8 +250,8 @@ void sctp_stream_free(struct sctp_stream *stream) sched->free(stream); for (i = 0; i < stream->outcnt; i++) kfree(SCTP_SO(stream, i)->ext); - kfree(stream->out); - kfree(stream->in); + fa_free(stream->out); + fa_free(stream->in); } void sctp_stream_clear(struct sctp_stream *stream) -- cgit v1.2.3 From 2142236b45843dbcbe9691d24cf06caff91a78fd Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:41 +0300 Subject: net: sched: act_bpf: remove dependency on rtnl lock Use tcf spinlock to protect bpf action private data from concurrent modification during dump and init. Remove rtnl lock assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6203eb075c9a..9e8a33f9fee3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -143,11 +143,12 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, .index = prog->tcf_index, .refcnt = refcount_read(&prog->tcf_refcnt) - ref, .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, - .action = prog->tcf_action, }; struct tcf_t tm; int ret; + spin_lock(&prog->tcf_lock); + opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, TCA_ACT_BPF_PAD)) goto nla_put_failure; + spin_unlock(&prog->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } @@ -264,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); /* updates to prog->filter are prevented, since it's called either - * with rtnl lock or during final cleanup in rcu callback + * with tcf lock or during final cleanup in rcu callback */ 
cfg->filter = rcu_dereference_protected(prog->filter, 1); @@ -336,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, goto out; prog = to_bpf(*act); - ASSERT_RTNL(); + spin_lock(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); @@ -349,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->tcf_action = parm->action; rcu_assign_pointer(prog->filter, cfg.filter); + spin_unlock(&prog->tcf_lock); if (res == ACT_P_CREATED) { tcf_idr_insert(tn, *act); -- cgit v1.2.3 From b6a2b971c0b00253197682fbdf1c55fc0e2610a4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:42 +0300 Subject: net: sched: act_csum: remove dependency on rtnl lock Use tcf lock to protect csum action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_csum.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 648a3a35b720..f01c59ba6d12 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -50,7 +50,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); - struct tcf_csum_params *params_old, *params_new; + struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -88,20 +88,22 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } p = to_tcf_csum(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); return -ENOMEM; } - params_old = rtnl_dereference(p->params); + params_new->update_flags = parm->update_flags; + spin_lock(&p->tcf_lock); p->tcf_action = parm->action; - params_new->update_flags = parm->update_flags; - rcu_assign_pointer(p->params, params_new); - if (params_old) - kfree_rcu(params_old, rcu); + rcu_swap_protected(p->params, params_new, + lockdep_is_held(&p->tcf_lock)); + spin_unlock(&p->tcf_lock); + + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -599,11 +601,13 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, - .action = p->tcf_action, }; struct tcf_t t; - params = rtnl_dereference(p->params); + spin_lock(&p->tcf_lock); + params = rcu_dereference_protected(p->params, + lockdep_is_held(&p->tcf_lock)); + opt.action = p->tcf_action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int 
bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; + spin_unlock(&p->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From e8917f437006686b8fa1b9e54f31d7abc0ea7e97 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:43 +0300 Subject: net: sched: act_gact: remove dependency on rtnl lock Use tcf spinlock to protect gact action private state from concurrent modification during dump and init. Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_gact.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 661b72b9147d..bfccd34a3968 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -113,7 +113,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(*a); - ASSERT_RTNL(); + spin_lock(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { @@ -126,6 +126,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact->tcfg_ptype = p_parm->ptype; } #endif + spin_unlock(&gact->tcf_lock); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; @@ -178,10 +180,11 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, - .action = gact->tcf_action, }; struct tcf_t t; + spin_lock(&gact->tcf_lock); + opt.action = gact->tcf_action; if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; #ifdef CONFIG_GACT_PROB @@ -199,9 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto 
nla_put_failure; + spin_unlock(&gact->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 54d0d423a48aa0e61bb39665d20376ba7b940535 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:44 +0300 Subject: net: sched: act_ife: remove dependency on rtnl lock Use tcf spinlock and rcu to protect params pointer from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Ife action has meta-actions that are compiled as standalone modules. Rtnl mutex must be released while loading a kernel module. In order to support execution without rtnl mutex, propagate 'rtnl_held' argument to meta action loading functions. When requesting meta action module, conditionally release rtnl lock depending on 'rtnl_held' argument. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_ife.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index df4060e32d43..5d200495e467 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -268,7 +268,8 @@ static const char *ife_meta_id2name(u32 metaid) * under ife->tcf_lock for existing action */ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, - void *val, int len, bool exists) + void *val, int len, bool exists, + bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; @@ -278,9 +279,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, #ifdef CONFIG_MODULES if (exists) spin_unlock_bh(&ife->tcf_lock); - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); ops = find_ife_oplist(metaid); @@ -421,7 +424,7 @@ static void tcf_ife_cleanup(struct tc_action *a) /* under ife->tcf_lock for existing action */ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, - bool exists) + bool exists, bool rtnl_held) { int len = 0; int rc = 0; @@ -433,7 +436,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = load_metaops_and_vet(ife, i, val, len, exists); + rc = load_metaops_and_vet(ife, i, val, len, exists, + rtnl_held); if (rc != 0) return rc; @@ -454,7 +458,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; - struct tcf_ife_params *p, *p_old; + struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -558,7 +562,7 @@ metadata_parse_err: return err; } - err = populate_metalist(ife, 
tb2, exists); + err = populate_metalist(ife, tb2, exists, rtnl_held); if (err) goto metadata_parse_err; @@ -581,13 +585,13 @@ metadata_parse_err: } ife->tcf_action = parm->action; + /* protected by tcf_lock when modifying existing action */ + rcu_swap_protected(ife->params, p, 1); + if (exists) spin_unlock_bh(&ife->tcf_lock); - - p_old = rtnl_dereference(ife->params); - rcu_assign_pointer(ife->params, p); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -600,16 +604,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); - struct tcf_ife_params *p = rtnl_dereference(ife->params); + struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - .action = ife->tcf_action, - .flags = p->flags, }; struct tcf_t t; + spin_lock_bh(&ife->tcf_lock); + opt.action = ife->tcf_action; + p = rcu_dereference_protected(ife->params, + lockdep_is_held(&ife->tcf_lock)); + opt.flags = p->flags; + if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -635,9 +643,11 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, pr_info("Failed to dump metalist\n"); } + spin_unlock_bh(&ife->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&ife->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From ff25276de997f41197ebab91935627c249a30fc4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:45 +0300 Subject: net: sched: act_ipt: remove dependency on rtnl lock Use tcf spinlock to protect ipt action private data from concurrent modification during dump. Ipt init already takes tcf spinlock when modifying ipt state. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_ipt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 0dc787a57798..e149f0e66cb6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -288,6 +288,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, * for foolproof you need to not assume this */ + spin_lock_bh(&ipt->tcf_lock); t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) goto nla_put_failure; @@ -307,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + spin_unlock_bh(&ipt->tcf_lock); kfree(t); return skb->len; nla_put_failure: + spin_unlock_bh(&ipt->tcf_lock); nlmsg_trim(skb, b); kfree(t); return -1; -- cgit v1.2.3 From 67b0c1a3c9ced3726dea73000f8900f453fc894f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:46 +0300 Subject: net: sched: act_pedit: remove dependency on rtnl lock Rearrange pedit init code to only access pedit action data while holding tcf spinlock. Change keys allocation type to atomic to allow it to execute while holding tcf spinlock. Take tcf spinlock in dump function when accessing pedit action data. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 43ba999b2d23..3f62da72ab6a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -187,44 +187,38 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, tcf_idr_cleanup(tn, parm->index); goto out_free; } - p = to_pedit(*a); - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - tcf_idr_release(*a, bind); - ret = -ENOMEM; - goto out_free; - } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) goto out_free; if (!ovr) { - tcf_idr_release(*a, bind); ret = -EEXIST; - goto out_free; - } - p = to_pedit(*a); - if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - ret = -ENOMEM; - goto out_free; - } + goto out_release; } } else { return err; } + p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); - p->tcfp_flags = parm->flags; - p->tcf_action = parm->action; - if (keys) { + + if (ret == ACT_P_CREATED || + (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { + keys = kmalloc(ksize, GFP_ATOMIC); + if (!keys) { + spin_unlock_bh(&p->tcf_lock); + ret = -ENOMEM; + goto out_release; + } kfree(p->tcfp_keys); p->tcfp_keys = keys; p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; + kfree(p->tcfp_keys_ex); p->tcfp_keys_ex = keys_ex; @@ -232,6 +226,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; + +out_release: + tcf_idr_release(*a, bind); out_free: kfree(keys_ex); return ret; @@ -410,6 +407,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, if (unlikely(!opt)) return -ENOBUFS; + spin_lock_bh(&p->tcf_lock); memcpy(opt->keys, p->tcfp_keys, p->tcfp_nkeys * sizeof(struct tc_pedit_key)); opt->index = p->tcf_index; @@ -432,11 
+430,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; -- cgit v1.2.3 From d7728495665601658c7f94f3b5fa4e3f54d71c18 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:47 +0300 Subject: net: sched: act_sample: remove dependency on rtnl lock Use tcf spinlock to protect private sample action data from concurrent modification during dump and init. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_sample.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 2608ccc83e5e..81071afe1b43 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -80,11 +80,13 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, } s = to_sample(*a); + spin_lock(&s->tcf_lock); s->tcf_action = parm->action; s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { + spin_unlock(&s->tcf_lock); tcf_idr_release(*a, bind); return -ENOMEM; } @@ -94,6 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } + spin_unlock(&s->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -105,7 +108,8 @@ static void tcf_sample_cleanup(struct tc_action *a) struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rtnl_dereference(s->psample_group); + /* last reference to action, no need to lock */ + psample_group = rcu_dereference_protected(s->psample_group, 1); 
RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); @@ -174,12 +178,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, - .action = s->tcf_action, .refcnt = refcount_read(&s->tcf_refcnt) - ref, .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock(&s->tcf_lock); + opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,9 +201,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; + spin_unlock(&s->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 5e48180ed8bcfa60e02887ba801307caf14bbe40 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:48 +0300 Subject: net: sched: act_simple: remove dependency on rtnl lock Use tcf spinlock to protect private simple action data from concurrent modification during dump. (simple init already uses tcf spinlock when changing action state) Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_simple.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index aa51152e0066..18e4452574cd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -156,10 +156,11 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; @@ -167,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From c8814552fe51358f5fc46bc1c4aa4bb68454f4eb Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:49 +0300 Subject: net: sched: act_skbmod: remove dependency on rtnl lock Move read of skbmod_p rcu pointer to be protected by tcf spinlock. Use tcf spinlock to protect private skbmod data from concurrent modification during dump. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_skbmod.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index c437c6d51a71..e9c86ade3b40 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -156,7 +156,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, d = to_skbmod(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { tcf_idr_release(*a, bind); @@ -166,10 +165,10 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, p->flags = lflags; d->tcf_action = parm->action; - p_old = rtnl_dereference(d->skbmod_p); - if (ovr) spin_lock_bh(&d->tcf_lock); + /* Protected by tcf_lock if overwriting existing action. */ + p_old = rcu_dereference_protected(d->skbmod_p, 1); if (lflags & SKBMOD_F_DMAC) ether_addr_copy(p->eth_dst, daddr); @@ -205,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, { struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tcf_skbmod_params *p; struct tc_skbmod opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; + p = rcu_dereference_protected(d->skbmod_p, + lockdep_is_held(&d->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -231,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 729e01260989cc06c8a78491b46545793aef323a Mon Sep 17 00:00:00 2001 From: 
Vlad Buslov Date: Fri, 10 Aug 2018 20:51:50 +0300 Subject: net: sched: act_tunnel_key: remove dependency on rtnl lock Use tcf lock to protect tunnel key action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl lock assertion that is no longer required. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d42d9e112789..ba2ae9f75ef5 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -204,7 +204,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; - struct tcf_tunnel_key_params *params_old; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; struct tc_tunnel_key *parm; @@ -346,24 +345,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, t = to_tunnel_key(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } - - params_old = rtnl_dereference(t->params); - - t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - rcu_assign_pointer(t->params, params_new); - - if (params_old) - kfree_rcu(params_old, rcu); + spin_lock(&t->tcf_lock); + t->tcf_action = parm->action; + rcu_swap_protected(t->params, params_new, + lockdep_is_held(&t->tcf_lock)); + spin_unlock(&t->tcf_lock); + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) 
tcf_idr_insert(tn, *a); @@ -485,12 +482,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .index = t->tcf_index, .refcnt = refcount_read(&t->tcf_refcnt) - ref, .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, - .action = t->tcf_action, }; struct tcf_t tm; - params = rtnl_dereference(t->params); - + spin_lock(&t->tcf_lock); + params = rcu_dereference_protected(t->params, + lockdep_is_held(&t->tcf_lock)); + opt.action = t->tcf_action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -522,10 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; + spin_unlock(&t->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 764e9a24480f6ffba5493fb21e6a7b030d6b8b67 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:51 +0300 Subject: net: sched: act_vlan: remove dependency on rtnl lock Use tcf spinlock to protect vlan action private data from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_vlan.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 15a0ee214c9c..5bde17fe3608 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; - struct tcf_vlan_params *p, *p_old; + struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -202,26 +202,24 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { tcf_idr_release(*a, bind); return -ENOMEM; } - v->tcf_action = parm->action; - - p_old = rtnl_dereference(v->vlan_p); - p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; - rcu_assign_pointer(v->vlan_p, p); + spin_lock(&v->tcf_lock); + v->tcf_action = parm->action; + rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); + spin_unlock(&v->tcf_lock); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -243,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); - struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); + struct tcf_vlan_params *p; struct tc_vlan opt = { .index = v->tcf_index, .refcnt = refcount_read(&v->tcf_refcnt) - ref, .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, - .action = v->tcf_action, - .v_action = p->tcfv_action, }; struct tcf_t t; + spin_lock(&v->tcf_lock); + opt.action = v->tcf_action; + p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto 
nla_put_failure; @@ -268,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; + spin_unlock(&v->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&v->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 84a75b329be84c108a21ab9c02a52a9bf9e5a919 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:52 +0300 Subject: net: sched: extend action ops with put_dev callback As a preparation for removing dependency on rtnl lock from rules update path, all users of shared objects must take reference while working with them. Extend action ops with put_dev() API to be used on net device returned by get_dev(). Modify mirred action (only action that implements get_dev callback): - Take reference to net device in get_dev. - Implement put_dev API that releases reference to net device. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- include/net/act_api.h | 1 + net/sched/act_mirred.c | 12 +++++++++++- net/sched/cls_api.c | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8c9bc02d05e1..1ad5b19e83a9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void (*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + void (*put_dev)(struct net_device *dev); int (*delete)(struct net *net, u32 index); }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b26d060da08e..7a045cc7fe3b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -358,8 +358,17 @@ static struct notifier_block mirred_device_notifier = { static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rtnl_dereference(m->tcfm_dev); + + if (dev) + dev_hold(dev); - return rtnl_dereference(m->tcfm_dev); + return dev; +} + +static void tcf_mirred_put_dev(struct net_device *dev) +{ + dev_put(dev); } static int tcf_mirred_delete(struct net *net, u32 index) @@ -382,6 +391,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .put_dev = tcf_mirred_put_dev, .delete = tcf_mirred_delete, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f922ce27ed5e..31bd1439cf60 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2176,6 +2176,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, if (!dev) continue; ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + a->ops->put_dev(dev); if (ret < 0) return ret; ok_count += ret; -- cgit v1.2.3 From 4e232818bd32b29f15bef532f320a14367d172b4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:53 
+0300 Subject: net: sched: act_mirred: remove dependency on rtnl lock Re-introduce mirred list spinlock, that was removed some time ago, in order to protect it from concurrent modifications, instead of relying on rtnl lock. Use tcf spinlock to protect mirred action private data from concurrent modification in init and dump. Rearrange access to mirred data in order to be performed only while holding the lock. Rearrange net dev access to always hold reference while working with it, instead of relying on rtnl lock. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 78 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7a045cc7fe3b..327be257033d 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,6 +30,7 @@ #include static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { @@ -62,13 +63,23 @@ static bool tcf_mirred_can_reinsert(int action) return false; } +static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) +{ + return rcu_dereference_protected(m->tcfm_dev, + lockdep_is_held(&m->tcf_lock)); +} + static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; + spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); - dev = rtnl_dereference(m->tcfm_dev); + spin_unlock(&mirred_list_lock); + + /* last reference to action, no need to lock */ + dev = rcu_dereference_protected(m->tcfm_dev, 1); if (dev) dev_put(dev); } @@ -128,22 +139,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } - if (parm->ifindex) { - dev = __dev_get_by_index(net, parm->ifindex); - if (dev == NULL) { - if (exists) - tcf_idr_release(*a, bind); - else - tcf_idr_cleanup(tn, parm->index); -
return -ENODEV; - } - mac_header_xmit = dev_is_mac_header_xmit(dev); - } else { - dev = NULL; - } if (!exists) { - if (!dev) { + if (!parm->ifindex) { tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; @@ -161,19 +159,31 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } m = to_mirred(*a); - ASSERT_RTNL(); + spin_lock(&m->tcf_lock); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; - if (dev != NULL) { - if (ret != ACT_P_CREATED) - dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); - dev_hold(dev); - rcu_assign_pointer(m->tcfm_dev, dev); + + if (parm->ifindex) { + dev = dev_get_by_index(net, parm->ifindex); + if (!dev) { + spin_unlock(&m->tcf_lock); + tcf_idr_release(*a, bind); + return -ENODEV; + } + mac_header_xmit = dev_is_mac_header_xmit(dev); + rcu_swap_protected(m->tcfm_dev, dev, + lockdep_is_held(&m->tcf_lock)); + if (dev) + dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } + spin_unlock(&m->tcf_lock); if (ret == ACT_P_CREATED) { + spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); + spin_unlock(&mirred_list_lock); + tcf_idr_insert(tn, *a); } @@ -287,26 +297,33 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); - struct net_device *dev = rtnl_dereference(m->tcfm_dev); struct tc_mirred opt = { .index = m->tcf_index, - .action = m->tcf_action, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, - .eaction = m->tcfm_eaction, - .ifindex = dev ? 
dev->ifindex : 0, }; + struct net_device *dev; struct tcf_t t; + spin_lock(&m->tcf_lock); + opt.action = m->tcf_action; + opt.eaction = m->tcfm_eaction; + dev = tcf_mirred_dev_dereference(m); + if (dev) + opt.ifindex = dev->ifindex; + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; + spin_unlock(&m->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } @@ -337,15 +354,19 @@ static int mirred_device_event(struct notifier_block *unused, ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { + spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { - if (rcu_access_pointer(m->tcfm_dev) == dev) { + spin_lock(&m->tcf_lock); + if (tcf_mirred_dev_dereference(m) == dev) { dev_put(dev); /* Note : no rcu grace period necessary, as * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } + spin_unlock(&m->tcf_lock); } + spin_unlock(&mirred_list_lock); } return NOTIFY_DONE; @@ -358,10 +379,13 @@ static struct notifier_block mirred_device_notifier = { static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); - struct net_device *dev = rtnl_dereference(m->tcfm_dev); + struct net_device *dev; + rcu_read_lock(); + dev = rcu_dereference(m->tcfm_dev); if (dev) dev_hold(dev); + rcu_read_unlock(); return dev; } -- cgit v1.2.3 From 51a9f5ae653979ac4bdbd81778a10431f0177e3c Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:54 +0300 Subject: net: core: protect rate estimator statistics pointer with lock Extend gen_new_estimator() to also take stats_lock when re-assigning rate estimator statistics pointer. (to be used by unlocked actions) Rename 'stats_lock' to 'lock' and change argument description to explain that it is now also used for control path. 
Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/gen_stats.h | 4 ++-- net/core/gen_estimator.c | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 0304ba2ae353..883bb9085f15 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -59,13 +59,13 @@ int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 98fd12721221..e4e442d70c2d 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -112,7 +112,7 @@ static void est_timer(struct timer_list *t) * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path * @running: qdisc running seqcount * @opt: rate estimator configuration TLV * @@ -128,7 +128,7 @@ static void est_timer(struct timer_list *t) int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt) { @@ -154,19 +154,22 @@ int gen_new_estimator(struct gnet_stats_basic_packed 
*bstats, seqcount_init(&est->seq); intvl_log = parm->interval + 2; est->bstats = bstats; - est->stats_lock = stats_lock; + est->stats_lock = lock; est->running = running; est->ewma_log = parm->ewma_log; est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; - if (stats_lock) + if (lock) local_bh_disable(); est_fetch_counters(est, &b); - if (stats_lock) + if (lock) local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; + + if (lock) + spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { del_timer_sync(&old->timer); @@ -179,6 +182,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, mod_timer(&est->timer, est->next_jiffies); rcu_assign_pointer(*rate_est, est); + if (lock) + spin_unlock_bh(lock); if (old) kfree_rcu(old, rcu); return 0; @@ -209,7 +214,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path * @running: qdisc running seqcount (might be NULL) * @opt: rate estimator configuration TLV * @@ -221,11 +226,11 @@ EXPORT_SYMBOL(gen_kill_estimator); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, - stats_lock, running, opt); + lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); -- cgit v1.2.3 From e329bc427395e2d74f2bb685ef3dddda91a6695f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:55 +0300 Subject: net: sched: act_police: remove dependency on rtnl lock Use tcf spinlock to protect police action private data from concurrent modification during dump. 
(init already uses tcf spinlock when changing police action state) Pass tcf spinlock as estimator lock argument to gen_replace_estimator() during action init. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_police.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 1f3192ea8df7..88c16d80c1cf 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -274,14 +274,15 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, - .action = police->tcf_action, - .mtu = police->tcfp_mtu, - .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = refcount_read(&police->tcf_refcnt) - ref, .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock_bh(&police->tcf_lock); + opt.action = police->tcf_action; + opt.mtu = police->tcfp_mtu; + opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); if (police->rate_present) psched_ratecfg_getrate(&opt.rate, &police->rate); if (police->peak_present) @@ -301,10 +302,12 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; + spin_unlock_bh(&police->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&police->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Sat, 11 Aug 2018 20:27:24 +0000 Subject: ip: add helpers to process in-order fragments faster. This patch introduces several helper functions/macros that will be used in the follow-up patch. No runtime changes yet. 
The new logic (fully implemented in the second patch) is as follows: * Nodes in the rb-tree will now contain not single fragments, but lists of consecutive fragments ("runs"). * At each point in time, the current "active" run at the tail is maintained/tracked. Fragments that arrive in-order, adjacent to the previous tail fragment, are added to this tail run without triggering the re-balancing of the rb-tree. * If a fragment arrives out of order with the offset _before_ the tail run, it is inserted into the rb-tree as a single fragment. * If a fragment arrives after the current tail fragment (with a gap), it starts a new "tail" run, and is inserted into the rb-tree at the end as the head of the new run. skb->cb is used to store additional information needed here (suggested by Eric Dumazet). Reported-by: Willem de Bruijn Signed-off-by: Peter Oskolkov Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 6 ++++ net/ipv4/ip_fragment.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b86d14528188..1662cbc0b46b 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -57,7 +57,9 @@ struct frag_v6_compare_key { * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head + * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail + * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far @@ -78,6 +80,7 @@ struct inet_frag_queue { struct sk_buff *fragments; /* Used in IPv6. */ struct rb_root rb_fragments; /* Used in IPv4.
*/ struct sk_buff *fragments_tail; + struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; @@ -113,6 +116,9 @@ void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); +/* Free all skbs in the queue; return the sum of their truesizes. */ +unsigned int inet_frag_rbtree_purge(struct rb_root *root); + static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7cb7ed761d8c..26ace9d2d976 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,6 +57,57 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + struct inet_skb_parm h; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void ip4_frag_init_run(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. 
*/ +static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +{ + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + ip4_frag_init_run(skb); + q->fragments_tail = skb; + q->last_run_head = skb; +} + /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -654,6 +705,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + #ifdef CONFIG_SYSCTL static int dist_min; -- cgit v1.2.3 From a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Sat, 11 Aug 2018 20:27:25 +0000 Subject: ip: process in-order fragments efficiently This patch changes the runtime behavior of IP defrag queue: incoming in-order fragments are added to the end of the current list/"run" of in-order fragments at the tail. On some workloads, UDP stream performance is substantially improved: RX: ./udp_stream -F 10 -T 2 -l 60 TX: ./udp_stream -c -H -F 10 -T 5 -l 60 with this patchset applied on a 10Gbps receiver: throughput=9524.18 throughput_units=Mbit/s upstream (net-next): throughput=4608.93 throughput_units=Mbit/s Reported-by: Willem de Bruijn Signed-off-by: Peter Oskolkov Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. 
Miller --- net/ipv4/inet_fragment.c | 2 +- net/ipv4/ip_fragment.c | 110 +++++++++++++++++++++++++++++------------------ 2 files changed, 70 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6d258a5669e7..bcb11f3a27c0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -146,7 +146,7 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = xp; } while (fp); } else { - sum_truesize = skb_rbtree_purge(&q->rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 26ace9d2d976..88281fbce88c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -126,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, - struct net_device *dev); +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -219,7 +219,12 @@ static void ip_expire(struct timer_list *t) head = skb_rb_first(&qp->q.rb_fragments); if (!head) goto out; - rb_erase(&head->rbnode, &qp->q.rb_fragments); + if (FRAG_CB(head)->next_frag) + rb_replace_node(&head->rbnode, + &FRAG_CB(head)->next_frag->rbnode, + &qp->q.rb_fragments); + else + rb_erase(&head->rbnode, &qp->q.rb_fragments); memset(&head->rbnode, 0, sizeof(head->rbnode)); barrier(); } @@ -320,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; @@ -329,6 +334,7 @@ static int ip_frag_reinit(struct ipq *qp) qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; + qp->q.last_run_head = NULL; qp->iif = 0; 
qp->ecn = 0; @@ -340,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct rb_node **rbn, *parent; - struct sk_buff *skb1; + struct sk_buff *skb1, *prev_tail; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -418,38 +424,41 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) */ /* Find out where to put this fragment. */ - skb1 = qp->q.fragments_tail; - if (!skb1) { - /* This is the first fragment we've received. */ - rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); - qp->q.fragments_tail = skb; - } else if ((skb1->ip_defrag_offset + skb1->len) < end) { - /* This is the common/special case: skb goes to the end. */ + prev_tail = qp->q.fragments_tail; + if (!prev_tail) + ip4_frag_create_run(&qp->q, skb); /* First fragment. */ + else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { + /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ - if (offset < (skb1->ip_defrag_offset + skb1->len)) + if (offset < prev_tail->ip_defrag_offset + prev_tail->len) goto discard_qp; - /* Insert after skb1. */ - rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); - qp->q.fragments_tail = skb; + if (offset == prev_tail->ip_defrag_offset + prev_tail->len) + ip4_frag_append_to_last_run(&qp->q, skb); + else + ip4_frag_create_run(&qp->q, skb); } else { - /* Binary search. Note that skb can become the first fragment, but - * not the last (covered above). */ + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ rbn = &qp->q.rb_fragments.rb_node; do { parent = *rbn; skb1 = rb_to_skb(parent); if (end <= skb1->ip_defrag_offset) rbn = &parent->rb_left; - else if (offset >= skb1->ip_defrag_offset + skb1->len) + else if (offset >= skb1->ip_defrag_offset + + FRAG_CB(skb1)->frag_run_len) rbn = &parent->rb_right; else /* Found an overlap with skb1. 
*/ goto discard_qp; } while (*rbn); /* Here we have parent properly set, and rbn pointing to - * one of its NULL left/right children. Insert skb. */ + * one of its NULL left/right children. Insert skb. + */ + ip4_frag_init_run(skb); rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); } - rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); if (dev) qp->iif = dev->ifindex; @@ -476,7 +485,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, skb, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; return err; } @@ -495,7 +504,7 @@ err: /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; @@ -519,10 +528,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, fp = skb_clone(skb, GFP_ATOMIC); if (!fp) goto out_nomem; - rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(prev_tail)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &qp->q.rb_fragments); if (qp->q.fragments_tail == skb) qp->q.fragments_tail = fp; skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, &qp->q.rb_fragments); consume_skb(head); @@ -558,7 +573,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; - skb->truesize += clone->truesize; + head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; 
add_frag_mem_limit(qp->q.net, clone->truesize); @@ -571,24 +586,36 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, skb_push(head, head->data - skb_network_header(head)); /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; rbn = rb_next(&head->rbnode); rb_erase(&head->rbnode, &qp->q.rb_fragments); - while (rbn) { - struct rb_node *rbnext = rb_next(rbn); - fp = rb_to_skb(rbn); - rb_erase(rbn, &qp->q.rb_fragments); - rbn = rbnext; - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + fp = FRAG_CB(fp)->next_frag; + } + /* Move to the next run. 
*/ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &qp->q.rb_fragments); + rbn = rbnext; + } } sub_frag_mem_limit(qp->q.net, head->truesize); @@ -624,6 +651,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; + qp->q.last_run_head = NULL; return 0; out_nomem: -- cgit v1.2.3 From 7723628101aaeb1d723786747529b4ea65c5b5c5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sun, 12 Aug 2018 10:49:27 -0700 Subject: bpf: Introduce bpf_skb_ancestor_cgroup_id helper == Problem description == It's useful to be able to identify cgroup associated with skb in TC so that a policy can be applied to this skb, and existing bpf_skb_cgroup_id helper can help with this. Though in real life cgroup hierarchy and hierarchy to apply a policy to don't map 1:1. It's often the case that there is a container and corresponding cgroup, but there are many more sub-cgroups inside container, e.g. because it's delegated to containerized application to control resources for its subsystems, or to separate application inside container from infra that belongs to containerization system (e.g. sshd). At the same time it may be useful to apply a policy to container as a whole. If multiple containers like this are run on a host (what is often the case) and many of them have sub-cgroups, it may not be possible to apply per-container policy in TC with existing helpers such as bpf_skb_under_cgroup or bpf_skb_cgroup_id: * bpf_skb_cgroup_id will return id of immediate cgroup associated with skb, i.e. if it's a sub-cgroup inside container, it can't be used to identify container's cgroup; * bpf_skb_under_cgroup can work only with one cgroup and doesn't scale, i.e. if there are N containers on a host and a policy has to be applied to M of them (0 <= M <= N), it'd require M calls to bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild & load new BPF program. 
== Solution == The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be used to get id of cgroup v2 that is an ancestor of cgroup associated with skb at specified level of cgroup hierarchy. That way admin can place all containers on one level of cgroup hierarchy (which is a good practice in general and already used in many configurations) and identify specific cgroup on this level no matter what sub-cgroup skb is associated with. E.g. if there is a cgroup hierarchy: root/ root/container1/ root/container1/app11/ root/container1/app11/sub-app-a/ root/container1/app12/ root/container2/ root/container2/app21/ root/container2/app22/ root/container2/app22/sub-app-b/ , then having skb associated with root/container1/app11/sub-app-a/ it's possible to get ancestor at level 1, which is container1, and apply policy for this container, or apply another policy if it's container2. Policies can be kept e.g. in a hash map where key is a container cgroup id and value is an action. Levels where container cgroups are created are usually known in advance, whereas the cgroup hierarchy inside container may be hard to predict especially in case when its creation is delegated to containerized application. == Implementation details == The helper gets ancestor by walking parents up to specified level. Another option would be to get different kind of "id" from cgroup->ancestor_ids[level] and use it with idr_find() to get struct cgroup for ancestor. But that would require a radix lookup, which doesn't seem to be better (at least it's not obviously better). Format of return value of the new helper is the same as that of bpf_skb_cgroup_id.
Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- include/linux/cgroup.h | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9fdf6f57913..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level. + * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, + int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; + ptr && ptr->level > ancestor_level; + ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3102a2a23c31..66917a4eba27 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2093,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. 
The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based @@ -2207,7 +2225,8 @@ union bpf_attr { FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ FN(get_local_storage), \ - FN(sk_select_reuseport), + FN(sk_select_reuseport), \ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 22906b31d43f..15b9d2df92ca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3778,6 +3778,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *ancestor; + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return ancestor->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4966,6 
+4992,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3 From 0b243d004ea640875115d1500ec429a3e9f9fae9 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Fri, 10 Aug 2018 20:46:41 +0530 Subject: net/tls: Combined memory allocation for decryption request For preparing decryption request, several memory chunks are required (aead_req, sgin, sgout, iv, aad). For submitting the decrypt request to an accelerator, it is required that the buffers which are read by the accelerator must be dma-able and not come from stack. The buffers for aad and iv can be separately kmalloced each, but it is inefficient. This patch does a combined allocation for preparing decryption request and then segments into aead_req || sgin || sgout || aad || iv. Signed-off-by: Vakul Garg Signed-off-by: David S.
Miller --- include/net/tls.h | 4 - net/tls/tls_sw.c | 238 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 142 insertions(+), 100 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index d8b3b6578c01..d5c683e8bb22 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -124,10 +124,6 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 control; bool decrypted; - - char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE]; - char rx_aad_plaintext[TLS_AAD_SPACE_SIZE]; - }; struct tls_record_info { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 83d67df33f0c..52fbe727d7c1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -48,19 +48,13 @@ static int tls_do_decryption(struct sock *sk, struct scatterlist *sgout, char *iv_recv, size_t data_len, - struct sk_buff *skb, - gfp_t flags) + struct aead_request *aead_req) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct aead_request *aead_req; - int ret; - aead_req = aead_request_alloc(ctx->aead_recv, flags); - if (!aead_req) - return -ENOMEM; - + aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -69,8 +63,6 @@ static int tls_do_decryption(struct sock *sk, crypto_req_done, &ctx->async_wait); ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - - aead_request_free(aead_req); return ret; } @@ -657,8 +649,132 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +/* This function decrypts the input skb into either out_iov or in out_sg + * or in skb buffers itself. The input parameter 'zc' indicates if + * zero-copy mode needs to be tried or not. With zero-copy mode, either + * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are + * NULL, then the decryption happens inside skb buffers itself, i.e. 
+ * zero-copy gets disabled and 'zc' is updated. + */ + +static int decrypt_internal(struct sock *sk, struct sk_buff *skb, + struct iov_iter *out_iov, + struct scatterlist *out_sg, + int *chunk, bool *zc) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0; + struct aead_request *aead_req; + struct sk_buff *unused; + u8 *aad, *iv, *mem = NULL; + struct scatterlist *sgin = NULL; + struct scatterlist *sgout = NULL; + const int data_len = rxm->full_len - tls_ctx->rx.overhead_size; + + if (*zc && (out_iov || out_sg)) { + if (out_iov) + n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1; + else + n_sgout = sg_nents(out_sg); + } else { + n_sgout = 0; + *zc = false; + } + + n_sgin = skb_cow_data(skb, 0, &unused); + if (n_sgin < 1) + return -EBADMSG; + + /* Increment to accommodate AAD */ + n_sgin = n_sgin + 1; + + nsg = n_sgin + n_sgout; + + aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); + mem_size = aead_size + (nsg * sizeof(struct scatterlist)); + mem_size = mem_size + TLS_AAD_SPACE_SIZE; + mem_size = mem_size + crypto_aead_ivsize(ctx->aead_recv); + + /* Allocate a single block of memory which contains + * aead_req || sgin[] || sgout[] || aad || iv. + * This order achieves correct alignment for aead_req, sgin, sgout. 
+ */ + mem = kmalloc(mem_size, sk->sk_allocation); + if (!mem) + return -ENOMEM; + + /* Segment the allocated memory */ + aead_req = (struct aead_request *)mem; + sgin = (struct scatterlist *)(mem + aead_size); + sgout = sgin + n_sgin; + aad = (u8 *)(sgout + n_sgout); + iv = aad + TLS_AAD_SPACE_SIZE; + + /* Prepare IV */ + err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (err < 0) { + kfree(mem); + return err; + } + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + + /* Prepare AAD */ + tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size, + ctx->control); + + /* Prepare sgin */ + sg_init_table(sgin, n_sgin); + sg_set_buf(&sgin[0], aad, TLS_AAD_SPACE_SIZE); + err = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + if (err < 0) { + kfree(mem); + return err; + } + + if (n_sgout) { + if (out_iov) { + sg_init_table(sgout, n_sgout); + sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); + + *chunk = 0; + err = zerocopy_from_iter(sk, out_iov, data_len, &pages, + chunk, &sgout[1], + (n_sgout - 1), false); + if (err < 0) + goto fallback_to_reg_recv; + } else if (out_sg) { + memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); + } else { + goto fallback_to_reg_recv; + } + } else { +fallback_to_reg_recv: + sgout = sgin; + pages = 0; + *chunk = 0; + *zc = false; + } + + /* Prepare and submit AEAD request */ + err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req); + + /* Release the pages in case iov was mapped to pages */ + for (; pages > 0; pages--) + put_page(sg_page(&sgout[pages])); + + kfree(mem); + return err; +} + static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout, bool *zc) + struct iov_iter *dest, int *chunk, bool *zc) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = 
tls_sw_ctx_rx(tls_ctx); @@ -671,7 +787,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, return err; #endif if (!ctx->decrypted) { - err = decrypt_skb(sk, skb, sgout); + err = decrypt_internal(sk, skb, dest, NULL, chunk, zc); if (err < 0) return err; } else { @@ -690,54 +806,10 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; - struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; - struct scatterlist *sgin = &sgin_arr[0]; - struct strp_msg *rxm = strp_msg(skb); - int ret, nsg = ARRAY_SIZE(sgin_arr); - struct sk_buff *unused; - - ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, - iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - tls_ctx->rx.iv_size); - if (ret < 0) - return ret; - - memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - if (!sgout) { - nsg = skb_cow_data(skb, 0, &unused) + 1; - sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); - sgout = sgin; - } - - sg_init_table(sgin, nsg); - sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE); - - nsg = skb_to_sgvec(skb, &sgin[1], - rxm->offset + tls_ctx->rx.prepend_size, - rxm->full_len - tls_ctx->rx.prepend_size); - if (nsg < 0) { - ret = nsg; - goto out; - } - - tls_make_aad(ctx->rx_aad_ciphertext, - rxm->full_len - tls_ctx->rx.overhead_size, - tls_ctx->rx.rec_seq, - tls_ctx->rx.rec_seq_size, - ctx->control); - - ret = tls_do_decryption(sk, sgin, sgout, iv, - rxm->full_len - tls_ctx->rx.overhead_size, - skb, sk->sk_allocation); - -out: - if (sgin != &sgin_arr[0]) - kfree(sgin); + bool zc = true; + int chunk; - return ret; + return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc); } static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, @@ -816,43 +888,17 @@ int 
tls_sw_recvmsg(struct sock *sk, } if (!ctx->decrypted) { - int page_count; - int to_copy; - - page_count = iov_iter_npages(&msg->msg_iter, - MAX_SKB_FRAGS); - to_copy = rxm->full_len - tls_ctx->rx.overhead_size; - if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS && - likely(!(flags & MSG_PEEK))) { - struct scatterlist sgin[MAX_SKB_FRAGS + 1]; - int pages = 0; + int to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (!is_kvec && to_copy <= len && + likely(!(flags & MSG_PEEK))) zc = true; - sg_init_table(sgin, MAX_SKB_FRAGS + 1); - sg_set_buf(&sgin[0], ctx->rx_aad_plaintext, - TLS_AAD_SPACE_SIZE); - - err = zerocopy_from_iter(sk, &msg->msg_iter, - to_copy, &pages, - &chunk, &sgin[1], - MAX_SKB_FRAGS, false); - if (err < 0) - goto fallback_to_reg_recv; - - err = decrypt_skb_update(sk, skb, sgin, &zc); - for (; pages > 0; pages--) - put_page(sg_page(&sgin[pages])); - if (err < 0) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } - } else { -fallback_to_reg_recv: - err = decrypt_skb_update(sk, skb, NULL, &zc); - if (err < 0) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } + + err = decrypt_skb_update(sk, skb, &msg->msg_iter, + &chunk, &zc); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; } ctx->decrypted = true; } @@ -903,7 +949,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; - bool zc; + bool zc = false; lock_sock(sk); @@ -920,7 +966,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL, &zc); + err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); -- cgit v1.2.3 From e6f86b0f7ae473969a3301b74bf98af9e42ecd0e Mon Sep 17 00:00:00 2001 From: Virgile Jarry Date: Fri, 10 Aug 2018 17:48:15 +0200 Subject: ipv6: Add icmp_echo_ignore_all support for ICMPv6 Preventing the kernel from responding to ICMP Echo Requests messages can be useful in several ways. 
The sysctl parameter 'icmp_echo_ignore_all' can be used to prevent the kernel from responding to IPv4 ICMP echo requests. For IPv6 pings, such a sysctl kernel parameter did not exist. Add the ability to prevent the kernel from responding to IPv6 ICMP echo requests through the use of the following sysctl parameter : /proc/sys/net/ipv6/icmp/echo_ignore_all. Update the documentation to reflect this change. Signed-off-by: Virgile Jarry Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/netns/ipv6.h | 1 + include/uapi/linux/sysctl.h | 3 ++- net/ipv6/af_inet6.c | 1 + net/ipv6/icmp.c | 16 +++++++++++++--- 5 files changed, 22 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e74515ecaa9c..8313a636dd53 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1882,6 +1882,11 @@ ratelimit - INTEGER otherwise the minimal space between responses in milliseconds. Default: 1000 +echo_ignore_all - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol. + Default: 0 + xfrm6_gc_thresh - INTEGER The threshold at which we will start garbage collecting for IPv6 destination cache entries. 
At twice this value the system will diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 762ac9931b62..f0e396ab9bec 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -32,6 +32,7 @@ struct netns_sysctl_ipv6 { int flowlabel_consistency; int auto_flowlabels; int icmpv6_time; + int icmpv6_echo_ignore_all; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 6b58371b1f0d..d71013fffaf6 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -575,7 +575,8 @@ enum { /* /proc/sys/net/ipv6/icmp */ enum { - NET_IPV6_ICMP_RATELIMIT=1 + NET_IPV6_ICMP_RATELIMIT = 1, + NET_IPV6_ICMP_ECHO_IGNORE_ALL = 2 }; /* /proc/sys/net//neigh/ */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 020f6e14a7af..673bba31eb18 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -832,6 +832,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7f6b1f81c200..c9c53ade55c3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -794,6 +794,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -843,7 +844,8 @@ static int icmpv6_rcv(struct sk_buff *skb) switch (type) { case ICMPV6_ECHO_REQUEST: - icmpv6_echo_reply(skb); + if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) + icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: @@ -1104,6 +1106,13 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { 
+ .procname = "echo_ignore_all", + .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { }, }; @@ -1115,9 +1124,10 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) sizeof(ipv6_icmp_table_template), GFP_KERNEL); - if (table) + if (table) { table[0].data = &net->ipv6.sysctl.icmpv6_time; - + table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; + } return table; } #endif -- cgit v1.2.3 From 6d37fa49da1e8db8fb1995be22ac837ca41ac8a8 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 10 Aug 2018 11:14:56 -0700 Subject: l2tp: use sk_dst_check() to avoid race on sk->sk_dst_cache In l2tp code, if it is a L2TP_UDP_ENCAP tunnel, tunnel->sk points to a UDP socket. User could call sendmsg() on both this tunnel and the UDP socket itself concurrently. As l2tp_xmit_skb() holds the socket lock and calls __sk_dst_check() to refresh sk->sk_dst_cache, while udpv6_sendmsg() is lockless and calls sk_dst_check() to refresh sk->sk_dst_cache, there could be a race that causes the dst cache to be freed multiple times. So we fix l2tp side code to always call sk_dst_check() to guarantee xchg() is called when refreshing sk->sk_dst_cache to avoid race conditions. 
Syzkaller reported stack trace: BUG: KASAN: use-after-free in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] BUG: KASAN: use-after-free in atomic_fetch_add_unless include/linux/atomic.h:575 [inline] BUG: KASAN: use-after-free in atomic_add_unless include/linux/atomic.h:597 [inline] BUG: KASAN: use-after-free in dst_hold_safe include/net/dst.h:308 [inline] BUG: KASAN: use-after-free in ip6_hold_safe+0xe6/0x670 net/ipv6/route.c:1029 Read of size 4 at addr ffff8801aea9a880 by task syz-executor129/4829 CPU: 0 PID: 4829 Comm: syz-executor129 Not tainted 4.18.0-rc7-next-20180802+ #30 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] atomic_fetch_add_unless include/linux/atomic.h:575 [inline] atomic_add_unless include/linux/atomic.h:597 [inline] dst_hold_safe include/net/dst.h:308 [inline] ip6_hold_safe+0xe6/0x670 net/ipv6/route.c:1029 rt6_get_pcpu_route net/ipv6/route.c:1249 [inline] ip6_pol_route+0x354/0xd20 net/ipv6/route.c:1922 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2098 fib6_rule_lookup+0x283/0x890 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2126 ip6_dst_lookup_tail+0x1278/0x1da0 net/ipv6/ip6_output.c:978 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1079 ip6_sk_dst_lookup_flow+0x5ed/0xc50 net/ipv6/ip6_output.c:1117 udpv6_sendmsg+0x2163/0x36b0 net/ipv6/udp.c:1354 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:622 [inline] sock_sendmsg+0xd5/0x120 
net/socket.c:632 ___sys_sendmsg+0x51d/0x930 net/socket.c:2115 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2210 __do_sys_sendmmsg net/socket.c:2239 [inline] __se_sys_sendmmsg net/socket.c:2236 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2236 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446a29 Code: e8 ac b8 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4de5532db8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00000000006dcc38 RCX: 0000000000446a29 RDX: 00000000000000b8 RSI: 0000000020001b00 RDI: 0000000000000003 RBP: 00000000006dcc30 R08: 00007f4de5533700 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dcc3c R13: 00007ffe2b830fdf R14: 00007f4de55339c0 R15: 0000000000000001 Fixes: 71b1391a4128 ("l2tp: ensure sk->dst is still valid") Reported-by: syzbot+05f840f3b04f211bad55@syzkaller.appspotmail.com Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Cc: Guillaume Nault Cc: David Ahern Cc: Cong Wang Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 2bd701a58aa6..82cdf9020b53 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1098,7 +1098,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len /* Get routing info from the tunnel socket */ skb_dst_drop(skb); - skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0))); + skb_dst_set(skb, sk_dst_check(sk, 0)); inet = inet_sk(sk); fl = &inet->cork.fl; -- cgit v1.2.3 From 962ad1f937d86456e88d8cbcd93766746297f711 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:49 -0400 Subject: net: sched: act_connmark method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_connmark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2f9bc833d046..54c0bf54f2ac 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -31,8 +31,8 @@ static unsigned int connmark_net_id; static struct tc_action_ops act_connmark_ops; -static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; @@ -209,7 +209,7 @@ static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, .owner = THIS_MODULE, - .act = tcf_connmark, + .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, .walk = tcf_connmark_walker, -- cgit v1.2.3 From 2fbec27f816bd95ebc468d17a784b20a72b95896 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:50 -0400 Subject: net: sched: act_bpf method rename for grep-ability and consistency 
Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9e8a33f9fee3..9b30e62805c7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -34,8 +34,8 @@ struct tcf_bpf_cfg { static unsigned int bpf_net_id; static struct tc_action_ops act_bpf_ops; -static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, - struct tcf_result *res) +static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, + struct tcf_result *res) { bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); @@ -406,7 +406,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, .owner = THIS_MODULE, - .act = tcf_bpf, + .act = tcf_bpf_act, .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, -- cgit v1.2.3 From c831549c3f53537e3b8a205c5d67cbc16c054f6a Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:51 -0400 Subject: net: sched: act_sum method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_csum.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index f01c59ba6d12..5596fae4e478 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -555,8 +555,8 @@ fail: return 0; } -static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); struct tcf_csum_params *params; @@ -670,7 +670,7 @@ static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, .owner = THIS_MODULE, - .act = tcf_csum, + .act = tcf_csum_act, .dump = tcf_csum_dump, .init = tcf_csum_init, .cleanup = tcf_csum_cleanup, -- cgit v1.2.3 From 1740005e2a0cae1ba87e3efe2690755fe169837b Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:52 -0400 Subject: net: sched: act_gact method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_gact.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index bfccd34a3968..52a3e474d822 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -133,8 +133,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, return ret; } -static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); @@ -254,7 +254,7 @@ static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, .owner = THIS_MODULE, - .act = tcf_gact, + .act = tcf_gact_act, .stats_update = tcf_gact_stats_update, .dump = tcf_gact_dump, .init = tcf_gact_init, -- cgit v1.2.3 From 11b9695b3ff06990333d77963607944574953c98 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:53 -0400 Subject: net: sched: act_ipt method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_ipt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index e149f0e66cb6..51f235bbeb5b 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -222,8 +222,8 @@ static int tcf_xt_init(struct net *net, struct nlattr *nla, bind); } -static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *ipt = to_ipt(a); @@ -348,7 +348,7 @@ static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, .owner = THIS_MODULE, - .act = tcf_ipt, + .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, @@ -406,7 +406,7 @@ static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, .owner = THIS_MODULE, - .act = tcf_ipt, + .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_xt_init, -- cgit v1.2.3 From 0390514fe15501e9ddc4e87bdeed35fec9fc4802 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:54 -0400 Subject: net: sched: act_nat method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_nat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4dd9188a72fd..822e903bfc25 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -93,8 +93,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return ret; } -static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; @@ -311,7 +311,7 @@ static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, .owner = THIS_MODULE, - .act = tcf_nat, + .act = tcf_nat_act, .dump = tcf_nat_dump, .init = tcf_nat_init, .walk = tcf_nat_walker, -- cgit v1.2.3 From 6a2b401cd17d41944672563b2edf65cdef44c242 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:55 -0400 Subject: net: sched: act_pedit method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3f62da72ab6a..8a7a7cb94e83 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -288,8 +288,8 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb, return ret; } -static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); int i; @@ -471,7 +471,7 @@ static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, .owner = THIS_MODULE, - .act = tcf_pedit, + .act = tcf_pedit_act, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, -- cgit v1.2.3 From 2ac063474dc738700eab3425d1f7c4ad98776bcd Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:56 -0400 Subject: net: sched: act_police method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_police.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 88c16d80c1cf..06f0742db593 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -56,7 +56,7 @@ struct tc_police_compat { static unsigned int police_net_id; static struct tc_action_ops act_police_ops; -static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, +static int tcf_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) @@ -73,7 +73,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_init(struct net *net, struct nlattr *nla, +static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) @@ -203,7 +203,7 @@ failure: return err; } -static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, +static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); @@ -267,7 +267,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcf_action; } -static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, +static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); @@ -335,10 +335,10 @@ static struct tc_action_ops act_police_ops = { .kind = "police", .type = TCA_ID_POLICE, .owner = THIS_MODULE, - .act = tcf_act_police, - .dump = tcf_act_police_dump, - .init = tcf_act_police_init, - .walk = tcf_act_police_walker, + .act = tcf_police_act, + .dump = tcf_police_dump, + .init = tcf_police_init, + .walk = tcf_police_walker, .lookup = 
tcf_police_search, .delete = tcf_police_delete, .size = sizeof(struct tcf_police), -- cgit v1.2.3 From 798de374e50309dba39cee16527cb3534e84ba86 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:57 -0400 Subject: net: sched: act_simple method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_simple.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 18e4452574cd..e616523ba3c1 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -28,8 +28,8 @@ static unsigned int simp_net_id; static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 -static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_defact *d = to_defact(a); @@ -207,7 +207,7 @@ static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, .owner = THIS_MODULE, - .act = tcf_simp, + .act = tcf_simp_act, .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, -- cgit v1.2.3 From 45da1dac612c0658ed946d573000e88f0d8ec5bc Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:58 -0400 Subject: net: sched: act_skbedit method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_skbedit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index a6db47ebec11..926d7bc4a89d 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -33,8 +33,8 @@ static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; -static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; @@ -310,7 +310,7 @@ static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, .owner = THIS_MODULE, - .act = tcf_skbedit, + .act = tcf_skbedit_act, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, -- cgit v1.2.3 From 353d2c253f4cc5c7b28a041a79949f46ed5edb25 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:59 -0400 Subject: net: sched: act_skbmod method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_skbmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e9c86ade3b40..d6a1af0c4171 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -24,7 +24,7 @@ static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; #define MAX_EDIT_LEN ETH_HLEN -static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, +static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); @@ -270,7 +270,7 @@ static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, .owner = THIS_MODULE, - .act = tcf_skbmod_run, + .act = tcf_skbmod_act, .dump = tcf_skbmod_dump, .init = tcf_skbmod_init, .cleanup = tcf_skbmod_cleanup, -- cgit v1.2.3 From 8aa7f22e5649db6f994cda1212a37e7f8ae4e63e Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:35:00 -0400 Subject: net: sched: act_vlan method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_vlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 5bde17fe3608..d1f5028384c9 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -22,8 +22,8 @@ static unsigned int vlan_net_id; static struct tc_action_ops act_vlan_ops; -static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; @@ -307,7 +307,7 @@ static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, .owner = THIS_MODULE, - .act = tcf_vlan, + .act = tcf_vlan_act, .dump = tcf_vlan_dump, .init = tcf_vlan_init, .cleanup = tcf_vlan_cleanup, -- cgit v1.2.3 From 7c5790c4da0e5b96b147d683c94ba3ed93e5f0fe Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:35:01 -0400 Subject: net: sched: act_mirred method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_mirred.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 327be257033d..8ec216001077 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -190,8 +190,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; } -static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); struct sk_buff *skb2 = skb; @@ -406,7 +406,7 @@ static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, .owner = THIS_MODULE, - .act = tcf_mirred, + .act = tcf_mirred_act, .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, -- cgit v1.2.3 From 71e41286203c017d24f041a7cd71abea7ca7b1e0 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 13 Aug 2018 10:42:46 +0800 Subject: packet: switch kvzalloc to allocate memory This patch includes the following changes: *Use modern kvzalloc()/kvfree() instead of custom allocations. *Remove the order argument from alloc_pg_vec; it can be derived from req. *Remove the order argument from free_pg_vec; free_pg_vec now uses kvfree, which does not need an order argument. *Remove pg_vec_order from struct packet_ring_buffer; there is no longer a need to save/restore 'order'. *Remove the variable 'order' from packet_set_ring; it is now unused. Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 44 +++++++++++++------------------------------- net/packet/internal.h | 1 - 2 files changed, 13 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..5610061e7f2e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4137,52 +4137,36 @@ static const struct vm_operations_struct packet_mmap_ops = { .close = packet_mm_close, }; -static void free_pg_vec(struct pgv *pg_vec, unsigned int order, - unsigned int len) +static void free_pg_vec(struct pgv *pg_vec, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { - if (is_vmalloc_addr(pg_vec[i].buffer)) - vfree(pg_vec[i].buffer); - else - free_pages((unsigned long)pg_vec[i].buffer, - order); + kvfree(pg_vec[i].buffer); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } -static char *alloc_one_pg_vec_page(unsigned long order) +static char *alloc_one_pg_vec_page(unsigned long size) { char *buffer; - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | - __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; - buffer = (char *) __get_free_pages(gfp_flags, order); + buffer = kvzalloc(size, GFP_KERNEL); if (buffer) return buffer; - /* __get_free_pages failed, fall back to vmalloc */ - buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); - if (buffer) - return buffer; + buffer = kvzalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); - /* vmalloc failed, lets dig into swap here */ - gfp_flags &= ~__GFP_NORETRY; - buffer = (char *) __get_free_pages(gfp_flags, order); - if (buffer) - return buffer; - - /* complete and utter failure */ - return NULL; + return buffer; } -static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) +static struct pgv *alloc_pg_vec(struct tpacket_req *req) { unsigned int block_nr = req->tp_block_nr; + unsigned long size = req->tp_block_size; struct pgv *pg_vec; int i; @@ -4191,7 +4175,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) goto out; 
for (i = 0; i < block_nr; i++) { - pg_vec[i].buffer = alloc_one_pg_vec_page(order); + pg_vec[i].buffer = alloc_one_pg_vec_page(size); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } @@ -4200,7 +4184,7 @@ out: return pg_vec; out_free_pgvec: - free_pg_vec(pg_vec, order, block_nr); + free_pg_vec(pg_vec, block_nr); pg_vec = NULL; goto out; } @@ -4210,9 +4194,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); - int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; + int was_running; __be16 num; int err = -EINVAL; /* Added to avoid minimal code churn */ @@ -4274,8 +4258,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; err = -ENOMEM; - order = get_order(req->tp_block_size); - pg_vec = alloc_pg_vec(req, order); + pg_vec = alloc_pg_vec(req); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { @@ -4329,7 +4312,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); - swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; @@ -4355,7 +4337,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } if (pg_vec) - free_pg_vec(pg_vec, order, req->tp_block_nr); + free_pg_vec(pg_vec, req->tp_block_nr); out: return err; } diff --git a/net/packet/internal.h b/net/packet/internal.h index 3bb7c5fb3bff..8f50036f62f0 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -64,7 +64,6 @@ struct packet_ring_buffer { unsigned int frame_size; unsigned int frame_max; - unsigned int pg_vec_order; unsigned int pg_vec_pages; unsigned int pg_vec_len; -- cgit v1.2.3 From 42c625a486f367ad57d4257de6d9459daf9484a0 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 13 Aug 2018 20:20:11 +0300 Subject: net: sched: act_ife: disable bh when 
taking ife_mod_lock Lockdep reports a deadlock for the following locking scenario in the ife action: Task one: 1) Executes ife action update. 2) Takes tcfa_lock. 3) Waits on ife_mod_lock which is already taken by task two. Task two: 1) Executes any path that obtains ife_mod_lock without disabling bh (any path that takes ife_mod_lock while holding tcfa_lock has bh disabled) like loading a meta module, or creating a new action. 2) Takes ife_mod_lock. 3) Task is preempted by rate estimator timer. 4) Timer callback waits on tcfa_lock which is taken by task one. In the described case the tasks deadlock because they take the same two locks in a different order. To prevent the potential deadlock reported by lockdep, always disable bh when obtaining ife_mod_lock. Lockdep warning: [ 508.101192] ===================================================== [ 508.107708] WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected [ 508.114728] 4.18.0-rc8+ #646 Not tainted [ 508.119050] ----------------------------------------------------- [ 508.125559] tc/5460 [HC0[0]:SC0[2]:HE1:SE0] is trying to acquire: [ 508.132025] 000000005a938c68 (ife_mod_lock){++++}, at: find_ife_oplist+0x1e/0xc0 [act_ife] [ 508.140996] and this task is already holding: [ 508.147548] 00000000d46f6c56 (&(&p->tcfa_lock)->rlock){+.-.}, at: tcf_ife_init+0x6ae/0xf40 [act_ife] [ 508.157371] which would create a new lock dependency: [ 508.162828] (&(&p->tcfa_lock)->rlock){+.-.} -> (ife_mod_lock){++++} [ 508.169572] but this new dependency connects a SOFTIRQ-irq-safe lock: [ 508.178197] (&(&p->tcfa_lock)->rlock){+.-.} [ 508.178201] ...
which became SOFTIRQ-irq-safe at: [ 508.189771] _raw_spin_lock+0x2c/0x40 [ 508.193906] est_fetch_counters+0x41/0xb0 [ 508.198391] est_timer+0x83/0x3c0 [ 508.202180] call_timer_fn+0x16a/0x5d0 [ 508.206400] run_timer_softirq+0x399/0x920 [ 508.210967] __do_softirq+0x157/0x97d [ 508.215102] irq_exit+0x152/0x1c0 [ 508.218888] smp_apic_timer_interrupt+0xc0/0x4e0 [ 508.223976] apic_timer_interrupt+0xf/0x20 [ 508.228540] cpuidle_enter_state+0xf8/0x5d0 [ 508.233198] do_idle+0x28a/0x350 [ 508.236881] cpu_startup_entry+0xc7/0xe0 [ 508.241296] start_secondary+0x2e8/0x3f0 [ 508.245678] secondary_startup_64+0xa5/0xb0 [ 508.250347] to a SOFTIRQ-irq-unsafe lock: (ife_mod_lock){++++} [ 508.256531] ... which became SOFTIRQ-irq-unsafe at: [ 508.267279] ... [ 508.267283] _raw_write_lock+0x2c/0x40 [ 508.273653] register_ife_op+0x118/0x2c0 [act_ife] [ 508.278926] do_one_initcall+0xf7/0x4d9 [ 508.283214] do_init_module+0x18b/0x44e [ 508.287521] load_module+0x4167/0x5730 [ 508.291739] __do_sys_finit_module+0x16d/0x1a0 [ 508.296654] do_syscall_64+0x7a/0x3f0 [ 508.300788] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.306302] other info that might help us debug this: [ 508.315286] Possible interrupt unsafe locking scenario: [ 508.322771] CPU0 CPU1 [ 508.327681] ---- ---- [ 508.332604] lock(ife_mod_lock); [ 508.336300] local_irq_disable(); [ 508.342608] lock(&(&p->tcfa_lock)->rlock); [ 508.349793] lock(ife_mod_lock); [ 508.355990] [ 508.358974] lock(&(&p->tcfa_lock)->rlock); [ 508.363803] *** DEADLOCK *** [ 508.370715] 2 locks held by tc/5460: [ 508.374680] #0: 00000000e27e4fa4 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x583/0x7b0 [ 508.383366] #1: 00000000d46f6c56 (&(&p->tcfa_lock)->rlock){+.-.}, at: tcf_ife_init+0x6ae/0xf40 [act_ife] [ 508.393648] the dependencies between SOFTIRQ-irq-safe lock and the holding lock: [ 508.403505] -> (&(&p->tcfa_lock)->rlock){+.-.} ops: 1001553 { [ 508.409646] HARDIRQ-ON-W at: [ 508.413136] _raw_spin_lock_bh+0x34/0x40 [ 508.419059] 
gnet_stats_start_copy_compat+0xa2/0x230 [ 508.426021] gnet_stats_start_copy+0x16/0x20 [ 508.432333] tcf_action_copy_stats+0x95/0x1d0 [ 508.438735] tcf_action_dump_1+0xb0/0x4e0 [ 508.444795] tcf_action_dump+0xca/0x200 [ 508.450673] tcf_exts_dump+0xd9/0x320 [ 508.456392] fl_dump+0x1b7/0x4a0 [cls_flower] [ 508.462798] tcf_fill_node+0x380/0x530 [ 508.468601] tfilter_notify+0xdf/0x1c0 [ 508.474404] tc_new_tfilter+0x84a/0xc90 [ 508.480270] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.486419] netlink_rcv_skb+0x184/0x220 [ 508.492394] netlink_unicast+0x31b/0x460 [ 508.507411] netlink_sendmsg+0x3fb/0x840 [ 508.513390] sock_sendmsg+0x7b/0xd0 [ 508.518907] ___sys_sendmsg+0x4c6/0x610 [ 508.524797] __sys_sendmsg+0xd7/0x150 [ 508.530510] do_syscall_64+0x7a/0x3f0 [ 508.536201] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.543301] IN-SOFTIRQ-W at: [ 508.546834] _raw_spin_lock+0x2c/0x40 [ 508.552522] est_fetch_counters+0x41/0xb0 [ 508.558571] est_timer+0x83/0x3c0 [ 508.563912] call_timer_fn+0x16a/0x5d0 [ 508.569699] run_timer_softirq+0x399/0x920 [ 508.575840] __do_softirq+0x157/0x97d [ 508.581538] irq_exit+0x152/0x1c0 [ 508.586882] smp_apic_timer_interrupt+0xc0/0x4e0 [ 508.593533] apic_timer_interrupt+0xf/0x20 [ 508.599686] cpuidle_enter_state+0xf8/0x5d0 [ 508.605895] do_idle+0x28a/0x350 [ 508.611147] cpu_startup_entry+0xc7/0xe0 [ 508.617097] start_secondary+0x2e8/0x3f0 [ 508.623029] secondary_startup_64+0xa5/0xb0 [ 508.629245] INITIAL USE at: [ 508.632686] _raw_spin_lock_bh+0x34/0x40 [ 508.638557] gnet_stats_start_copy_compat+0xa2/0x230 [ 508.645491] gnet_stats_start_copy+0x16/0x20 [ 508.651719] tcf_action_copy_stats+0x95/0x1d0 [ 508.657992] tcf_action_dump_1+0xb0/0x4e0 [ 508.663937] tcf_action_dump+0xca/0x200 [ 508.669716] tcf_exts_dump+0xd9/0x320 [ 508.675337] fl_dump+0x1b7/0x4a0 [cls_flower] [ 508.681650] tcf_fill_node+0x380/0x530 [ 508.687366] tfilter_notify+0xdf/0x1c0 [ 508.693031] tc_new_tfilter+0x84a/0xc90 [ 508.698820] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.704869] 
netlink_rcv_skb+0x184/0x220 [ 508.710758] netlink_unicast+0x31b/0x460 [ 508.716627] netlink_sendmsg+0x3fb/0x840 [ 508.722510] sock_sendmsg+0x7b/0xd0 [ 508.727931] ___sys_sendmsg+0x4c6/0x610 [ 508.733729] __sys_sendmsg+0xd7/0x150 [ 508.739346] do_syscall_64+0x7a/0x3f0 [ 508.744943] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.751930] } [ 508.753964] ... key at: [] __key.61145+0x0/0x40 [ 508.760946] ... acquired at: [ 508.764294] _raw_read_lock+0x2f/0x40 [ 508.768513] find_ife_oplist+0x1e/0xc0 [act_ife] [ 508.773692] tcf_ife_init+0x82f/0xf40 [act_ife] [ 508.778785] tcf_action_init_1+0x510/0x750 [ 508.783468] tcf_action_init+0x1e8/0x340 [ 508.787938] tcf_action_add+0xc5/0x240 [ 508.792241] tc_ctl_action+0x203/0x2a0 [ 508.796550] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.801200] netlink_rcv_skb+0x184/0x220 [ 508.805674] netlink_unicast+0x31b/0x460 [ 508.810129] netlink_sendmsg+0x3fb/0x840 [ 508.814611] sock_sendmsg+0x7b/0xd0 [ 508.818665] ___sys_sendmsg+0x4c6/0x610 [ 508.823029] __sys_sendmsg+0xd7/0x150 [ 508.827246] do_syscall_64+0x7a/0x3f0 [ 508.831483] entry_SYSCALL_64_after_hwframe+0x49/0xbe the dependencies between the lock to be acquired [ 508.838945] and SOFTIRQ-irq-unsafe lock: [ 508.851177] -> (ife_mod_lock){++++} ops: 95 { [ 508.855920] HARDIRQ-ON-W at: [ 508.859478] _raw_write_lock+0x2c/0x40 [ 508.865264] register_ife_op+0x118/0x2c0 [act_ife] [ 508.872071] do_one_initcall+0xf7/0x4d9 [ 508.877947] do_init_module+0x18b/0x44e [ 508.883819] load_module+0x4167/0x5730 [ 508.889595] __do_sys_finit_module+0x16d/0x1a0 [ 508.896043] do_syscall_64+0x7a/0x3f0 [ 508.901734] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.908827] HARDIRQ-ON-R at: [ 508.912359] _raw_read_lock+0x2f/0x40 [ 508.918043] find_ife_oplist+0x1e/0xc0 [act_ife] [ 508.924692] tcf_ife_init+0x82f/0xf40 [act_ife] [ 508.931252] tcf_action_init_1+0x510/0x750 [ 508.937393] tcf_action_init+0x1e8/0x340 [ 508.943366] tcf_action_add+0xc5/0x240 [ 508.949130] tc_ctl_action+0x203/0x2a0 [ 508.954922]
rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.961024] netlink_rcv_skb+0x184/0x220 [ 508.966970] netlink_unicast+0x31b/0x460 [ 508.972915] netlink_sendmsg+0x3fb/0x840 [ 508.978859] sock_sendmsg+0x7b/0xd0 [ 508.984400] ___sys_sendmsg+0x4c6/0x610 [ 508.990264] __sys_sendmsg+0xd7/0x150 [ 508.995952] do_syscall_64+0x7a/0x3f0 [ 509.001643] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.008722] SOFTIRQ-ON-W at: [ 509.012242] _raw_write_lock+0x2c/0x40 [ 509.018013] register_ife_op+0x118/0x2c0 [act_ife] [ 509.024841] do_one_initcall+0xf7/0x4d9 [ 509.030720] do_init_module+0x18b/0x44e [ 509.036604] load_module+0x4167/0x5730 [ 509.042397] __do_sys_finit_module+0x16d/0x1a0 [ 509.048865] do_syscall_64+0x7a/0x3f0 [ 509.054551] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.061636] SOFTIRQ-ON-R at: [ 509.065145] _raw_read_lock+0x2f/0x40 [ 509.070854] find_ife_oplist+0x1e/0xc0 [act_ife] [ 509.077515] tcf_ife_init+0x82f/0xf40 [act_ife] [ 509.084051] tcf_action_init_1+0x510/0x750 [ 509.090172] tcf_action_init+0x1e8/0x340 [ 509.096124] tcf_action_add+0xc5/0x240 [ 509.101891] tc_ctl_action+0x203/0x2a0 [ 509.107671] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 509.113811] netlink_rcv_skb+0x184/0x220 [ 509.119768] netlink_unicast+0x31b/0x460 [ 509.125716] netlink_sendmsg+0x3fb/0x840 [ 509.131668] sock_sendmsg+0x7b/0xd0 [ 509.137167] ___sys_sendmsg+0x4c6/0x610 [ 509.143010] __sys_sendmsg+0xd7/0x150 [ 509.148718] do_syscall_64+0x7a/0x3f0 [ 509.154443] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.161533] INITIAL USE at: [ 509.164956] _raw_read_lock+0x2f/0x40 [ 509.170574] find_ife_oplist+0x1e/0xc0 [act_ife] [ 509.177134] tcf_ife_init+0x82f/0xf40 [act_ife] [ 509.183619] tcf_action_init_1+0x510/0x750 [ 509.189674] tcf_action_init+0x1e8/0x340 [ 509.195534] tcf_action_add+0xc5/0x240 [ 509.201229] tc_ctl_action+0x203/0x2a0 [ 509.206920] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 509.212936] netlink_rcv_skb+0x184/0x220 [ 509.218818] netlink_unicast+0x31b/0x460 [ 509.224699] netlink_sendmsg+0x3fb/0x840 [ 509.230581]
sock_sendmsg+0x7b/0xd0 [ 509.235984] ___sys_sendmsg+0x4c6/0x610 [ 509.241791] __sys_sendmsg+0xd7/0x150 [ 509.247425] do_syscall_64+0x7a/0x3f0 [ 509.253007] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.259975] } [ 509.261998] ... key at: [] ife_mod_lock+0x18/0xffffffffffff8dc0 [act_ife] [ 509.271569] ... acquired at: [ 509.274912] _raw_read_lock+0x2f/0x40 [ 509.279134] find_ife_oplist+0x1e/0xc0 [act_ife] [ 509.284324] tcf_ife_init+0x82f/0xf40 [act_ife] [ 509.289425] tcf_action_init_1+0x510/0x750 [ 509.294068] tcf_action_init+0x1e8/0x340 [ 509.298553] tcf_action_add+0xc5/0x240 [ 509.302854] tc_ctl_action+0x203/0x2a0 [ 509.307153] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 509.311805] netlink_rcv_skb+0x184/0x220 [ 509.316282] netlink_unicast+0x31b/0x460 [ 509.320769] netlink_sendmsg+0x3fb/0x840 [ 509.325248] sock_sendmsg+0x7b/0xd0 [ 509.329290] ___sys_sendmsg+0x4c6/0x610 [ 509.333687] __sys_sendmsg+0xd7/0x150 [ 509.337902] do_syscall_64+0x7a/0x3f0 [ 509.342116] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.349601] stack backtrace: [ 509.354663] CPU: 6 PID: 5460 Comm: tc Not tainted 4.18.0-rc8+ #646 [ 509.361216] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 Fixes: ef6980b6becb ("introduce IFE action") Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_ife.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5d200495e467..fdb928ca81bb 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -167,16 +167,16 @@ static struct tcf_meta_ops *find_ife_oplist(u16 metaid) { struct tcf_meta_ops *o; - read_lock(&ife_mod_lock); + read_lock_bh(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { if (o->metaid == metaid) { if (!try_module_get(o->owner)) o = NULL; - read_unlock(&ife_mod_lock); + read_unlock_bh(&ife_mod_lock); return o; } } - read_unlock(&ife_mod_lock); + read_unlock_bh(&ife_mod_lock); return NULL; } @@ -190,12 +190,12 @@ int register_ife_op(struct tcf_meta_ops *mops) !mops->get || !mops->alloc) return -EINVAL; - write_lock(&ife_mod_lock); + write_lock_bh(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid || (strcmp(mops->name, m->name) == 0)) { - write_unlock(&ife_mod_lock); + write_unlock_bh(&ife_mod_lock); return -EEXIST; } } @@ -204,7 +204,7 @@ int register_ife_op(struct tcf_meta_ops *mops) mops->release = ife_release_meta_gen; list_add_tail(&mops->list, &ifeoplist); - write_unlock(&ife_mod_lock); + write_unlock_bh(&ife_mod_lock); return 0; } EXPORT_SYMBOL_GPL(unregister_ife_op); @@ -214,7 +214,7 @@ int unregister_ife_op(struct tcf_meta_ops *mops) struct tcf_meta_ops *m; int err = -ENOENT; - write_lock(&ife_mod_lock); + write_lock_bh(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid) { list_del(&mops->list); @@ -222,7 +222,7 @@ int unregister_ife_op(struct tcf_meta_ops *mops) break; } } - write_unlock(&ife_mod_lock); + write_unlock_bh(&ife_mod_lock); return err; } @@ -343,13 +343,13 @@ static int use_all_metadata(struct tcf_ife_info *ife) int rc = 0; int installed = 0; - read_lock(&ife_mod_lock); + read_lock_bh(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { rc = add_metainfo(ife, 
o->metaid, NULL, 0, true); if (rc == 0) installed += 1; } - read_unlock(&ife_mod_lock); + read_unlock_bh(&ife_mod_lock); if (installed) return 0; -- cgit v1.2.3 From 2df8bee5654bb2b7312662ca6810d4dc16b0b67f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 13 Aug 2018 18:44:03 +0800 Subject: net_sched: fix NULL pointer dereference when delete tcindex filter Li Shuang reported the following crash: [ 71.267724] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 [ 71.276456] PGD 800000085d9bd067 P4D 800000085d9bd067 PUD 859a0b067 PMD 0 [ 71.284127] Oops: 0000 [#1] SMP PTI [ 71.288015] CPU: 12 PID: 2386 Comm: tc Not tainted 4.18.0-rc8.latest+ #131 [ 71.295686] Hardware name: Dell Inc. PowerEdge R730/0WCJNT, BIOS 2.1.5 04/11/2016 [ 71.304037] RIP: 0010:tcindex_delete+0x72/0x280 [cls_tcindex] [ 71.310446] Code: 00 31 f6 48 87 75 20 48 85 f6 74 11 48 8b 47 18 48 8b 40 08 48 8b 40 50 e8 fb a6 f8 fc 48 85 db 0f 84 dc 00 00 00 48 8b 73 18 <8b> 56 04 48 8d 7e 04 85 d2 0f 84 7b 01 00 [ 71.331517] RSP: 0018:ffffb45207b3f898 EFLAGS: 00010282 [ 71.337345] RAX: ffff8ad3d72d6360 RBX: ffff8acc84393680 RCX: 000000000000002e [ 71.345306] RDX: ffff8ad3d72c8570 RSI: 0000000000000000 RDI: ffff8ad847a45800 [ 71.353277] RBP: ffff8acc84393688 R08: ffff8ad3d72c8400 R09: 0000000000000000 [ 71.361238] R10: ffff8ad3de786e00 R11: 0000000000000000 R12: ffffb45207b3f8c7 [ 71.369199] R13: ffff8ad3d93bd2a0 R14: 000000000000002e R15: ffff8ad3d72c9600 [ 71.377161] FS: 00007f9d3ec3e740(0000) GS:ffff8ad3df980000(0000) knlGS:0000000000000000 [ 71.386188] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 71.392597] CR2: 0000000000000004 CR3: 0000000852f06003 CR4: 00000000001606e0 [ 71.400558] Call Trace: [ 71.403299] tcindex_destroy_element+0x25/0x40 [cls_tcindex] [ 71.409611] tcindex_walk+0xbb/0x110 [cls_tcindex] [ 71.414953] tcindex_destroy+0x44/0x90 [cls_tcindex] [ 71.420492] ? 
tcindex_delete+0x280/0x280 [cls_tcindex] [ 71.426323] tcf_proto_destroy+0x16/0x40 [ 71.430696] tcf_chain_flush+0x51/0x70 [ 71.434876] tcf_block_put_ext.part.30+0x8f/0x1b0 [ 71.440122] tcf_block_put+0x4d/0x70 [ 71.444108] cbq_destroy+0x4d/0xd0 [sch_cbq] [ 71.448869] qdisc_destroy+0x62/0x130 [ 71.452951] dsmark_destroy+0x2a/0x70 [sch_dsmark] [ 71.458300] qdisc_destroy+0x62/0x130 [ 71.462373] qdisc_graft+0x3ba/0x470 [ 71.466359] tc_get_qdisc+0x2a6/0x2c0 [ 71.470443] ? cred_has_capability+0x7d/0x130 [ 71.475307] rtnetlink_rcv_msg+0x263/0x2d0 [ 71.479875] ? rtnl_calcit.isra.30+0x110/0x110 [ 71.484832] netlink_rcv_skb+0x4d/0x130 [ 71.489109] netlink_unicast+0x1a3/0x250 [ 71.493482] netlink_sendmsg+0x2ae/0x3a0 [ 71.497859] sock_sendmsg+0x36/0x40 [ 71.501748] ___sys_sendmsg+0x26f/0x2d0 [ 71.506029] ? handle_pte_fault+0x586/0xdf0 [ 71.510694] ? __handle_mm_fault+0x389/0x500 [ 71.515457] ? __sys_sendmsg+0x5e/0xa0 [ 71.519636] __sys_sendmsg+0x5e/0xa0 [ 71.523626] do_syscall_64+0x5b/0x180 [ 71.527711] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 71.533345] RIP: 0033:0x7f9d3e257f10 [ 71.537331] Code: c3 48 8b 05 82 6f 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d 8d d0 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 [ 71.558401] RSP: 002b:00007fff6f893398 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 71.566848] RAX: ffffffffffffffda RBX: 000000005b71274d RCX: 00007f9d3e257f10 [ 71.574810] RDX: 0000000000000000 RSI: 00007fff6f8933e0 RDI: 0000000000000003 [ 71.582770] RBP: 00007fff6f8933e0 R08: 000000000000ffff R09: 0000000000000003 [ 71.590729] R10: 00007fff6f892e20 R11: 0000000000000246 R12: 0000000000000000 [ 71.598689] R13: 0000000000662ee0 R14: 0000000000000000 R15: 0000000000000000 [ 71.606651] Modules linked in: sch_cbq cls_tcindex sch_dsmark xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_coni [ 71.685425] libahci i2c_algo_bit i2c_core i40e libata 
dca mdio megaraid_sas dm_mirror dm_region_hash dm_log dm_mod [ 71.697075] CR2: 0000000000000004 [ 71.700792] ---[ end trace f604eb1acacd978b ]--- Reproducer: tc qdisc add dev lo handle 1:0 root dsmark indices 64 set_tc_index tc filter add dev lo parent 1:0 protocol ip prio 1 tcindex mask 0xfc shift 2 tc qdisc add dev lo parent 1:0 handle 2:0 cbq bandwidth 10Mbit cell 8 avpkt 1000 mpu 64 tc class add dev lo parent 2:0 classid 2:1 cbq bandwidth 10Mbit rate 1500Kbit avpkt 1000 prio 1 bounded isolated allot 1514 weight 1 maxburst 10 tc filter add dev lo parent 2:0 protocol ip prio 1 handle 0x2e tcindex classid 2:1 pass_on tc qdisc add dev lo parent 2:1 pfifo limit 5 tc qdisc del dev lo root This is because in tcindex_set_parms, when there is no old_r, we set new exts to cr.exts. And we didn't set it to filter when r == &new_filter_result. Then in tcindex_delete() -> tcf_exts_get_net(), we will get NULL pointer dereference as we didn't init exts. Fix it by moving tcf_exts_change() after "if (old_r && old_r != r)" check. Then we don't need "cr" as there is no errout after that. Fixes: bf63ac73b3e13 ("net_sched: fix an oops in tcindex filter") Reported-by: Li Shuang Signed-off-by: Hangbin Liu Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_tcindex.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 32f4bbd82f35..ddaa4e63ce94 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -447,11 +447,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, tcf_bind_filter(tp, &cr.res, base); } - if (old_r) - tcf_exts_change(&r->exts, &e); - else - tcf_exts_change(&cr.exts, &e); - if (old_r && old_r != r) { err = tcindex_filter_result_init(old_r); if (err < 0) { @@ -462,6 +457,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, oldp = p; r->res = cr.res; + tcf_exts_change(&r->exts, &e); + rcu_assign_pointer(tp->root, cp); if (r == &new_filter_result) { -- cgit v1.2.3 From 008369dcc5f7bfba526c98054f8525322acf0ea3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 13 Aug 2018 18:44:04 +0800 Subject: net_sched: Fix missing res info when create new tc_index filter Li Shuang reported the following warn: [ 733.484610] WARNING: CPU: 6 PID: 21123 at net/sched/sch_cbq.c:1418 cbq_destroy_class+0x5d/0x70 [sch_cbq] [ 733.495190] Modules linked in: sch_cbq cls_tcindex sch_dsmark rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat l [ 733.574155] syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm igb ixgbe ahci libahci i2c_algo_bit libata i40e i2c_core dca mdio megaraid_sas dm_mirror dm_region_hash dm_log dm_mod [ 733.592500] CPU: 6 PID: 21123 Comm: tc Not tainted 4.18.0-rc8.latest+ #131 [ 733.600169] Hardware name: Dell Inc. 
PowerEdge R730/0WCJNT, BIOS 2.1.5 04/11/2016 [ 733.608518] RIP: 0010:cbq_destroy_class+0x5d/0x70 [sch_cbq] [ 733.614734] Code: e7 d9 d2 48 8b 7b 48 e8 61 05 da d2 48 8d bb f8 00 00 00 e8 75 ae d5 d2 48 39 eb 74 0a 48 89 df 5b 5d e9 16 6c 94 d2 5b 5d c3 <0f> 0b eb b6 0f 1f 44 00 00 66 2e 0f 1f 84 [ 733.635798] RSP: 0018:ffffbfbb066bb9d8 EFLAGS: 00010202 [ 733.641627] RAX: 0000000000000001 RBX: ffff9cdd17392800 RCX: 000000008010000f [ 733.649588] RDX: ffff9cdd1df547e0 RSI: ffff9cdd17392800 RDI: ffff9cdd0f84c800 [ 733.657547] RBP: ffff9cdd0f84c800 R08: 0000000000000001 R09: 0000000000000000 [ 733.665508] R10: ffff9cdd0f84d000 R11: 0000000000000001 R12: 0000000000000001 [ 733.673469] R13: 0000000000000000 R14: 0000000000000001 R15: ffff9cdd17392200 [ 733.681430] FS: 00007f911890a740(0000) GS:ffff9cdd1f8c0000(0000) knlGS:0000000000000000 [ 733.690456] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 733.696864] CR2: 0000000000b5544c CR3: 0000000859374002 CR4: 00000000001606e0 [ 733.704826] Call Trace: [ 733.707554] cbq_destroy+0xa1/0xd0 [sch_cbq] [ 733.712318] qdisc_destroy+0x62/0x130 [ 733.716401] dsmark_destroy+0x2a/0x70 [sch_dsmark] [ 733.721745] qdisc_destroy+0x62/0x130 [ 733.725829] qdisc_graft+0x3ba/0x470 [ 733.729817] tc_get_qdisc+0x2a6/0x2c0 [ 733.733901] ? cred_has_capability+0x7d/0x130 [ 733.738761] rtnetlink_rcv_msg+0x263/0x2d0 [ 733.743330] ? rtnl_calcit.isra.30+0x110/0x110 [ 733.748287] netlink_rcv_skb+0x4d/0x130 [ 733.752576] netlink_unicast+0x1a3/0x250 [ 733.756949] netlink_sendmsg+0x2ae/0x3a0 [ 733.761324] sock_sendmsg+0x36/0x40 [ 733.765213] ___sys_sendmsg+0x26f/0x2d0 [ 733.769493] ? handle_pte_fault+0x586/0xdf0 [ 733.774158] ? __handle_mm_fault+0x389/0x500 [ 733.778919] ? 
__sys_sendmsg+0x5e/0xa0 [ 733.783099] __sys_sendmsg+0x5e/0xa0 [ 733.787087] do_syscall_64+0x5b/0x180 [ 733.791171] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 733.796805] RIP: 0033:0x7f9117f23f10 [ 733.800791] Code: c3 48 8b 05 82 6f 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d 8d d0 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 [ 733.821873] RSP: 002b:00007ffe96818398 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 733.830319] RAX: ffffffffffffffda RBX: 000000005b71244c RCX: 00007f9117f23f10 [ 733.838280] RDX: 0000000000000000 RSI: 00007ffe968183e0 RDI: 0000000000000003 [ 733.846241] RBP: 00007ffe968183e0 R08: 000000000000ffff R09: 0000000000000003 [ 733.854202] R10: 00007ffe96817e20 R11: 0000000000000246 R12: 0000000000000000 [ 733.862161] R13: 0000000000662ee0 R14: 0000000000000000 R15: 0000000000000000 [ 733.870121] ---[ end trace 28edd4aad712ddca ]--- This is because we didn't update f->result.res when creating a new filter. Then in tcindex_delete() -> tcf_unbind_filter(), we fail to find the res and unbind the filter, which triggers the WARN_ON() in cbq_destroy_class(). Fix it by updating f->result.res when creating a new filter. Fixes: 6e0565697a106 ("net_sched: fix another crash in cls_tcindex") Reported-by: Li Shuang Signed-off-by: Hangbin Liu Acked-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/cls_tcindex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ddaa4e63ce94..9ccc93f257db 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -465,6 +465,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; + f->result.res = r->res; tcf_exts_change(&f->result.exts, &r->exts); fp = cp->h + (handle % cp->hash); -- cgit v1.2.3 From c2ebc25674e5123d134e81758828084f1cc58cc3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Aug 2018 23:43:05 +0200 Subject: l2tp: fix unused function warning Removing one of the callers of pppol2tp_session_get_sock caused a harmless warning in some configurations: net/l2tp/l2tp_ppp.c:142:21: 'pppol2tp_session_get_sock' defined but not used [-Wunused-function] Rather than adding another #ifdef here, using a proper IS_ENABLED() check makes the code more readable and avoids those warnings while letting the compiler figure out for itself which code is needed. This adds one pointer for the unused show() callback in struct l2tp_session, but that seems harmless. Fixes: b0e29063dcb3 ("l2tp: remove pppol2tp_session_ioctl()") Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.h | 2 -- net/l2tp/l2tp_eth.c | 7 ++----- net/l2tp/l2tp_ppp.c | 7 ++----- 3 files changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 8480a0af973e..9c9afe94d389 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -118,9 +118,7 @@ struct l2tp_session { int (*build_header)(struct l2tp_session *session, void *buf); void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); void (*session_close)(struct l2tp_session *session); -#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) void (*show)(struct seq_file *m, void *priv); -#endif uint8_t priv[0]; /* private data */ }; diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 3728986ec885..8aadc4f3bb9e 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -199,7 +199,6 @@ static void l2tp_eth_delete(struct l2tp_session *session) } } -#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) static void l2tp_eth_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; @@ -219,7 +218,6 @@ static void l2tp_eth_show(struct seq_file *m, void *arg) dev_put(dev); } -#endif static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, struct l2tp_session *session, @@ -305,9 +303,8 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, session->recv_skb = l2tp_eth_dev_recv; session->session_close = l2tp_eth_delete; -#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) - session->show = l2tp_eth_show; -#endif + if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) + session->show = l2tp_eth_show; spriv = l2tp_session_priv(session); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 62f2d3f1e431..04d9946dcdba 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -533,7 +533,6 @@ out: return error; } -#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; @@ -547,16 +546,14 @@ static void pppol2tp_show(struct seq_file *m, void 
*arg) sock_put(sk); } } -#endif static void pppol2tp_session_init(struct l2tp_session *session) { struct pppol2tp_session *ps; session->recv_skb = pppol2tp_recv; -#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) - session->show = pppol2tp_show; -#endif + if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) + session->show = pppol2tp_show; ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); -- cgit v1.2.3 From 66b51b0a0341fd42ce657739bdae0561b0410a85 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Mon, 13 Aug 2018 22:23:13 +0000 Subject: net: sock_diag: Fix spectre v1 gadget in __sock_diag_cmd() req->sdiag_family is a user-controlled value that's used as an array index. Sanitize it after the bounds check to avoid speculative out-of-bounds array access. This also protects the sock_is_registered() call, so this removes the sanitize call there. Fixes: e978de7a6d38 ("net: socket: Fix potential spectre v1 gadget in sock_is_registered") Cc: Josh Poimboeuf Cc: konrad.wilk@oracle.com Cc: jamie.iles@oracle.com Cc: liran.alon@oracle.com Cc: stable@vger.kernel.org Signed-off-by: Jeremy Cline Signed-off-by: David S. 
Miller --- net/core/sock_diag.c | 2 ++ net/socket.c | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index c37b5be7c5e4..3312a5849a97 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -218,6 +219,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) if (req->sdiag_family >= AF_MAX) return -EINVAL; + req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); if (sock_diag_handlers[req->sdiag_family] == NULL) sock_load_diag_module(req->sdiag_family, 0); diff --git a/net/socket.c b/net/socket.c index b91949168a87..270f28264cb1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2697,8 +2697,7 @@ EXPORT_SYMBOL(sock_unregister); bool sock_is_registered(int family) { - return family < NPROTO && - rcu_access_pointer(net_families[array_index_nospec(family, NPROTO)]); + return family < NPROTO && rcu_access_pointer(net_families[family]); } static int __init sock_init(void) -- cgit v1.2.3 From f6f7a26abd14cfa0f3f3009a2d274d36798668bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 11:07:42 +0200 Subject: rds: fix building with IPV6=m When CONFIG_RDS_TCP is built-in and CONFIG_IPV6 is a loadable module, we get a link error against the modular ipv6_chk_addr() function: net/rds/tcp.o: In function `rds_tcp_laddr_check': tcp.c:(.text+0x3b2): undefined reference to `ipv6_chk_addr' This adds back a dependency that forces RDS_TCP to also be a loadable module when IPV6 is one. Fixes: e65d4d96334e ("rds: Remove IPv6 dependency") Signed-off-by: Arnd Bergmann Acked-by: Santosh Shilimkar Signed-off-by: David S.
Miller --- net/rds/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 41f75563b54b..01b3bd6a3708 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -16,6 +16,7 @@ config RDS_RDMA config RDS_TCP tristate "RDS over TCP" depends on RDS + depends on IPV6 || !IPV6 ---help--- Allow RDS to use TCP as a transport. This transport does not support RDMA operations. -- cgit v1.2.3 From 4597b62f7a60d912a2bd00ca574e6bc7b87be905 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 14 Aug 2018 11:20:21 -0500 Subject: net: filter: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 1472592 ("Missing break in switch") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 15b9d2df92ca..fd423ce3da34 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7235,6 +7235,7 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, eth_protocol): if (size < FIELD_SIZEOF(struct sk_buff, protocol)) return false; + /* fall through */ case offsetof(struct sk_reuseport_md, ip_protocol): case offsetof(struct sk_reuseport_md, bind_inany): case offsetof(struct sk_reuseport_md, len): -- cgit v1.2.3