Diffstat (limited to 'net')
-rw-r--r--  net/6lowpan/core.c  1
-rw-r--r--  net/ax25/ax25_route.c  5
-rw-r--r--  net/batman-adv/multicast.c  2
-rw-r--r--  net/bluetooth/hci_conn.c  1
-rw-r--r--  net/bluetooth/hci_core.c  5
-rw-r--r--  net/bluetooth/hci_event.c  89
-rw-r--r--  net/bluetooth/hci_sync.c  7
-rw-r--r--  net/bluetooth/mgmt.c  117
-rw-r--r--  net/bluetooth/msft.c  170
-rw-r--r--  net/bpf/bpf_dummy_struct_ops.c  6
-rw-r--r--  net/bpf/test_run.c  277
-rw-r--r--  net/bridge/br_private.h  6
-rw-r--r--  net/bridge/br_switchdev.c  95
-rw-r--r--  net/bridge/br_vlan.c  117
-rw-r--r--  net/caif/caif_dev.c  2
-rw-r--r--  net/can/gw.c  9
-rw-r--r--  net/core/dev.c  274
-rw-r--r--  net/core/drop_monitor.c  41
-rw-r--r--  net/core/filter.c  273
-rw-r--r--  net/core/gro.c  16
-rw-r--r--  net/core/link_watch.c  6
-rw-r--r--  net/core/net_namespace.c  20
-rw-r--r--  net/core/page_pool.c  23
-rw-r--r--  net/core/rtnetlink.c  6
-rw-r--r--  net/core/sock.c  18
-rw-r--r--  net/core/sock_map.c  77
-rw-r--r--  net/core/sysctl_net_core.c  14
-rw-r--r--  net/core/xdp.c  78
-rw-r--r--  net/dccp/dccp.h  5
-rw-r--r--  net/dccp/ipv4.c  6
-rw-r--r--  net/dccp/ipv6.c  6
-rw-r--r--  net/dccp/minisocks.c  1
-rw-r--r--  net/dsa/dsa2.c  82
-rw-r--r--  net/dsa/dsa_priv.h  20
-rw-r--r--  net/dsa/master.c  4
-rw-r--r--  net/dsa/port.c  42
-rw-r--r--  net/dsa/slave.c  198
-rw-r--r--  net/dsa/switch.c  242
-rw-r--r--  net/dsa/tag_8021q.c  4
-rw-r--r--  net/dsa/tag_ocelot_8021q.c  7
-rw-r--r--  net/dsa/tag_qca.c  85
-rw-r--r--  net/ethtool/rings.c  15
-rw-r--r--  net/hsr/hsr_debugfs.c  40
-rw-r--r--  net/hsr/hsr_device.c  10
-rw-r--r--  net/hsr/hsr_forward.c  7
-rw-r--r--  net/hsr/hsr_framereg.c  200
-rw-r--r--  net/hsr/hsr_framereg.h  8
-rw-r--r--  net/hsr/hsr_main.h  14
-rw-r--r--  net/hsr/hsr_netlink.c  4
-rw-r--r--  net/ieee802154/6lowpan/core.c  1
-rw-r--r--  net/ieee802154/nl-phy.c  4
-rw-r--r--  net/ipv4/bpf_tcp_ca.c  28
-rw-r--r--  net/ipv4/fib_frontend.c  30
-rw-r--r--  net/ipv4/fib_lookup.h  3
-rw-r--r--  net/ipv4/fib_rules.c  19
-rw-r--r--  net/ipv4/fib_semantics.c  58
-rw-r--r--  net/ipv4/fib_trie.c  58
-rw-r--r--  net/ipv4/icmp.c  91
-rw-r--r--  net/ipv4/inet_connection_sock.c  8
-rw-r--r--  net/ipv4/inet_hashtables.c  53
-rw-r--r--  net/ipv4/inet_timewait_sock.c  75
-rw-r--r--  net/ipv4/ip_input.c  31
-rw-r--r--  net/ipv4/ip_options.c  31
-rw-r--r--  net/ipv4/ip_output.c  6
-rw-r--r--  net/ipv4/ipmr.c  20
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c  24
-rw-r--r--  net/ipv4/nexthop.c  12
-rw-r--r--  net/ipv4/proc.c  4
-rw-r--r--  net/ipv4/route.c  39
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  20
-rw-r--r--  net/ipv4/tcp.c  3
-rw-r--r--  net/ipv4/tcp_bbr.c  18
-rw-r--r--  net/ipv4/tcp_cubic.c  17
-rw-r--r--  net/ipv4/tcp_dctcp.c  18
-rw-r--r--  net/ipv4/tcp_input.c  11
-rw-r--r--  net/ipv4/tcp_ipv4.c  75
-rw-r--r--  net/ipv4/tcp_minisocks.c  7
-rw-r--r--  net/ipv4/tcp_output.c  6
-rw-r--r--  net/ipv4/udp.c  22
-rw-r--r--  net/ipv6/addrconf.c  207
-rw-r--r--  net/ipv6/exthdrs.c  8
-rw-r--r--  net/ipv6/fib6_rules.c  30
-rw-r--r--  net/ipv6/icmp.c  62
-rw-r--r--  net/ipv6/inet6_hashtables.c  5
-rw-r--r--  net/ipv6/ioam6_iptunnel.c  59
-rw-r--r--  net/ipv6/ip6_input.c  2
-rw-r--r--  net/ipv6/ip6_offload.c  5
-rw-r--r--  net/ipv6/ip6_output.c  100
-rw-r--r--  net/ipv6/ip6_tunnel.c  8
-rw-r--r--  net/ipv6/ip6mr.c  35
-rw-r--r--  net/ipv6/ndisc.c  6
-rw-r--r--  net/ipv6/ping.c  29
-rw-r--r--  net/ipv6/route.c  48
-rw-r--r--  net/ipv6/tcp_ipv6.c  9
-rw-r--r--  net/ipv6/udp.c  114
-rw-r--r--  net/ipv6/xfrm6_policy.c  1
-rw-r--r--  net/mac80211/airtime.c  11
-rw-r--r--  net/mac80211/debugfs.c  2
-rw-r--r--  net/mac80211/debugfs_key.c  2
-rw-r--r--  net/mac80211/debugfs_netdev.c  4
-rw-r--r--  net/mac80211/ieee80211_i.h  2
-rw-r--r--  net/mac80211/mesh.c  2
-rw-r--r--  net/mac80211/mlme.c  183
-rw-r--r--  net/mac80211/rc80211_minstrel_ht.c  2
-rw-r--r--  net/mac80211/rx.c  2
-rw-r--r--  net/mac80211/sta_info.c  3
-rw-r--r--  net/mac80211/status.c  14
-rw-r--r--  net/mac80211/util.c  28
-rw-r--r--  net/mac80211/vht.c  4
-rw-r--r--  net/mctp/af_mctp.c  189
-rw-r--r--  net/mctp/device.c  1
-rw-r--r--  net/mctp/route.c  124
-rw-r--r--  net/mctp/test/route-test.c  157
-rw-r--r--  net/mptcp/options.c  77
-rw-r--r--  net/mptcp/pm.c  11
-rw-r--r--  net/mptcp/pm_netlink.c  92
-rw-r--r--  net/mptcp/protocol.h  29
-rw-r--r--  net/mptcp/sockopt.c  2
-rw-r--r--  net/mptcp/subflow.c  37
-rw-r--r--  net/netfilter/Makefile  5
-rw-r--r--  net/netfilter/core.c  3
-rw-r--r--  net/netfilter/nf_conntrack_acct.c  19
-rw-r--r--  net/netfilter/nf_conntrack_bpf.c  257
-rw-r--r--  net/netfilter/nf_conntrack_core.c  102
-rw-r--r--  net/netfilter/nf_conntrack_ecache.c  47
-rw-r--r--  net/netfilter/nf_conntrack_extend.c  132
-rw-r--r--  net/netfilter/nf_conntrack_helper.c  17
-rw-r--r--  net/netfilter/nf_conntrack_labels.c  20
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c  36
-rw-r--r--  net/netfilter/nf_conntrack_pptp.c  60
-rw-r--r--  net/netfilter/nf_conntrack_proto_udp.c  4
-rw-r--r--  net/netfilter/nf_conntrack_seqadj.c  16
-rw-r--r--  net/netfilter/nf_conntrack_timeout.c  50
-rw-r--r--  net/netfilter/nf_conntrack_timestamp.c  20
-rw-r--r--  net/netfilter/nf_nat_core.c  28
-rw-r--r--  net/netfilter/nf_synproxy_core.c  24
-rw-r--r--  net/netfilter/nf_tables_core.c  16
-rw-r--r--  net/netfilter/nfnetlink_cttimeout.c  11
-rw-r--r--  net/netfilter/nfnetlink_queue.c  13
-rw-r--r--  net/netfilter/nft_cmp.c  102
-rw-r--r--  net/netfilter/nft_compat.c  9
-rw-r--r--  net/netfilter/nft_exthdr.c  96
-rw-r--r--  net/openvswitch/datapath.c  18
-rw-r--r--  net/openvswitch/datapath.h  2
-rw-r--r--  net/openvswitch/flow.c  3
-rw-r--r--  net/sched/act_ct.c  13
-rw-r--r--  net/sched/act_police.c  16
-rw-r--r--  net/sched/cls_api.c  45
-rw-r--r--  net/smc/af_smc.c  200
-rw-r--r--  net/smc/smc.h  13
-rw-r--r--  net/smc/smc_netlink.c  15
-rw-r--r--  net/smc/smc_pnet.c  3
-rw-r--r--  net/smc/smc_tx.c  60
-rw-r--r--  net/smc/smc_tx.h  3
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c  10
-rw-r--r--  net/sunrpc/svc_xprt.c  4
-rw-r--r--  net/sunrpc/xprt.c  4
-rw-r--r--  net/switchdev/switchdev.c  154
-rw-r--r--  net/tipc/msg.h  23
-rw-r--r--  net/tls/tls_sw.c  3
-rw-r--r--  net/unix/af_unix.c  250
-rw-r--r--  net/wireless/pmsr.c  4
-rw-r--r--  net/wireless/scan.c  9
-rw-r--r--  net/wireless/util.c  8
-rw-r--r--  net/xdp/xsk.c  13
-rw-r--r--  net/xdp/xsk_buff_pool.c  7
-rw-r--r--  net/xdp/xsk_queue.h  19
167 files changed, 5058 insertions, 2181 deletions
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index a068757eabaf..7b3341cef926 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -5,6 +5,7 @@
* (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
*/
+#include <linux/if_arp.h>
#include <linux/module.h>
#include <net/6lowpan.h>
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 9751207f7757..b7c4d656a94b 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -116,7 +116,6 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
return -ENOMEM;
}
- refcount_set(&ax25_rt->refcount, 1);
ax25_rt->callsign = route->dest_addr;
ax25_rt->dev = ax25_dev->dev;
ax25_rt->digipeat = NULL;
@@ -167,12 +166,12 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
ax25cmp(&route->dest_addr, &s->callsign) == 0) {
if (ax25_route_list == s) {
ax25_route_list = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
} else {
for (t = ax25_route_list; t != NULL; t = t->next) {
if (t->next == s) {
t->next = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
break;
}
}
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index f4004cf0ff6f..9f311fddfaf9 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -134,7 +134,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev)
{
struct inet6_dev *in6_dev = __in6_dev_get(dev);
- if (in6_dev && in6_dev->cnf.mc_forwarding)
+ if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding))
return BATADV_NO_FLAGS;
else
return BATADV_MCAST_WANT_NO_RTR6;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 04ebe901e86f..d10651108033 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -689,6 +689,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
bacpy(&conn->dst, dst);
bacpy(&conn->src, &hdev->bdaddr);
+ conn->handle = HCI_CONN_HANDLE_UNSET;
conn->hdev = hdev;
conn->type = type;
conn->role = role;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2b7bd3655b07..5bde0ec41177 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2503,6 +2503,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
INIT_LIST_HEAD(&hdev->conn_hash.list);
INIT_LIST_HEAD(&hdev->adv_instances);
INIT_LIST_HEAD(&hdev->blocked_keys);
+ INIT_LIST_HEAD(&hdev->monitored_devices);
INIT_LIST_HEAD(&hdev->local_codecs);
INIT_WORK(&hdev->rx_work, hci_rx_work);
@@ -3666,8 +3667,8 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
sco_recv_scodata(conn, skb);
return;
} else {
- bt_dev_err(hdev, "SCO packet for unknown connection handle %d",
- handle);
+ bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d",
+ handle);
}
kfree_skb(skb);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index fc30f4c03d29..63b925921c87 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3068,6 +3068,11 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
struct hci_ev_conn_complete *ev = data;
struct hci_conn *conn;
+ if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for invalid handle");
+ return;
+ }
+
bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -3106,6 +3111,17 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
}
}
+ /* The HCI_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (conn->handle != HCI_CONN_HANDLE_UNSET) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+ goto unlock;
+ }
+
if (!ev->status) {
conn->handle = __le16_to_cpu(ev->handle);
@@ -4534,7 +4550,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
if (!info) {
bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
HCI_EV_INQUIRY_RESULT_WITH_RSSI);
- return;
+ goto unlock;
}
bacpy(&data.bdaddr, &info->bdaddr);
@@ -4565,7 +4581,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
if (!info) {
bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
HCI_EV_INQUIRY_RESULT_WITH_RSSI);
- return;
+ goto unlock;
}
bacpy(&data.bdaddr, &info->bdaddr);
@@ -4587,7 +4603,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
HCI_EV_INQUIRY_RESULT_WITH_RSSI);
}
-
+unlock:
hci_dev_unlock(hdev);
}
@@ -4661,6 +4677,24 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
struct hci_ev_sync_conn_complete *ev = data;
struct hci_conn *conn;
+ switch (ev->link_type) {
+ case SCO_LINK:
+ case ESCO_LINK:
+ break;
+ default:
+ /* As per Core 5.3 Vol 4 Part E 7.7.35 (p.2219), Link_Type
+ * for HCI_Synchronous_Connection_Complete is limited to
+ * either SCO or eSCO
+ */
+ bt_dev_err(hdev, "Ignoring connect complete event for invalid link type");
+ return;
+ }
+
+ if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) {
+ bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete for invalid handle");
+ return;
+ }
+
bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -4684,23 +4718,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
goto unlock;
}
+ /* The HCI_Synchronous_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (conn->handle != HCI_CONN_HANDLE_UNSET) {
+ bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection");
+ goto unlock;
+ }
+
switch (ev->status) {
case 0x00:
- /* The synchronous connection complete event should only be
- * sent once per new connection. Receiving a successful
- * complete event when the connection status is already
- * BT_CONNECTED means that the device is misbehaving and sent
- * multiple complete event packets for the same new connection.
- *
- * Registering the device more than once can corrupt kernel
- * memory, hence upon detecting this invalid event, we report
- * an error and ignore the packet.
- */
- if (conn->state == BT_CONNECTED) {
- bt_dev_err(hdev, "Ignoring connect complete event for existing connection");
- goto unlock;
- }
-
conn->handle = __le16_to_cpu(ev->handle);
conn->state = BT_CONNECTED;
conn->type = ev->link_type;
@@ -5496,6 +5526,11 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
struct smp_irk *irk;
u8 addr_type;
+ if (handle > HCI_CONN_HANDLE_MAX) {
+ bt_dev_err(hdev, "Ignoring HCI_LE_Connection_Complete for invalid handle");
+ return;
+ }
+
hci_dev_lock(hdev);
/* All controllers implicitly stop advertising in the event of a
@@ -5537,6 +5572,17 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
cancel_delayed_work(&conn->le_conn_timeout);
}
+ /* The HCI_LE_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (conn->handle != HCI_CONN_HANDLE_UNSET) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+ goto unlock;
+ }
+
le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa);
/* Lookup the identity address from the stored connection
@@ -6798,7 +6844,7 @@ static const struct hci_ev {
HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt,
sizeof(struct hci_ev_num_comp_blocks)),
/* [0xff = HCI_EV_VENDOR] */
- HCI_EV(HCI_EV_VENDOR, msft_vendor_evt, 0),
+ HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE),
};
static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb,
@@ -6823,8 +6869,9 @@ static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb,
* decide if that is acceptable.
*/
if (skb->len > ev->max_len)
- bt_dev_warn(hdev, "unexpected event 0x%2.2x length: %u > %u",
- event, skb->len, ev->max_len);
+ bt_dev_warn_ratelimited(hdev,
+ "unexpected event 0x%2.2x length: %u > %u",
+ event, skb->len, ev->max_len);
data = hci_ev_skb_pull(hdev, skb, event, ev->min_len);
if (!data)
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 0feb68f12545..6e71aa6b6fea 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -382,6 +382,9 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
{
struct hci_cmd_sync_work_entry *entry;
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return -ENODEV;
+
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return -ENOMEM;
@@ -5140,8 +5143,8 @@ static void set_ext_conn_params(struct hci_conn *conn,
p->max_ce_len = cpu_to_le16(0x0000);
}
-int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
- u8 own_addr_type)
+static int hci_le_ext_create_conn_sync(struct hci_dev *hdev,
+ struct hci_conn *conn, u8 own_addr_type)
{
struct hci_cp_le_ext_create_conn *cp;
struct hci_cp_le_ext_conn_param *p;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 37087cf7dc5a..5dd684e0b259 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -42,7 +42,7 @@
#include "aosp.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 21
+#define MGMT_REVISION 22
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -174,6 +174,8 @@ static const u16 mgmt_events[] = {
MGMT_EV_ADV_MONITOR_REMOVED,
MGMT_EV_CONTROLLER_SUSPEND,
MGMT_EV_CONTROLLER_RESUME,
+ MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+ MGMT_EV_ADV_MONITOR_DEVICE_LOST,
};
static const u16 mgmt_untrusted_commands[] = {
@@ -9589,12 +9591,116 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
return true;
}
+void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
+ bdaddr_t *bdaddr, u8 addr_type)
+{
+ struct mgmt_ev_adv_monitor_device_lost ev;
+
+ ev.monitor_handle = cpu_to_le16(handle);
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_DEVICE_LOST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+static void mgmt_adv_monitor_device_found(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, bool report_device,
+ struct sk_buff *skb,
+ struct sock *skip_sk)
+{
+ struct sk_buff *advmon_skb;
+ size_t advmon_skb_len;
+ __le16 *monitor_handle;
+ struct monitored_device *dev, *tmp;
+ bool matched = false;
+ bool notify = false;
+
+ /* We have received the Advertisement Report because:
+ * 1. the kernel has initiated active discovery
+ * 2. if not, we have pend_le_reports > 0 in which case we are doing
+ * passive scanning
+ * 3. if none of the above is true, we have one or more active
+ * Advertisement Monitor
+ *
+ * For case 1 and 2, report all advertisements via MGMT_EV_DEVICE_FOUND
+ * and report ONLY one advertisement per device for the matched Monitor
+ * via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+ *
+ * For case 3, since we are not active scanning and all advertisements
+ * received are due to a matched Advertisement Monitor, report all
+ * advertisements ONLY via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+ */
+ if (report_device && !hdev->advmon_pend_notify) {
+ mgmt_event_skb(skb, skip_sk);
+ return;
+ }
+
+ advmon_skb_len = (sizeof(struct mgmt_ev_adv_monitor_device_found) -
+ sizeof(struct mgmt_ev_device_found)) + skb->len;
+ advmon_skb = mgmt_alloc_skb(hdev, MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+ advmon_skb_len);
+ if (!advmon_skb) {
+ if (report_device)
+ mgmt_event_skb(skb, skip_sk);
+ else
+ kfree_skb(skb);
+ return;
+ }
+
+ /* ADV_MONITOR_DEVICE_FOUND is similar to DEVICE_FOUND event except
+ * that it also has 'monitor_handle'. Make a copy of DEVICE_FOUND and
+ * store monitor_handle of the matched monitor.
+ */
+ monitor_handle = skb_put(advmon_skb, sizeof(*monitor_handle));
+ skb_put_data(advmon_skb, skb->data, skb->len);
+
+ hdev->advmon_pend_notify = false;
+
+ list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+ if (!bacmp(&dev->bdaddr, bdaddr)) {
+ matched = true;
+
+ if (!dev->notified) {
+ *monitor_handle = cpu_to_le16(dev->handle);
+ notify = true;
+ dev->notified = true;
+ }
+ }
+
+ if (!dev->notified)
+ hdev->advmon_pend_notify = true;
+ }
+
+ if (!report_device &&
+ ((matched && !notify) || !msft_monitor_supported(hdev))) {
+ /* Handle 0 indicates that we are not active scanning and this
+ * is a subsequent advertisement report for an already matched
+ * Advertisement Monitor or the controller offloading support
+ * is not available.
+ */
+ *monitor_handle = 0;
+ notify = true;
+ }
+
+ if (report_device)
+ mgmt_event_skb(skb, skip_sk);
+ else
+ kfree_skb(skb);
+
+ if (notify)
+ mgmt_event_skb(advmon_skb, skip_sk);
+ else
+ kfree_skb(advmon_skb);
+}
+
void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
{
struct sk_buff *skb;
struct mgmt_ev_device_found *ev;
+ bool report_device = hci_discovery_active(hdev);
/* Don't send events for a non-kernel initiated discovery. With
* LE one exception is if we have pend_le_reports > 0 in which
@@ -9603,11 +9709,10 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
if (!hci_discovery_active(hdev)) {
if (link_type == ACL_LINK)
return;
- if (link_type == LE_LINK &&
- list_empty(&hdev->pend_le_reports) &&
- !hci_is_adv_monitoring(hdev)) {
+ if (link_type == LE_LINK && !list_empty(&hdev->pend_le_reports))
+ report_device = true;
+ else if (!hci_is_adv_monitoring(hdev))
return;
- }
}
if (hdev->discovery.result_filtering) {
@@ -9672,7 +9777,7 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
- mgmt_event_skb(skb, NULL);
+ mgmt_adv_monitor_device_found(hdev, bdaddr, report_device, skb, NULL);
}
void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
index 6a943634b31a..9a3d77d3ca86 100644
--- a/net/bluetooth/msft.c
+++ b/net/bluetooth/msft.c
@@ -80,6 +80,14 @@ struct msft_rp_le_set_advertisement_filter_enable {
__u8 sub_opcode;
} __packed;
+#define MSFT_EV_LE_MONITOR_DEVICE 0x02
+struct msft_ev_le_monitor_device {
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+ __u8 monitor_handle;
+ __u8 monitor_state;
+} __packed;
+
struct msft_monitor_advertisement_handle_data {
__u8 msft_handle;
__u16 mgmt_handle;
@@ -204,6 +212,37 @@ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data
return NULL;
}
+/* This function requires the caller holds hdev->lock */
+static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle,
+ bdaddr_t *bdaddr, __u8 addr_type,
+ bool notify)
+{
+ struct monitored_device *dev, *tmp;
+ int count = 0;
+
+ list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+ /* mgmt_handle == 0 indicates remove all devices, whereas,
+ * bdaddr == NULL indicates remove all devices matching the
+ * mgmt_handle.
+ */
+ if ((!mgmt_handle || dev->handle == mgmt_handle) &&
+ (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) &&
+ addr_type == dev->addr_type))) {
+ if (notify && dev->notified) {
+ mgmt_adv_monitor_device_lost(hdev, dev->handle,
+ &dev->bdaddr,
+ dev->addr_type);
+ }
+
+ list_del(&dev->list);
+ kfree(dev);
+ count++;
+ }
+ }
+
+ return count;
+}
+
static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev,
u8 status, u16 opcode,
struct sk_buff *skb)
@@ -294,6 +333,10 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
if (monitor && !msft->suspending)
hci_free_adv_monitor(hdev, monitor);
+ /* Clear any monitored devices by this Adv Monitor */
+ msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL,
+ 0, false);
+
list_del(&handle_data->list);
kfree(handle_data);
}
@@ -557,6 +600,14 @@ void msft_do_close(struct hci_dev *hdev)
list_del(&handle_data->list);
kfree(handle_data);
}
+
+ hci_dev_lock(hdev);
+
+ /* Clear any devices that are being monitored and notify device lost */
+ hdev->advmon_pend_notify = false;
+ msft_monitor_device_del(hdev, 0, NULL, 0, true);
+
+ hci_dev_unlock(hdev);
}
void msft_register(struct hci_dev *hdev)
@@ -590,10 +641,101 @@ void msft_unregister(struct hci_dev *hdev)
kfree(msft);
}
+/* This function requires the caller holds hdev->lock */
+static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ __u8 addr_type, __u16 mgmt_handle)
+{
+ struct monitored_device *dev;
+
+ dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ bt_dev_err(hdev, "MSFT vendor event %u: no memory",
+ MSFT_EV_LE_MONITOR_DEVICE);
+ return;
+ }
+
+ bacpy(&dev->bdaddr, bdaddr);
+ dev->addr_type = addr_type;
+ dev->handle = mgmt_handle;
+ dev->notified = false;
+
+ INIT_LIST_HEAD(&dev->list);
+ list_add(&dev->list, &hdev->monitored_devices);
+ hdev->advmon_pend_notify = true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ __u8 addr_type, __u16 mgmt_handle)
+{
+ if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type,
+ true)) {
+ bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list",
+ MSFT_EV_LE_MONITOR_DEVICE, bdaddr);
+ }
+}
+
+static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u8 ev, size_t len)
+{
+ void *data;
+
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev);
+
+ return data;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct msft_ev_le_monitor_device *ev;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ u8 addr_type;
+
+ ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev));
+ if (!ev)
+ return;
+
+ bt_dev_dbg(hdev,
+ "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR",
+ MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle,
+ ev->monitor_state, &ev->bdaddr);
+
+ handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false);
+ if (!handle_data)
+ return;
+
+ switch (ev->addr_type) {
+ case ADDR_LE_DEV_PUBLIC:
+ addr_type = BDADDR_LE_PUBLIC;
+ break;
+
+ case ADDR_LE_DEV_RANDOM:
+ addr_type = BDADDR_LE_RANDOM;
+ break;
+
+ default:
+ bt_dev_err(hdev,
+ "MSFT vendor event 0x%02x: unknown addr type 0x%02x",
+ MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type);
+ return;
+ }
+
+ if (ev->monitor_state)
+ msft_device_found(hdev, &ev->bdaddr, addr_type,
+ handle_data->mgmt_handle);
+ else
+ msft_device_lost(hdev, &ev->bdaddr, addr_type,
+ handle_data->mgmt_handle);
+}
+
void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
{
struct msft_data *msft = hdev->msft_data;
- u8 event;
+ u8 *evt_prefix;
+ u8 *evt;
if (!msft)
return;
@@ -602,13 +744,12 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
* matches, and otherwise just return.
*/
if (msft->evt_prefix_len > 0) {
- if (skb->len < msft->evt_prefix_len)
+ evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len);
+ if (!evt_prefix)
return;
- if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len))
+ if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len))
return;
-
- skb_pull(skb, msft->evt_prefix_len);
}
/* Every event starts at least with an event code and the rest of
@@ -617,10 +758,23 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
if (skb->len < 1)
return;
- event = *skb->data;
- skb_pull(skb, 1);
+ evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt));
+ if (!evt)
+ return;
+
+ hci_dev_lock(hdev);
+
+ switch (*evt) {
+ case MSFT_EV_LE_MONITOR_DEVICE:
+ msft_monitor_device_evt(hdev, skb);
+ break;
- bt_dev_dbg(hdev, "MSFT vendor event %u", event);
+ default:
+ bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt);
+ break;
+ }
+
+ hci_dev_unlock(hdev);
}
__u64 msft_get_features(struct hci_dev *hdev)
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index fbc896323bec..d0e54e30658a 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -145,7 +145,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
- u32 *next_btf_id)
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
{
const struct btf_type *state;
s32 type_id;
@@ -162,7 +163,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
return -EACCES;
}
- err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+ err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
if (err < 0)
return err;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 46dd95755967..f08034500813 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -5,6 +5,7 @@
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/slab.h>
+#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/etherdevice.h>
#include <linux/filter.h>
@@ -130,7 +131,8 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
static int bpf_test_finish(const union bpf_attr *kattr,
union bpf_attr __user *uattr, const void *data,
- u32 size, u32 retval, u32 duration)
+ struct skb_shared_info *sinfo, u32 size,
+ u32 retval, u32 duration)
{
void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
int err = -EFAULT;
@@ -145,8 +147,37 @@ static int bpf_test_finish(const union bpf_attr *kattr,
err = -ENOSPC;
}
- if (data_out && copy_to_user(data_out, data, copy_size))
- goto out;
+ if (data_out) {
+ int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size;
+
+ if (copy_to_user(data_out, data, len))
+ goto out;
+
+ if (sinfo) {
+ int i, offset = len;
+ u32 data_len;
+
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ if (offset >= copy_size) {
+ err = -ENOSPC;
+ break;
+ }
+
+ data_len = min_t(u32, copy_size - offset,
+ skb_frag_size(frag));
+
+ if (copy_to_user(data_out + offset,
+ skb_frag_address(frag),
+ data_len))
+ goto out;
+
+ offset += data_len;
+ }
+ }
+ }
+
if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
goto out;
if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
@@ -171,6 +202,8 @@ int noinline bpf_fentry_test1(int a)
{
return a + 1;
}
+EXPORT_SYMBOL_GPL(bpf_fentry_test1);
+ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO);
int noinline bpf_fentry_test2(int a, u64 b)
{
@@ -232,28 +265,142 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
return sk;
}
+struct prog_test_ref_kfunc {
+ int a;
+ int b;
+ struct prog_test_ref_kfunc *next;
+};
+
+static struct prog_test_ref_kfunc prog_test_struct = {
+ .a = 42,
+ .b = 108,
+ .next = &prog_test_struct,
+};
+
+noinline struct prog_test_ref_kfunc *
+bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
+{
+ /* randomly return NULL */
+ if (get_jiffies_64() % 2)
+ return NULL;
+ return &prog_test_struct;
+}
+
+noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+{
+}
+
+struct prog_test_pass1 {
+ int x0;
+ struct {
+ int x1;
+ struct {
+ int x2;
+ struct {
+ int x3;
+ };
+ };
+ };
+};
+
+struct prog_test_pass2 {
+ int len;
+ short arr1[4];
+ struct {
+ char arr2[4];
+ unsigned long arr3[8];
+ } x;
+};
+
+struct prog_test_fail1 {
+ void *p;
+ int x;
+};
+
+struct prog_test_fail2 {
+ int x8;
+ struct prog_test_pass1 x;
+};
+
+struct prog_test_fail3 {
+ int len;
+ char arr1[2];
+ char arr2[];
+};
+
+noinline void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
+{
+}
+
+noinline void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
+{
+}
+
+noinline void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
+{
+}
+
+noinline void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
+{
+}
+
+noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
+{
+}
+
__diag_pop();
ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
-BTF_SET_START(test_sk_kfunc_ids)
+BTF_SET_START(test_sk_check_kfunc_ids)
BTF_ID(func, bpf_kfunc_call_test1)
BTF_ID(func, bpf_kfunc_call_test2)
BTF_ID(func, bpf_kfunc_call_test3)
-BTF_SET_END(test_sk_kfunc_ids)
-
-bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner)
-{
- if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id))
- return true;
- return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner);
-}
-
-static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
- u32 headroom, u32 tailroom)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_ID(func, bpf_kfunc_call_test_release)
+BTF_ID(func, bpf_kfunc_call_test_pass_ctx)
+BTF_ID(func, bpf_kfunc_call_test_pass1)
+BTF_ID(func, bpf_kfunc_call_test_pass2)
+BTF_ID(func, bpf_kfunc_call_test_fail1)
+BTF_ID(func, bpf_kfunc_call_test_fail2)
+BTF_ID(func, bpf_kfunc_call_test_fail3)
+BTF_ID(func, bpf_kfunc_call_test_mem_len_pass1)
+BTF_ID(func, bpf_kfunc_call_test_mem_len_fail1)
+BTF_ID(func, bpf_kfunc_call_test_mem_len_fail2)
+BTF_SET_END(test_sk_check_kfunc_ids)
+
+BTF_SET_START(test_sk_acquire_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_SET_END(test_sk_acquire_kfunc_ids)
+
+BTF_SET_START(test_sk_release_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test_release)
+BTF_SET_END(test_sk_release_kfunc_ids)
+
+BTF_SET_START(test_sk_ret_null_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test_acquire)
+BTF_SET_END(test_sk_ret_null_kfunc_ids)
+
+static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
+ u32 size, u32 headroom, u32 tailroom)
{
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
- u32 user_size = kattr->test.data_size_in;
void *data;
if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
@@ -581,7 +728,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
if (kattr->test.flags || kattr->test.cpu)
return -EINVAL;
- data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN,
+ data = bpf_test_init(kattr, kattr->test.data_size_in,
+ size, NET_SKB_PAD + NET_IP_ALIGN,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
if (IS_ERR(data))
return PTR_ERR(data);
@@ -683,7 +831,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
/* bpf program can never convert linear skb to non-linear */
if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
size = skb_headlen(skb);
- ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
+ ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval,
+ duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, ctx,
sizeof(struct __sk_buff));
@@ -758,16 +907,16 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- u32 headroom = XDP_PACKET_HEADROOM;
u32 size = kattr->test.data_size_in;
+ u32 headroom = XDP_PACKET_HEADROOM;
+ u32 retval, duration, max_data_sz;
u32 repeat = kattr->test.repeat;
struct netdev_rx_queue *rxqueue;
+ struct skb_shared_info *sinfo;
struct xdp_buff xdp = {};
- u32 retval, duration;
+ int i, ret = -EINVAL;
struct xdp_md *ctx;
- u32 max_data_sz;
void *data;
- int ret = -EINVAL;
if (prog->expected_attach_type == BPF_XDP_DEVMAP ||
prog->expected_attach_type == BPF_XDP_CPUMAP)
@@ -787,26 +936,65 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
headroom -= ctx->data;
}
- /* XDP have extra tailroom as (most) drivers use full page */
max_data_sz = 4096 - headroom - tailroom;
+ size = min_t(u32, size, max_data_sz);
- data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
+ data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
goto free_ctx;
}
rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
- xdp_init_buff(&xdp, headroom + max_data_sz + tailroom,
- &rxqueue->xdp_rxq);
+ rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom;
+ xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq);
xdp_prepare_buff(&xdp, data, headroom, size, true);
+ sinfo = xdp_get_shared_info_from_buff(&xdp);
ret = xdp_convert_md_to_buff(ctx, &xdp);
if (ret)
goto free_data;
+ if (unlikely(kattr->test.data_size_in > size)) {
+ void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+
+ while (size < kattr->test.data_size_in) {
+ struct page *page;
+ skb_frag_t *frag;
+ u32 data_len;
+
+ if (sinfo->nr_frags == MAX_SKB_FRAGS) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frag = &sinfo->frags[sinfo->nr_frags++];
+ __skb_frag_set_page(frag, page);
+
+ data_len = min_t(u32, kattr->test.data_size_in - size,
+ PAGE_SIZE);
+ skb_frag_size_set(frag, data_len);
+
+ if (copy_from_user(page_address(page), data_in + size,
+ data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ sinfo->xdp_frags_size += data_len;
+ size += data_len;
+ }
+ xdp_buff_set_frags_flag(&xdp);
+ }
+
if (repeat > 1)
bpf_prog_change_xdp(NULL, prog);
+
ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
/* We convert the xdp_buff back to an xdp_md before checking the return
* code so the reference count of any held netdevice will be decremented
@@ -816,12 +1004,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
if (ret)
goto out;
- if (xdp.data_meta != data + headroom ||
- xdp.data_end != xdp.data_meta + size)
- size = xdp.data_end - xdp.data_meta;
-
- ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval,
- duration);
+ size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size;
+ ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size,
+ retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, ctx,
sizeof(struct xdp_md));
@@ -830,6 +1015,8 @@ out:
if (repeat > 1)
bpf_prog_change_xdp(prog, NULL);
free_data:
+ for (i = 0; i < sinfo->nr_frags; i++)
+ __free_page(skb_frag_page(&sinfo->frags[i]));
kfree(data);
free_ctx:
kfree(ctx);
@@ -876,7 +1063,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (size < ETH_HLEN)
return -EINVAL;
- data = bpf_test_init(kattr, size, 0, 0);
+ data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0);
if (IS_ERR(data))
return PTR_ERR(data);
@@ -911,8 +1098,8 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (ret < 0)
goto out;
- ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
- retval, duration);
+ ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL,
+ sizeof(flow_keys), retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, user_ctx,
sizeof(struct bpf_flow_keys));
@@ -960,7 +1147,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
goto out;
- if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) {
+ if (user_ctx->local_port > U16_MAX) {
ret = -ERANGE;
goto out;
}
@@ -968,7 +1155,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
ctx.family = (u16)user_ctx->family;
ctx.protocol = (u16)user_ctx->protocol;
ctx.dport = (u16)user_ctx->local_port;
- ctx.sport = (__force __be16)user_ctx->remote_port;
+ ctx.sport = user_ctx->remote_port;
switch (ctx.family) {
case AF_INET:
@@ -1016,7 +1203,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
}
- ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration);
+ ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
@@ -1067,3 +1254,17 @@ out:
kfree(ctx);
return err;
}
+
+static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &test_sk_check_kfunc_ids,
+ .acquire_set = &test_sk_acquire_kfunc_ids,
+ .release_set = &test_sk_release_kfunc_ids,
+ .ret_null_set = &test_sk_ret_null_kfunc_ids,
+};
+
+static int __init bpf_prog_test_run_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
+}
+late_initcall(bpf_prog_test_run_init);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2661dda1a92b..48bc61ebc211 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1985,7 +1985,7 @@ void br_switchdev_mdb_notify(struct net_device *dev,
struct net_bridge_port_group *pg,
int type);
int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
- struct netlink_ext_ack *extack);
+ bool changed, struct netlink_ext_ack *extack);
int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
void br_switchdev_init(struct net_bridge *br);
@@ -2052,8 +2052,8 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
return 0;
}
-static inline int br_switchdev_port_vlan_add(struct net_device *dev,
- u16 vid, u16 flags,
+static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid,
+ u16 flags, bool changed,
struct netlink_ext_ack *extack)
{
return -EOPNOTSUPP;
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index f8fbaaa7c501..59fcabd08ef1 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -160,13 +160,14 @@ br_switchdev_fdb_notify(struct net_bridge *br,
}
int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
- struct netlink_ext_ack *extack)
+ bool changed, struct netlink_ext_ack *extack)
{
struct switchdev_obj_port_vlan v = {
.obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
.flags = flags,
.vid = vid,
+ .changed = changed,
};
return switchdev_port_obj_add(dev, &v.obj, extack);
@@ -351,51 +352,19 @@ br_switchdev_vlan_replay_one(struct notifier_block *nb,
return notifier_to_errno(err);
}
-static int br_switchdev_vlan_replay(struct net_device *br_dev,
- struct net_device *dev,
- const void *ctx, bool adding,
- struct notifier_block *nb,
- struct netlink_ext_ack *extack)
+static int br_switchdev_vlan_replay_group(struct notifier_block *nb,
+ struct net_device *dev,
+ struct net_bridge_vlan_group *vg,
+ const void *ctx, unsigned long action,
+ struct netlink_ext_ack *extack)
{
- struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
- struct net_bridge_port *p;
- struct net_bridge *br;
- unsigned long action;
int err = 0;
u16 pvid;
- ASSERT_RTNL();
-
- if (!nb)
- return 0;
-
- if (!netif_is_bridge_master(br_dev))
- return -EINVAL;
-
- if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
- return -EINVAL;
-
- if (netif_is_bridge_master(dev)) {
- br = netdev_priv(dev);
- vg = br_vlan_group(br);
- p = NULL;
- } else {
- p = br_port_get_rtnl(dev);
- if (WARN_ON(!p))
- return -EINVAL;
- vg = nbp_vlan_group(p);
- br = p->br;
- }
-
if (!vg)
return 0;
- if (adding)
- action = SWITCHDEV_PORT_OBJ_ADD;
- else
- action = SWITCHDEV_PORT_OBJ_DEL;
-
pvid = br_get_pvid(vg);
list_for_each_entry(v, &vg->vlan_list, vlist) {
@@ -415,7 +384,48 @@ static int br_switchdev_vlan_replay(struct net_device *br_dev,
return err;
}
- return err;
+ return 0;
+}
+
+static int br_switchdev_vlan_replay(struct net_device *br_dev,
+ const void *ctx, bool adding,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(br_dev);
+ struct net_bridge_port *p;
+ unsigned long action;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev))
+ return -EINVAL;
+
+ if (adding)
+ action = SWITCHDEV_PORT_OBJ_ADD;
+ else
+ action = SWITCHDEV_PORT_OBJ_DEL;
+
+ err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br),
+ ctx, action, extack);
+ if (err)
+ return err;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ struct net_device *dev = p->dev;
+
+ err = br_switchdev_vlan_replay_group(nb, dev,
+ nbp_vlan_group(p),
+ ctx, action, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -681,8 +691,7 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx,
struct net_device *dev = p->dev;
int err;
- err = br_switchdev_vlan_replay(br_dev, dev, ctx, true, blocking_nb,
- extack);
+ err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack);
if (err && err != -EOPNOTSUPP)
return err;
@@ -706,11 +715,11 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
struct net_device *br_dev = p->br->dev;
struct net_device *dev = p->dev;
- br_switchdev_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
+ br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb);
br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
- br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb);
+ br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
}
/* Let the bridge know that this port is offloaded, so that it can assign a
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 1402d5ca242d..7557e90b60e1 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -34,53 +34,70 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
}
-static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg,
+static void __vlan_add_pvid(struct net_bridge_vlan_group *vg,
const struct net_bridge_vlan *v)
{
if (vg->pvid == v->vid)
- return false;
+ return;
smp_wmb();
br_vlan_set_pvid_state(vg, v->state);
vg->pvid = v->vid;
-
- return true;
}
-static bool __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
{
if (vg->pvid != vid)
- return false;
+ return;
smp_wmb();
vg->pvid = 0;
-
- return true;
}
-/* return true if anything changed, false otherwise */
-static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
+/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v.
+ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and
+ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v.
+ */
+static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags,
+ bool commit)
{
struct net_bridge_vlan_group *vg;
- u16 old_flags = v->flags;
- bool ret;
+ bool change;
if (br_vlan_is_master(v))
vg = br_vlan_group(v->br);
else
vg = nbp_vlan_group(v->port);
+ /* check if anything would be changed on commit */
+ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) ||
+ ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED);
+
+ if (!commit)
+ goto out;
+
if (flags & BRIDGE_VLAN_INFO_PVID)
- ret = __vlan_add_pvid(vg, v);
+ __vlan_add_pvid(vg, v);
else
- ret = __vlan_delete_pvid(vg, v->vid);
+ __vlan_delete_pvid(vg, v->vid);
if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
else
v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;
- return ret || !!(old_flags ^ v->flags);
+out:
+ return change;
+}
+
+static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags)
+{
+ return __vlan_flags_update(v, flags, false);
+}
+
+static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags)
+{
+ __vlan_flags_update(v, flags, true);
}
static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
@@ -92,7 +109,7 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q add.
*/
- err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack);
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack);
if (err == -EOPNOTSUPP)
return vlan_vid_add(dev, br->vlan_proto, v->vid);
v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV;
@@ -284,9 +301,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
}
br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx);
} else {
- err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack);
- if (err && err != -EOPNOTSUPP)
- goto out;
+ if (br_vlan_should_use(v)) {
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags,
+ false, extack);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
br_multicast_ctx_init(br, v, &v->br_mcast_ctx);
v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
}
@@ -310,7 +330,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
goto out_fdb_insert;
__vlan_add_list(v);
- __vlan_add_flags(v, flags);
+ __vlan_flags_commit(v, flags);
br_multicast_toggle_one_vlan(v, true);
if (p)
@@ -404,6 +424,7 @@ static void __vlan_flush(const struct net_bridge *br,
{
struct net_bridge_vlan *vlan, *tmp;
u16 v_start = 0, v_end = 0;
+ int err;
__vlan_delete_pvid(vg, vg->pvid);
list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) {
@@ -417,7 +438,13 @@ static void __vlan_flush(const struct net_bridge *br,
}
v_end = vlan->vid;
- __vlan_del(vlan);
+ err = __vlan_del(vlan);
+ if (err) {
+ br_err(br,
+ "port %u(%s) failed to delete vlan %d: %pe\n",
+ (unsigned int) p->port_no, p->dev->name,
+ vlan->vid, ERR_PTR(err));
+ }
}
/* notify about the last/whole vlan range */
@@ -670,18 +697,29 @@ static int br_vlan_add_existing(struct net_bridge *br,
u16 flags, bool *changed,
struct netlink_ext_ack *extack)
{
+ bool would_change = __vlan_flags_would_change(vlan, flags);
+ bool becomes_brentry = false;
int err;
- err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, extack);
- if (err && err != -EOPNOTSUPP)
- return err;
-
if (!br_vlan_is_brentry(vlan)) {
/* Trying to change flags of non-existent bridge vlan */
- if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
- err = -EINVAL;
- goto err_flags;
- }
+ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
+ return -EINVAL;
+
+ becomes_brentry = true;
+ }
+
+ /* Master VLANs that aren't brentries weren't notified before,
+ * time to notify them now.
+ */
+ if (becomes_brentry || would_change) {
+ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags,
+ would_change, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ if (becomes_brentry) {
/* It was only kept for port vlans, now make it real */
err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid);
if (err) {
@@ -696,13 +734,13 @@ static int br_vlan_add_existing(struct net_bridge *br,
br_multicast_toggle_one_vlan(vlan, true);
}
- if (__vlan_add_flags(vlan, flags))
+ __vlan_flags_commit(vlan, flags);
+ if (would_change)
*changed = true;
return 0;
err_fdb_insert:
-err_flags:
br_switchdev_port_vlan_del(br->dev, vlan->vid);
return err;
}
@@ -1247,11 +1285,18 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
*changed = false;
vlan = br_vlan_find(nbp_vlan_group(port), vid);
if (vlan) {
- /* Pass the flags to the hardware bridge */
- ret = br_switchdev_port_vlan_add(port->dev, vid, flags, extack);
- if (ret && ret != -EOPNOTSUPP)
- return ret;
- *changed = __vlan_add_flags(vlan, flags);
+ bool would_change = __vlan_flags_would_change(vlan, flags);
+
+ if (would_change) {
+ /* Pass the flags to the hardware bridge */
+ ret = br_switchdev_port_vlan_add(port->dev, vid, flags,
+ true, extack);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ __vlan_flags_commit(vlan, flags);
+ *changed = would_change;
return 0;
}
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 440139706130..52dd0b6835bc 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -268,7 +268,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev,
err = caifd->layer.up->receive(caifd->layer.up, pkt);
- /* For -EILSEQ the packet is not freed so so it now */
+ /* For -EILSEQ the packet is not freed so free it now */
if (err == -EILSEQ)
cfpkt_destroy(pkt);
diff --git a/net/can/gw.c b/net/can/gw.c
index d8861e862f15..24221352e059 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1239,16 +1239,19 @@ static int __net_init cangw_pernet_init(struct net *net)
return 0;
}
-static void __net_exit cangw_pernet_exit(struct net *net)
+static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- cgw_remove_all_jobs(net);
+ list_for_each_entry(net, net_list, exit_list)
+ cgw_remove_all_jobs(net);
rtnl_unlock();
}
static struct pernet_operations cangw_pernet_ops = {
.init = cangw_pernet_init,
- .exit = cangw_pernet_exit,
+ .exit_batch = cangw_pernet_exit_batch,
};
static __init int cgw_module_init(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1baab07820f6..05fa867f1878 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -216,18 +216,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock(struct softnet_data *sd)
+static inline void rps_lock_irqsave(struct softnet_data *sd,
+ unsigned long *flags)
{
-#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
-#endif
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(*flags);
}
-static inline void rps_unlock(struct softnet_data *sd)
+static inline void rps_lock_irq_disable(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
-#endif
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_lock_irq(&sd->input_pkt_queue.lock);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
+}
+
+static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_unlock_irq(&sd->input_pkt_queue.lock);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
}
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
@@ -1037,7 +1057,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, name_node->name, IFNAMSIZ))
- set_bit(i, inuse);
+ __set_bit(i, inuse);
}
if (!sscanf(d->name, name, &i))
continue;
@@ -1047,7 +1067,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, d->name, IFNAMSIZ))
- set_bit(i, inuse);
+ __set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, max_netdevices);
@@ -4456,11 +4476,11 @@ static void rps_trigger_softirq(void *data)
* If yes, queue it to our IPI list and return 1
* If no, return 0
*/
-static int rps_ipi_queued(struct softnet_data *sd)
+static int napi_schedule_rps(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+#ifdef CONFIG_RPS
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
@@ -4469,6 +4489,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
return 1;
}
#endif /* CONFIG_RPS */
+ __napi_schedule_irqoff(&mysd->backlog);
return 0;
}
@@ -4525,9 +4546,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
sd = &per_cpu(softnet_data, cpu);
- local_irq_save(flags);
-
- rps_lock(sd);
+ rps_lock_irqsave(sd, &flags);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
@@ -4536,26 +4555,21 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
- rps_unlock(sd);
- local_irq_restore(flags);
+ rps_unlock_irq_restore(sd, &flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
- if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
- if (!rps_ipi_queued(sd))
- ____napi_schedule(sd, &sd->backlog);
- }
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ napi_schedule_rps(sd);
goto enqueue;
}
drop:
sd->dropped++;
- rps_unlock(sd);
-
- local_irq_restore(flags);
+ rps_unlock_irq_restore(sd, &flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
@@ -4796,7 +4810,6 @@ static int netif_rx_internal(struct sk_buff *skb)
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4806,78 +4819,67 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
}
return ret;
}
/**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from the interrupt context
+ * (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+ int ret;
+
+ lockdep_assert_once(hardirq_count() | softirq_count());
+
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
+/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * driver should use NAPI and GRO.
+ * This function can used from any context.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
-
int netif_rx(struct sk_buff *skb)
{
int ret;
+ local_bh_disable();
trace_netif_rx_entry(skb);
-
ret = netif_rx_internal(skb);
trace_netif_rx_exit(ret);
-
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL(netif_rx);
-int netif_rx_ni(struct sk_buff *skb)
-{
- int err;
-
- trace_netif_rx_ni_entry(skb);
-
- preempt_disable();
- err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
- trace_netif_rx_ni_exit(err);
-
- return err;
-}
-EXPORT_SYMBOL(netif_rx_ni);
-
-int netif_rx_any_context(struct sk_buff *skb)
-{
- /*
- * If invoked from contexts which do not invoke bottom half
- * processing either at return from interrupt or when softrqs are
- * reenabled, use netif_rx_ni() which invokes bottomhalf processing
- * directly.
- */
- if (in_interrupt())
- return netif_rx(skb);
- else
- return netif_rx_ni(skb);
-}
-EXPORT_SYMBOL(netif_rx_any_context);
-
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5650,8 +5652,7 @@ static void flush_backlog(struct work_struct *work)
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
@@ -5659,8 +5660,7 @@ static void flush_backlog(struct work_struct *work)
input_queue_head_incr(sd);
}
}
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
@@ -5678,16 +5678,14 @@ static bool flush_required(int cpu)
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
bool do_flush;
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
/* as insertion into process_queue happens with the rps lock held,
* process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
return do_flush;
#endif
@@ -5802,8 +5800,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -5819,8 +5816,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
}
return work;
@@ -9143,7 +9139,7 @@ DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
- dev_net(dev)->dev_unreg_count++;
+ atomic_inc(&dev_net(dev)->dev_unreg_count);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -9683,8 +9679,10 @@ int register_netdevice(struct net_device *dev)
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
- dev_hold(dev);
+
+ dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
list_netdevice(dev);
+
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
@@ -9813,8 +9811,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait until all references are gone.
+ * @list: list of net_devices to wait on
*
* This is called when unregistering network devices.
*
@@ -9824,37 +9822,45 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
unsigned long rebroadcast_time, warning_time;
- int wait = 0, refcnt;
+ struct net_device *dev;
+ int wait = 0;
- linkwatch_forget_dev(dev);
+ list_for_each_entry(dev, list, todo_list)
+ linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
- refcnt = netdev_refcnt_read(dev);
- while (refcnt != 1) {
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ while (true) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ list_for_each_entry(dev, list, todo_list)
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
__rtnl_unlock();
rcu_barrier();
rtnl_lock();
- if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
- &dev->state)) {
- /* We must not have linkwatch events
- * pending on unregister. If this
- * happens, we simply run the queue
- * unscheduled, resulting in a noop
- * for this device.
- */
- linkwatch_run_queue();
- }
+ list_for_each_entry(dev, list, todo_list)
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ break;
+ }
__rtnl_unlock();
@@ -9869,14 +9875,18 @@ static void netdev_wait_allrefs(struct net_device *dev)
wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
}
- refcnt = netdev_refcnt_read(dev);
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
- if (refcnt != 1 &&
- time_after(jiffies, warning_time +
+ if (time_after(jiffies, warning_time +
netdev_unregister_timeout_secs * HZ)) {
- pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
- dev->name, refcnt);
- ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ list_for_each_entry(dev, list, todo_list) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, netdev_refcnt_read(dev));
+ ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ }
+
warning_time = jiffies;
}
}
@@ -9908,6 +9918,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
*/
void netdev_run_todo(void)
{
+ struct net_device *dev, *tmp;
struct list_head list;
#ifdef CONFIG_LOCKDEP
struct list_head unlink_list;
@@ -9928,26 +9939,23 @@ void netdev_run_todo(void)
__rtnl_unlock();
-
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
- while (!list_empty(&list)) {
- struct net_device *dev
- = list_first_entry(&list, struct net_device, todo_list);
- list_del(&dev->todo_list);
-
+ list_for_each_entry_safe(dev, tmp, &list, todo_list) {
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
- pr_err("network todo '%s' but state %d\n",
- dev->name, dev->reg_state);
- dump_stack();
+ netdev_WARN(dev, "run_todo but not unregistering\n");
+ list_del(&dev->todo_list);
continue;
}
dev->reg_state = NETREG_UNREGISTERED;
+ }
- netdev_wait_allrefs(dev);
+ while (!list_empty(&list)) {
+ dev = netdev_wait_allrefs_any(&list);
+ list_del(&dev->todo_list);
/* paranoia */
BUG_ON(netdev_refcnt_read(dev) != 1);
@@ -9963,11 +9971,8 @@ void netdev_run_todo(void)
if (dev->needs_free_netdev)
free_netdev(dev);
- /* Report a network device has been unregistered */
- rtnl_lock();
- dev_net(dev)->dev_unreg_count--;
- __rtnl_unlock();
- wake_up(&netdev_unregistering_wq);
+ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
/* Free network device */
kobject_put(&dev->dev.kobj);
@@ -10172,7 +10177,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
- dev_hold(dev);
+ __dev_hold(dev);
#else
refcount_set(&dev->dev_refcnt, 1);
#endif
@@ -10449,7 +10454,7 @@ void unregister_netdevice_many(struct list_head *head)
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
- dev_put(dev);
+ dev_put_track(dev, &dev->dev_registered_tracker);
net_set_todo(dev);
}
@@ -10732,8 +10737,7 @@ static int __net_init netdev_init(struct net *net)
BUILD_BUG_ON(GRO_HASH_BUCKETS >
8 * sizeof_field(struct napi_struct, gro_bitmask));
- if (net != &init_net)
- INIT_LIST_HEAD(&net->dev_base_head);
+ INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
@@ -10849,14 +10853,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
.exit = netdev_exit,
};
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
{
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
- rtnl_lock();
+ ASSERT_RTNL();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
@@ -10880,24 +10884,24 @@ static void __net_exit default_device_exit(struct net *net)
BUG();
}
}
- rtnl_unlock();
}
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
- /* Return with the rtnl_lock held when there are no network
+ /* Return (with the rtnl_lock held) when there are no network
* devices unregistering in any network namespace in net_list.
*/
- struct net *net;
- bool unregistering;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ bool unregistering;
+ struct net *net;
+ ASSERT_RTNL();
add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
unregistering = false;
- rtnl_lock();
+
list_for_each_entry(net, net_list, exit_list) {
- if (net->dev_unreg_count > 0) {
+ if (atomic_read(&net->dev_unreg_count) > 0) {
unregistering = true;
break;
}
@@ -10907,6 +10911,7 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
__rtnl_unlock();
wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ rtnl_lock();
}
remove_wait_queue(&netdev_unregistering_wq, &wait);
}
@@ -10922,6 +10927,11 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ default_device_exit_net(net);
+ cond_resched();
+ }
/* To prevent network device cleanup code from dereferencing
* loopback devices or network devices that have been freed
* wait here for all pending unregistrations to complete,
@@ -10934,6 +10944,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
* default_device_exit_batch.
*/
rtnl_lock_unregistering(net_list);
+
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -10947,7 +10958,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
static struct pernet_operations __net_initdata default_device_ops = {
- .exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index d5dc6be2522c..d7d177f75d43 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -48,6 +48,19 @@
static int trace_state = TRACE_OFF;
static bool monitor_hw;
+#undef EM
+#undef EMe
+
+#define EM(a, b) [a] = #b,
+#define EMe(a, b) [a] = #b
+
+/* drop_reasons is used to translate 'enum skb_drop_reason' to a string,
+ * which is reported to user space.
+ */
+static const char * const drop_reasons[] = {
+ TRACE_SKB_DROP_REASON
+};
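TRACE_SKB_DROP_REASON is presumably the EM()/EMe() list shared with the kfree_skb tracepoint in <trace/events/skb.h>, so the array above becomes a reason-indexed string table. A rough illustration of the expected expansion (the specific enumerators are shown only as examples):

	/*
	 *	static const char * const drop_reasons[] = {
	 *		[SKB_DROP_REASON_NOT_SPECIFIED]	= "NOT_SPECIFIED",
	 *		[SKB_DROP_REASON_NO_SOCKET]	= "NO_SOCKET",
	 *		...
	 *	};
	 *
	 * which lets net_dm_packet_report_fill() below emit
	 * drop_reasons[cb->reason] as a plain string via NET_DM_ATTR_REASON.
	 */
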
+
/* net_dm_mutex
*
* An overall lock guarding every operation coming from userspace.
@@ -126,6 +139,7 @@ struct net_dm_skb_cb {
struct devlink_trap_metadata *hw_metadata;
void *pc;
};
+ enum skb_drop_reason reason;
};
#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
@@ -502,6 +516,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
{
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *data;
+ struct net_dm_skb_cb *cb;
struct sk_buff *nskb;
unsigned long flags;
@@ -512,7 +527,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
if (!nskb)
return;
- NET_DM_SKB_CB(nskb)->pc = location;
+ if ((unsigned int)reason >= SKB_DROP_REASON_MAX)
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ cb = NET_DM_SKB_CB(nskb);
+ cb->reason = reason;
+ cb->pc = location;
/* Override the timestamp because we care about the time when the
* packet was dropped.
*/
@@ -557,7 +576,8 @@ static size_t net_dm_in_port_size(void)
#define NET_DM_MAX_SYMBOL_LEN 40
-static size_t net_dm_packet_report_size(size_t payload_len)
+static size_t net_dm_packet_report_size(size_t payload_len,
+ enum skb_drop_reason reason)
{
size_t size;
@@ -578,6 +598,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
nla_total_size(sizeof(u32)) +
/* NET_DM_ATTR_PROTO */
nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_REASON */
+ nla_total_size(strlen(drop_reasons[reason]) + 1) +
/* NET_DM_ATTR_PAYLOAD */
nla_total_size(payload_len);
}
@@ -610,7 +632,7 @@ nla_put_failure:
static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
size_t payload_len)
{
- u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+ struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
char buf[NET_DM_MAX_SYMBOL_LEN];
struct nlattr *attr;
void *hdr;
@@ -624,10 +646,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
goto nla_put_failure;
- if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+ NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NET_DM_ATTR_REASON,
+ drop_reasons[cb->reason]))
goto nla_put_failure;
- snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+ snprintf(buf, sizeof(buf), "%pS", cb->pc);
if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
goto nla_put_failure;
@@ -683,7 +710,9 @@ static void net_dm_packet_report(struct sk_buff *skb)
if (net_dm_trunc_len)
payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
- msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len,
+ NET_DM_SKB_CB(skb)->reason),
+ GFP_KERNEL);
if (!msg)
goto out;
diff --git a/net/core/filter.c b/net/core/filter.c
index 4603b7cd3cd1..818244068c2d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2603,7 +2603,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
+ if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2809,7 +2809,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, &msg->sg.copy);
+ __clear_bit(new, msg->sg.copy);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
@@ -3783,6 +3783,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
+{
+ return xdp_get_buff_len(xdp);
+}
+
+static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
+};
+
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3817,11 +3839,208 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush)
+{
+ unsigned long ptr_len, ptr_off = 0;
+ skb_frag_t *next_frag, *end_frag;
+ struct skb_shared_info *sinfo;
+ void *src, *dst;
+ u8 *ptr_buf;
+
+ if (likely(xdp->data_end - xdp->data >= off + len)) {
+ src = flush ? buf : xdp->data + off;
+ dst = flush ? xdp->data + off : buf;
+ memcpy(dst, src, len);
+ return;
+ }
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ end_frag = &sinfo->frags[sinfo->nr_frags];
+ next_frag = &sinfo->frags[0];
+
+ ptr_len = xdp->data_end - xdp->data;
+ ptr_buf = xdp->data;
+
+ while (true) {
+ if (off < ptr_off + ptr_len) {
+ unsigned long copy_off = off - ptr_off;
+ unsigned long copy_len = min(len, ptr_len - copy_off);
+
+ src = flush ? buf : ptr_buf + copy_off;
+ dst = flush ? ptr_buf + copy_off : buf;
+ memcpy(dst, src, copy_len);
+
+ off += copy_len;
+ len -= copy_len;
+ buf += copy_len;
+ }
+
+ if (!len || next_frag == end_frag)
+ break;
+
+ ptr_off += ptr_len;
+ ptr_buf = skb_frag_address(next_frag);
+ ptr_len = skb_frag_size(next_frag);
+ next_frag++;
+ }
+}
+
+static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ u32 size = xdp->data_end - xdp->data;
+ void *addr = xdp->data;
+ int i;
+
+ if (unlikely(offset > 0xffff || len > 0xffff))
+ return ERR_PTR(-EFAULT);
+
+ if (offset + len > xdp_get_buff_len(xdp))
+ return ERR_PTR(-EINVAL);
+
+ if (offset < size) /* linear area */
+ goto out;
+
+ offset -= size;
+ for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
+ u32 frag_size = skb_frag_size(&sinfo->frags[i]);
+
+ if (offset < frag_size) {
+ addr = skb_frag_address(&sinfo->frags[i]);
+ size = frag_size;
+ break;
+ }
+ offset -= frag_size;
+ }
+out:
+ return offset + len < size ? addr + offset : NULL;
+}
+
+BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, false);
+ else
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
+ .func = bpf_xdp_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, true);
+ else
+ memcpy(ptr, buf, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
+ .func = bpf_xdp_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
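A minimal sketch of the BPF-program side of the two new helpers; the section name, the tail-flipping logic and the header choice are assumptions, only bpf_xdp_get_buff_len(), bpf_xdp_load_bytes() and bpf_xdp_store_bytes() come from this patch:

	/* Hypothetical frag-aware XDP program. */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("xdp.frags")
	int xdp_touch_tail(struct xdp_md *ctx)
	{
		__u8 tail[4];
		__u32 len = bpf_xdp_get_buff_len(ctx);	/* linear + frags */

		if (len < sizeof(tail))
			return XDP_DROP;

		/* Works even when the last bytes live in a fragment. */
		if (bpf_xdp_load_bytes(ctx, len - sizeof(tail), tail, sizeof(tail)))
			return XDP_DROP;

		tail[0] ^= 0xff;

		if (bpf_xdp_store_bytes(ctx, len - sizeof(tail), tail, sizeof(tail)))
			return XDP_DROP;

		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";
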
+
+static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+ struct xdp_rxq_info *rxq = xdp->rxq;
+ unsigned int tailroom;
+
+ if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
+ return -EOPNOTSUPP;
+
+ tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+ if (unlikely(offset > tailroom))
+ return -EINVAL;
+
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
+
+ return 0;
+}
+
+static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, n_frags_free = 0, len_free = 0;
+
+ if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
+ return -EINVAL;
+
+ for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ int shrink = min_t(int, offset, skb_frag_size(frag));
+
+ len_free += shrink;
+ offset -= shrink;
+
+ if (skb_frag_size(frag) == shrink) {
+ struct page *page = skb_frag_page(frag);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem,
+ false, NULL);
+ n_frags_free++;
+ } else {
+ skb_frag_size_sub(frag, shrink);
+ break;
+ }
+ }
+ sinfo->nr_frags -= n_frags_free;
+ sinfo->xdp_frags_size -= len_free;
+
+ if (unlikely(!sinfo->nr_frags)) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp->data_end -= offset;
+ }
+
+ return 0;
+}
+
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
+ if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
+ if (offset < 0)
+ return bpf_xdp_frags_shrink_tail(xdp, -offset);
+
+ return bpf_xdp_frags_increase_tail(xdp, offset);
+ }
+
/* Notice that xdp_data_hard_end have reserved some tailroom */
if (unlikely(data_end > data_hard_end))
return -EINVAL;
@@ -4047,6 +4266,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
enum bpf_map_type map_type = ri->map_type;
+ /* XDP_REDIRECT is not fully supported yet for xdp frags since
+ * not all XDP capable drivers can map non-linear xdp_frame in
+ * ndo_xdp_xmit.
+ */
+ if (unlikely(xdp_buff_has_frags(xdp) &&
+ map_type != BPF_MAP_TYPE_CPUMAP))
+ return -EOPNOTSUPP;
+
if (map_type == BPF_MAP_TYPE_XSKMAP)
return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
@@ -4590,10 +4817,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
};
#endif
-static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
unsigned long off, unsigned long len)
{
- memcpy(dst_buff, src_buff + off, len);
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+ bpf_xdp_copy_buf(xdp, off, dst, len, false);
return 0;
}
@@ -4604,11 +4833,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(!xdp ||
- xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+
+ if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
return -EFAULT;
- return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ return bpf_event_output(map, flags, meta, meta_size, xdp,
xdp_size, bpf_xdp_copy);
}
@@ -4862,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_txrehash = (u8)val;
+ break;
default:
ret = -EINVAL;
}
@@ -5040,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
*((int *)optval) = sk->sk_reuseport;
break;
+ case SO_TXREHASH:
+ *((int *)optval) = sk->sk_txrehash;
+ break;
default:
goto err_clear;
}
@@ -7533,6 +7772,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_map_proto;
case BPF_FUNC_xdp_adjust_tail:
return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_proto;
+ case BPF_FUNC_xdp_load_bytes:
+ return &bpf_xdp_load_bytes_proto;
+ case BPF_FUNC_xdp_store_bytes:
+ return &bpf_xdp_store_bytes_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
case BPF_FUNC_check_mtu:
@@ -8030,6 +8275,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
+ int field_size;
if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
@@ -8041,7 +8287,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case offsetof(struct bpf_sock, family):
case offsetof(struct bpf_sock, type):
case offsetof(struct bpf_sock, protocol):
- case offsetof(struct bpf_sock, dst_port):
case offsetof(struct bpf_sock, src_port):
case offsetof(struct bpf_sock, rx_queue_mapping):
case bpf_ctx_range(struct bpf_sock, src_ip4):
@@ -8050,6 +8295,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sock, dst_port):
+ field_size = size == size_default ?
+ size_default : sizeof_field(struct bpf_sock, dst_port);
+ bpf_ctx_record_field_size(info, field_size);
+ return bpf_ctx_narrow_access_ok(off, size, field_size);
+ case offsetofend(struct bpf_sock, dst_port) ...
+ offsetof(struct bpf_sock, dst_ip4) - 1:
+ return false;
}
return size == size_default;
@@ -10062,7 +10315,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
- .check_kfunc_call = bpf_prog_test_check_kfunc_call,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10601,7 +10853,8 @@ static bool sk_lookup_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
- case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+ case offsetof(struct bpf_sk_lookup, remote_port) ...
+ offsetof(struct bpf_sk_lookup, local_ip4) - 1:
case bpf_ctx_range(struct bpf_sk_lookup, local_port):
case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
bpf_ctx_record_field_size(info, sizeof(__u32));
diff --git a/net/core/gro.c b/net/core/gro.c
index a11b286d1495..ee5e7e889d8b 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -459,29 +459,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
- NAPI_GRO_CB(skb)->same_flow = 0;
+ BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+ sizeof(u32))); /* Avoid slow unaligned acc */
+ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->encap_mark = 0;
- NAPI_GRO_CB(skb)->recursion_counter = 0;
- NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->is_atomic = 1;
- NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
NAPI_GRO_CB(skb)->csum = skb->csum;
NAPI_GRO_CB(skb)->csum_valid = 1;
- NAPI_GRO_CB(skb)->csum_cnt = 0;
break;
case CHECKSUM_UNNECESSARY:
NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
- NAPI_GRO_CB(skb)->csum_valid = 0;
break;
- default:
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- NAPI_GRO_CB(skb)->csum_valid = 0;
}
pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
@@ -634,7 +627,6 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
if (unlikely(skb->slow_gro)) {
skb_orphan(skb);
skb_ext_reset(skb);
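Stepping back to the dev_gro_receive() hunk above: the several per-field clears are replaced by a single aligned 32-bit store, which presumably relies on the affected NAPI_GRO_CB() fields being wrapped in a struct_group() named "zeroed" inside struct napi_gro_cb. An illustrative sketch of that idiom, with made-up member names:

	struct example_cb {
		u32	csum;			/* outside the group, untouched */
		struct_group(zeroed,		/* cleared together per packet */
			u8	same_flow;
			u8	encap_mark;
			u8	csum_cnt;
			u8	is_fou;
		);
	};

	static inline void example_cb_reset(struct example_cb *cb)
	{
		BUILD_BUG_ON(sizeof_field(struct example_cb, zeroed) != sizeof(u32));
		BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct example_cb, zeroed),
					 sizeof(u32)));
		*(u32 *)&cb->zeroed = 0;	/* replaces four assignments */
	}
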
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index b0f5344d1185..95098d1a49bd 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -166,10 +166,10 @@ static void linkwatch_do_dev(struct net_device *dev)
netdev_state_change(dev);
}
- /* Note: our callers are responsible for
- * calling netdev_tracker_free().
+ /* Note: our callers are responsible for calling netdev_tracker_free().
+ * This is the reason we use __dev_put() instead of dev_put().
*/
- dev_put(dev);
+ __dev_put(dev);
}
static void __linkwatch_run_queue(int urgent_only)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a5b5bb99c644..0ec2f5906a27 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -44,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem);
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif
-struct net init_net = {
- .ns.count = REFCOUNT_INIT(1),
- .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
-#ifdef CONFIG_KEYS
- .key_domain = &init_net_key_domain,
-#endif
-};
+struct net init_net;
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
@@ -301,6 +295,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
return peer;
}
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
/*
* setup_net runs the initializers for the network namespace object.
@@ -363,6 +358,8 @@ out_undo:
static int __net_init net_defaults_init_net(struct net *net)
{
net->core.sysctl_somaxconn = SOMAXCONN;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+
return 0;
}
@@ -1084,7 +1081,7 @@ out:
rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
-static int __init net_ns_init(void)
+void __init net_ns_init(void)
{
struct net_generic *ng;
@@ -1105,6 +1102,9 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
+#ifdef CONFIG_KEYS
+ init_net.key_domain = &init_net_key_domain;
+#endif
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
@@ -1119,12 +1119,8 @@ static int __init net_ns_init(void)
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
RTNL_FLAG_DOIT_UNLOCKED);
-
- return 0;
}
-pure_initcall(net_ns_init);
-
static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
{
ops_pre_exit_list(ops, net_exit_list);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index bd62c01a2ec3..e25d359d84d9 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -423,11 +423,6 @@ static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
unsigned int dma_sync_size, bool allow_direct)
{
- /* It is not the last user for the page frag case */
- if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
- page_pool_atomic_sub_frag_count_return(page, 1))
- return NULL;
-
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
@@ -471,8 +466,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
return NULL;
}
-void page_pool_put_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
if (page && !page_pool_recycle_in_ring(pool, page)) {
@@ -480,7 +475,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,
page_pool_return_page(pool, page);
}
}
-EXPORT_SYMBOL(page_pool_put_page);
+EXPORT_SYMBOL(page_pool_put_defragged_page);
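With the ring-recycling path exported as page_pool_put_defragged_page(), the frag-count check that used to sit at the top of __page_pool_put_page() presumably moves into an inline page_pool_put_page() wrapper in include/net/page_pool.h, built around the new page_pool_is_last_frag(). A sketch of what such a wrapper would look like (an assumption, not a quote of the header):

	static inline void page_pool_put_page(struct page_pool *pool,
					      struct page *page,
					      unsigned int dma_sync_size,
					      bool allow_direct)
	{
		/* Only the last user of a fragmented page really releases it. */
		if (!page_pool_is_last_frag(pool, page))
			return;

		page_pool_put_defragged_page(pool, page, dma_sync_size,
					     allow_direct);
	}
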
/* Caller must not use data area after call, as this function overwrites it */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
@@ -491,6 +486,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
for (i = 0; i < count; i++) {
struct page *page = virt_to_head_page(data[i]);
+ /* It is not the last user for the page frag case */
+ if (!page_pool_is_last_frag(pool, page))
+ continue;
+
page = __page_pool_put_page(pool, page, -1, false);
/* Approved for bulk recycling in ptr_ring cache */
if (page)
@@ -526,8 +525,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
long drain_count = BIAS_MAX - pool->frag_users;
/* Some user is still using the page frag */
- if (likely(page_pool_atomic_sub_frag_count_return(page,
- drain_count)))
+ if (likely(page_pool_defrag_page(page, drain_count)))
return NULL;
if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
@@ -548,8 +546,7 @@ static void page_pool_free_frag(struct page_pool *pool)
pool->frag_page = NULL;
- if (!page ||
- page_pool_atomic_sub_frag_count_return(page, drain_count))
+ if (!page || page_pool_defrag_page(page, drain_count))
return;
page_pool_return_page(pool, page);
@@ -588,7 +585,7 @@ frag_reset:
pool->frag_users = 1;
*offset = 0;
pool->frag_offset = size;
- page_pool_set_frag_count(page, BIAS_MAX);
+ page_pool_fragment_page(page, BIAS_MAX);
return page;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2fb8eb6791e8..20a9e1686453 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void)
* setup_net() and cleanup_net() are not possible.
*/
for_each_net(net) {
- if (net->dev_unreg_count > 0) {
+ if (atomic_read(&net->dev_unreg_count) > 0) {
unregistering = true;
break;
}
@@ -5407,8 +5407,10 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
return -ENODEV;
filter_mask = ifsm->filter_mask;
- if (!filter_mask)
+ if (!filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
return -EINVAL;
+ }
nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
if (!nskb)
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ff806d71921..09d31a7dc68f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1447,6 +1447,15 @@ set_sndbuf:
break;
}
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Paired with READ_ONCE() in tcp_rtx_synack() */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reserved_mem;
break;
+ case SO_TXREHASH:
+ v.val = sk->sk_txrehash;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2266,6 +2279,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+ sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
@@ -2611,7 +2625,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
switch (cmsg->cmsg_type) {
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -3278,6 +3293,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
+ sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
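Per the validation above, SO_TXREHASH accepts -1, 0 or 1, with -1 presumably deferring to the per-namespace net.core.txrehash sysctl introduced elsewhere in this series. A hypothetical user-space sketch (the numeric fallback for SO_TXREHASH is an assumption for older headers):

	#include <sys/socket.h>
	#include <stdio.h>

	#ifndef SO_TXREHASH
	#define SO_TXREHASH 74	/* assumed value from asm-generic/socket.h */
	#endif

	int enable_txrehash(int fd)
	{
		int val = 1;	/* 1 = enable, 0 = disable, -1 = follow sysctl (assumed) */

		if (setsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, sizeof(val)) < 0) {
			perror("setsockopt(SO_TXREHASH)");
			return -1;
		}
		return 0;
	}
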
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1827669eedd6..2d213c4011db 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1416,38 +1416,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
- struct bpf_prog **pprog;
if (!progs)
return -EOPNOTSUPP;
switch (which) {
case BPF_SK_MSG_VERDICT:
- pprog = &progs->msg_parser;
+ *pprog = &progs->msg_parser;
break;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- pprog = &progs->stream_parser;
+ *pprog = &progs->stream_parser;
break;
#endif
case BPF_SK_SKB_STREAM_VERDICT:
if (progs->skb_verdict)
return -EBUSY;
- pprog = &progs->stream_verdict;
+ *pprog = &progs->stream_verdict;
break;
case BPF_SK_SKB_VERDICT:
if (progs->stream_verdict)
return -EBUSY;
- pprog = &progs->skb_verdict;
+ *pprog = &progs->skb_verdict;
break;
default:
return -EOPNOTSUPP;
}
+ return 0;
+}
+
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which)
+{
+ struct bpf_prog **pprog;
+ int ret;
+
+ ret = sock_map_prog_lookup(map, &pprog, which);
+ if (ret)
+ return ret;
+
if (old)
return psock_replace_prog(pprog, prog, old);
@@ -1455,6 +1467,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
return 0;
}
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd;
+ struct bpf_prog **pprog;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct fd f;
+ u32 id = 0;
+ int ret;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ rcu_read_lock();
+
+ ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+ if (ret)
+ goto end;
+
+ prog = *pprog;
+ prog_cnt = !prog ? 0 : 1;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ goto end;
+
+ /* we do not hold the refcnt, the bpf prog may be released
+ * asynchronously and the id would be set to 0.
+ */
+ id = data_race(prog->aux->id);
+ if (id == 0)
+ prog_cnt = 0;
+
+end:
+ rcu_read_unlock();
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) ||
+ (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) ||
+ copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ ret = -EFAULT;
+
+ fdput(f);
+ return ret;
+}
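Since sockmap/sockhash now answer BPF_PROG_QUERY, user space can presumably inspect the attached program through libbpf's bpf_prog_query(); a small sketch, with the attach type chosen only as an example and the map fd assumed to exist:

	#include <stdio.h>
	#include <bpf/bpf.h>

	static void show_sockmap_prog(int map_fd)
	{
		__u32 prog_ids[1] = {};
		__u32 prog_cnt = 1;
		__u32 attach_flags = 0;

		if (bpf_prog_query(map_fd, BPF_SK_SKB_STREAM_VERDICT, 0,
				   &attach_flags, prog_ids, &prog_cnt)) {
			perror("bpf_prog_query");
			return;
		}

		if (prog_cnt)
			printf("stream verdict prog id: %u\n", prog_ids[0]);
		else
			printf("no stream verdict prog attached\n");
	}
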
+
static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
{
switch (link->map->map_type) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7b4d485aac7a..dbeb8ecbcd98 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
+ {
+ .procname = "txrehash",
+ .data = &init_net.core.sysctl_txrehash,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dou8vec_minmax,
+ },
{ }
};
@@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
static __net_init int sysctl_core_net_init(struct net *net)
{
- struct ctl_table *tbl;
+ struct ctl_table *tbl, *tmp;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
@@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
if (tbl == NULL)
goto err_dup;
- tbl[0].data = &net->core.sysctl_somaxconn;
+ for (tmp = tbl; tmp->procname; tmp++)
+ tmp->data += (char *)net - (char *)&init_net;
/* Don't export any sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) {
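The loop above re-points each duplicated table entry at this namespace's copy of the field instead of patching entries one by one, which is why the new "txrehash" entry needs no dedicated fixup. A short comment-only sketch of the arithmetic, using the somaxconn field as the example:

	/*
	 *	offset = (char *)&init_net.core.sysctl_somaxconn - (char *)&init_net;
	 *	tbl[i].data = (char *)net + offset;
	 *
	 * is what tmp->data += (char *)net - (char *)&init_net computes for
	 * every entry, relying on every netns sharing the struct net layout.
	 */
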
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 7aba35504986..361df312ee7f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -162,8 +162,9 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
}
/* Returns 0 on success, negative on failure */
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index, unsigned int napi_id)
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ struct net_device *dev, u32 queue_index,
+ unsigned int napi_id, u32 frag_size)
{
if (!dev) {
WARN(1, "Missing net_device from driver");
@@ -185,11 +186,12 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
xdp_rxq->napi_id = napi_id;
+ xdp_rxq->frag_size = frag_size;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;
}
-EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg);
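Drivers that support XDP multi-buffer presumably register each RX queue with the fragment size they really allocate, while legacy callers keep using a wrapper that passes a frag_size of 0. A hedged driver-side sketch, with the foo_* structures and the PAGE_SIZE choice being assumptions:

	static int foo_setup_rxq(struct foo_rx_ring *ring, struct net_device *dev,
				 u32 queue_index, unsigned int napi_id)
	{
		int err;

		/* Tell the XDP core how big each fragment buffer is, so
		 * helpers like bpf_xdp_adjust_tail() can use the tailroom.
		 */
		err = __xdp_rxq_info_reg(&ring->xdp_rxq, dev, queue_index,
					 napi_id, PAGE_SIZE);
		if (err)
			return err;

		return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
						  MEM_TYPE_PAGE_POOL,
						  ring->page_pool);
	}
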
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
{
@@ -369,8 +371,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- struct xdp_buff *xdp)
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ struct xdp_buff *xdp)
{
struct xdp_mem_allocator *xa;
struct page *page;
@@ -406,12 +408,38 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
void xdp_return_frame(struct xdp_frame *xdpf)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, false, NULL);
+ }
+out:
__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, true, NULL);
+ }
+out:
__xdp_return(xdpf->data, &xdpf->mem, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -447,7 +475,7 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
struct xdp_mem_allocator *xa;
if (mem->type != MEM_TYPE_PAGE_POOL) {
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ xdp_return_frame(xdpf);
return;
}
@@ -466,12 +494,38 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
}
+ if (unlikely(xdp_frame_has_frags(xdpf))) {
+ struct skb_shared_info *sinfo;
+ int i;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ bq->q[bq->count++] = skb_frag_address(frag);
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+ }
+ }
bq->q[bq->count++] = xdpf->data;
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
void xdp_return_buff(struct xdp_buff *xdp)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
+ }
+out:
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
}
@@ -561,8 +615,14 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
struct net_device *dev)
{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
unsigned int headroom, frame_size;
void *hard_start;
+ u8 nr_frags;
+
+ /* xdp frags frame */
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ nr_frags = sinfo->nr_frags;
/* Part of headroom was reserved to xdpf */
headroom = sizeof(*xdpf) + xdpf->headroom;
@@ -582,6 +642,12 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_is_frag_pfmemalloc(xdpf));
+
/* Essential SKB info: protocol and skb->dev */
skb->protocol = eth_type_trans(skb, dev);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 5183e627468d..671c377f0889 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -136,11 +136,6 @@ static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
}
-static inline u64 max48(const u64 seq1, const u64 seq2)
-{
- return after48(seq1, seq2) ? seq1 : seq2;
-}
-
/**
* dccp_loss_count - Approximate the number of lost data packets in a burst loss
* @s1: last known sequence number before the loss ('hole')
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 0ea29270d7e5..ae662567a6cb 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1030,15 +1030,9 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
inet_ctl_sock_destroy(pn->v4_ctl_sk);
}
-static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&dccp_hashinfo, AF_INET);
-}
-
static struct pernet_operations dccp_v4_ops = {
.init = dccp_v4_init_net,
.exit = dccp_v4_exit_net,
- .exit_batch = dccp_v4_exit_batch,
.id = &dccp_v4_pernet_id,
.size = sizeof(struct dccp_v4_pernet),
};
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index fa663518fa0e..eab3bd1ee9a0 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1115,15 +1115,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
inet_ctl_sock_destroy(pn->v6_ctl_sk);
}
-static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&dccp_hashinfo, AF_INET6);
-}
-
static struct pernet_operations dccp_v6_ops = {
.init = dccp_v6_init_net,
.exit = dccp_v6_exit_net,
- .exit_batch = dccp_v6_exit_batch,
.id = &dccp_v6_pernet_id,
.size = sizeof(struct dccp_v6_pernet),
};
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 91e7a2202697..64d805b27add 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -22,6 +22,7 @@
#include "feat.h"
struct inet_timewait_death_row dccp_death_row = {
+ .tw_refcount = REFCOUNT_INIT(1),
.sysctl_max_tw_buckets = NR_FILE * 2,
.hashinfo = &dccp_hashinfo,
};
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index dcad3100b164..408b79a28cd4 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -15,6 +15,7 @@
#include <linux/of.h>
#include <linux/of_net.h>
#include <net/devlink.h>
+#include <net/sch_generic.h>
#include "dsa_priv.h"
@@ -452,8 +453,10 @@ static int dsa_port_setup(struct dsa_port *dp)
return 0;
mutex_init(&dp->addr_lists_lock);
+ mutex_init(&dp->vlans_lock);
INIT_LIST_HEAD(&dp->fdbs);
INIT_LIST_HEAD(&dp->mdbs);
+ INIT_LIST_HEAD(&dp->vlans);
if (ds->ops->port_setup) {
err = ds->ops->port_setup(ds, dp->index);
@@ -562,6 +565,7 @@ static void dsa_port_teardown(struct dsa_port *dp)
struct dsa_switch *ds = dp->ds;
struct dsa_mac_addr *a, *tmp;
struct net_device *slave;
+ struct dsa_vlan *v, *n;
if (!dp->setup)
return;
@@ -602,6 +606,11 @@ static void dsa_port_teardown(struct dsa_port *dp)
kfree(a);
}
+ list_for_each_entry_safe(v, n, &dp->vlans, list) {
+ list_del(&v->list);
+ kfree(v);
+ }
+
dp->setup = false;
}
@@ -1064,9 +1073,18 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
list_for_each_entry(dp, &dst->ports, list) {
if (dsa_port_is_cpu(dp)) {
- err = dsa_master_setup(dp->master, dp);
+ struct net_device *master = dp->master;
+ bool admin_up = (master->flags & IFF_UP) &&
+ !qdisc_tx_is_noop(master);
+
+ err = dsa_master_setup(master, dp);
if (err)
return err;
+
+ /* Replay master state event */
+ dsa_tree_master_admin_state_change(dst, master, admin_up);
+ dsa_tree_master_oper_state_change(dst, master,
+ netif_oper_up(master));
}
}
@@ -1081,9 +1099,19 @@ static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
rtnl_lock();
- list_for_each_entry(dp, &dst->ports, list)
- if (dsa_port_is_cpu(dp))
- dsa_master_teardown(dp->master);
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_cpu(dp)) {
+ struct net_device *master = dp->master;
+
+ /* Synthesizing an "admin down" state is sufficient for
+ * the switches to get a notification if the master is
+ * currently up and running.
+ */
+ dsa_tree_master_admin_state_change(dst, master, false);
+
+ dsa_master_teardown(master);
+ }
+ }
rtnl_unlock();
}
@@ -1279,6 +1307,52 @@ out_unlock:
return err;
}
+static void dsa_tree_master_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master)
+{
+ struct dsa_notifier_master_state_info info;
+ struct dsa_port *cpu_dp = master->dsa_ptr;
+
+ info.master = master;
+ info.operational = dsa_port_master_is_operational(cpu_dp);
+
+ dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info);
+}
+
+void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up)
+{
+ struct dsa_port *cpu_dp = master->dsa_ptr;
+ bool notify = false;
+
+ if ((dsa_port_master_is_operational(cpu_dp)) !=
+ (up && cpu_dp->master_oper_up))
+ notify = true;
+
+ cpu_dp->master_admin_up = up;
+
+ if (notify)
+ dsa_tree_master_state_change(dst, master);
+}
+
+void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up)
+{
+ struct dsa_port *cpu_dp = master->dsa_ptr;
+ bool notify = false;
+
+ if ((dsa_port_master_is_operational(cpu_dp)) !=
+ (cpu_dp->master_admin_up && up))
+ notify = true;
+
+ cpu_dp->master_oper_up = up;
+
+ if (notify)
+ dsa_tree_master_state_change(dst, master);
+}
+
static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
{
struct dsa_switch_tree *dst = ds->dst;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 23c79e91ac67..a37f0883676a 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -34,12 +34,15 @@ enum {
DSA_NOTIFIER_HOST_MDB_DEL,
DSA_NOTIFIER_VLAN_ADD,
DSA_NOTIFIER_VLAN_DEL,
+ DSA_NOTIFIER_HOST_VLAN_ADD,
+ DSA_NOTIFIER_HOST_VLAN_DEL,
DSA_NOTIFIER_MTU,
DSA_NOTIFIER_TAG_PROTO,
DSA_NOTIFIER_TAG_PROTO_CONNECT,
DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
+ DSA_NOTIFIER_MASTER_STATE_CHANGE,
};
/* DSA_NOTIFIER_AGEING_TIME */
@@ -109,6 +112,12 @@ struct dsa_notifier_tag_8021q_vlan_info {
u16 vid;
};
+/* DSA_NOTIFIER_MASTER_STATE_CHANGE */
+struct dsa_notifier_master_state_info {
+ const struct net_device *master;
+ bool operational;
+};
+
struct dsa_switchdev_event_work {
struct dsa_switch *ds;
int port;
@@ -226,6 +235,11 @@ int dsa_port_vlan_add(struct dsa_port *dp,
struct netlink_ext_ack *extack);
int dsa_port_vlan_del(struct dsa_port *dp,
const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
int dsa_port_mrp_add(const struct dsa_port *dp,
const struct switchdev_obj_mrp *mrp);
int dsa_port_mrp_del(const struct dsa_port *dp,
@@ -481,6 +495,12 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
struct net_device *master,
const struct dsa_device_ops *tag_ops,
const struct dsa_device_ops *old_tag_ops);
+void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up);
+void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *master,
+ bool up);
unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
void dsa_bridge_num_put(const struct net_device *bridge_dev,
unsigned int bridge_num);
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 2199104ca7df..6ac393cc6ea7 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -330,8 +330,6 @@ static const struct attribute_group dsa_group = {
.attrs = dsa_slave_attrs,
};
-static struct lock_class_key dsa_master_addr_list_lock_key;
-
int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
{
struct dsa_switch *ds = cpu_dp->ds;
@@ -353,8 +351,6 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
wmb();
dev->dsa_ptr = cpu_dp;
- lockdep_set_class(&dev->addr_list_lock,
- &dsa_master_addr_list_lock_key);
dsa_master_set_promiscuity(dev, 1);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index bd78192e0e47..cca5cf686f74 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -904,6 +904,48 @@ int dsa_port_vlan_del(struct dsa_port *dp,
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
}
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_notifier_vlan_info info = {
+ .sw_index = dp->ds->index,
+ .port = dp->index,
+ .vlan = vlan,
+ .extack = extack,
+ };
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+ int err;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_ADD, &info);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ vlan_vid_add(cpu_dp->master, htons(ETH_P_8021Q), vlan->vid);
+
+ return err;
+}
+
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_notifier_vlan_info info = {
+ .sw_index = dp->ds->index,
+ .port = dp->index,
+ .vlan = vlan,
+ };
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+ int err;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_DEL, &info);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ vlan_vid_del(cpu_dp->master, htons(ETH_P_8021Q), vlan->vid);
+
+ return err;
+}
+
int dsa_port_mrp_add(const struct dsa_port *dp,
const struct switchdev_obj_mrp *mrp)
{
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 22241afcac81..f61e6b72ffbb 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -348,9 +348,8 @@ static int dsa_slave_vlan_add(struct net_device *dev,
const struct switchdev_obj *obj,
struct netlink_ext_ack *extack)
{
- struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct switchdev_obj_port_vlan vlan;
+ struct switchdev_obj_port_vlan *vlan;
int err;
if (dsa_port_skip_vlan_configuration(dp)) {
@@ -358,14 +357,14 @@ static int dsa_slave_vlan_add(struct net_device *dev,
return 0;
}
- vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
/* Deny adding a bridge VLAN when there is already an 802.1Q upper with
* the same VID.
*/
if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) {
rcu_read_lock();
- err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan);
+ err = dsa_slave_vlan_check_for_8021q_uppers(dev, vlan);
rcu_read_unlock();
if (err) {
NL_SET_ERR_MSG_MOD(extack,
@@ -374,21 +373,36 @@ static int dsa_slave_vlan_add(struct net_device *dev,
}
}
- err = dsa_port_vlan_add(dp, &vlan, extack);
- if (err)
- return err;
+ return dsa_port_vlan_add(dp, vlan, extack);
+}
+
+/* Offload a VLAN installed on the bridge or on a foreign interface by
+ * installing it as a VLAN towards the CPU port.
+ */
+static int dsa_slave_host_vlan_add(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct switchdev_obj_port_vlan vlan;
+
+ /* Do nothing if this is a software bridge */
+ if (!dp->bridge)
+ return -EOPNOTSUPP;
+
+ if (dsa_port_skip_vlan_configuration(dp)) {
+ NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+ return 0;
+ }
+
+ vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
- /* We need the dedicated CPU port to be a member of the VLAN as well.
- * Even though drivers often handle CPU membership in special ways,
+ /* Even though drivers often handle CPU membership in special ways,
* it doesn't make sense to program a PVID, so clear this flag.
*/
vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
- err = dsa_port_vlan_add(dp->cpu_dp, &vlan, extack);
- if (err)
- return err;
-
- return vlan_vid_add(master, htons(ETH_P_8021Q), vlan.vid);
+ return dsa_port_host_vlan_add(dp, &vlan, extack);
}
static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx,
@@ -415,10 +429,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx,
err = dsa_port_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
- if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
- return -EOPNOTSUPP;
-
- err = dsa_slave_vlan_add(dev, obj, extack);
+ if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+ err = dsa_slave_vlan_add(dev, obj, extack);
+ else
+ err = dsa_slave_host_vlan_add(dev, obj, extack);
break;
case SWITCHDEV_OBJ_ID_MRP:
if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
@@ -444,26 +458,33 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx,
static int dsa_slave_vlan_del(struct net_device *dev,
const struct switchdev_obj *obj)
{
- struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
struct switchdev_obj_port_vlan *vlan;
- int err;
if (dsa_port_skip_vlan_configuration(dp))
return 0;
vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
- /* Do not deprogram the CPU port as it may be shared with other user
- * ports which can be members of this VLAN as well.
- */
- err = dsa_port_vlan_del(dp, vlan);
- if (err)
- return err;
+ return dsa_port_vlan_del(dp, vlan);
+}
- vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid);
+static int dsa_slave_host_vlan_del(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct switchdev_obj_port_vlan *vlan;
- return 0;
+ /* Do nothing if this is a software bridge */
+ if (!dp->bridge)
+ return -EOPNOTSUPP;
+
+ if (dsa_port_skip_vlan_configuration(dp))
+ return 0;
+
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+ return dsa_port_host_vlan_del(dp, vlan);
}
static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx,
@@ -489,10 +510,10 @@ static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx,
err = dsa_port_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
- if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
- return -EOPNOTSUPP;
-
- err = dsa_slave_vlan_del(dev, obj);
+ if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+ err = dsa_slave_vlan_del(dev, obj);
+ else
+ err = dsa_slave_host_vlan_del(dev, obj);
break;
case SWITCHDEV_OBJ_ID_MRP:
if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
@@ -515,26 +536,6 @@ static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx,
return err;
}
-static int dsa_slave_get_port_parent_id(struct net_device *dev,
- struct netdev_phys_item_id *ppid)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
-
- /* For non-legacy ports, devlink is used and it takes
- * care of the name generation. This ndo implementation
- * should be removed with legacy support.
- */
- if (dp->ds->devlink)
- return -EOPNOTSUPP;
-
- ppid->id_len = sizeof(dst->index);
- memcpy(&ppid->id, &dst->index, ppid->id_len);
-
- return 0;
-}
-
static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
struct sk_buff *skb)
{
@@ -973,24 +974,6 @@ static void dsa_slave_poll_controller(struct net_device *dev)
}
#endif
-static int dsa_slave_get_phys_port_name(struct net_device *dev,
- char *name, size_t len)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- /* For non-legacy ports, devlink is used and it takes
- * care of the name generation. This ndo implementation
- * should be removed with legacy support.
- */
- if (dp->ds->devlink)
- return -EOPNOTSUPP;
-
- if (snprintf(name, len, "p%d", dp->index) >= len)
- return -EINVAL;
-
- return 0;
-}
-
static struct dsa_mall_tc_entry *
dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
{
@@ -1385,7 +1368,6 @@ static int dsa_slave_get_ts_info(struct net_device *dev,
static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
u16 vid)
{
- struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
struct switchdev_obj_port_vlan vlan = {
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
@@ -1405,7 +1387,7 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
}
/* And CPU port... */
- ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &extack);
+ ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
if (ret) {
if (extack._msg)
netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
@@ -1413,13 +1395,12 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
return ret;
}
- return vlan_vid_add(master, proto, vid);
+ return 0;
}
static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
u16 vid)
{
- struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
struct switchdev_obj_port_vlan vlan = {
.vid = vid,
@@ -1428,16 +1409,11 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
};
int err;
- /* Do not deprogram the CPU port as it may be shared with other user
- * ports which can be members of this VLAN as well.
- */
err = dsa_port_vlan_del(dp, &vlan);
if (err)
return err;
- vlan_vid_del(master, proto, vid);
-
- return 0;
+ return dsa_port_host_vlan_del(dp, &vlan);
}
static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg)
@@ -1747,7 +1723,7 @@ static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- return dp->ds->devlink ? &dp->devlink_port : NULL;
+ return &dp->devlink_port;
}
static void dsa_slave_get_stats64(struct net_device *dev,
@@ -1792,10 +1768,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup,
.ndo_poll_controller = dsa_slave_poll_controller,
#endif
- .ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
.ndo_setup_tc = dsa_slave_setup_tc,
.ndo_get_stats64 = dsa_slave_get_stats64,
- .ndo_get_port_parent_id = dsa_slave_get_port_parent_id,
.ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
.ndo_get_devlink_port = dsa_slave_get_devlink_port,
@@ -1917,15 +1891,6 @@ void dsa_slave_setup_tagger(struct net_device *slave)
slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
}
-static struct lock_class_key dsa_slave_netdev_xmit_lock_key;
-static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock,
- &dsa_slave_netdev_xmit_lock_key);
-}
-
int dsa_slave_suspend(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
@@ -1988,9 +1953,6 @@ int dsa_slave_create(struct dsa_port *port)
slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
- netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
- NULL);
-
SET_NETDEV_DEV(slave_dev, port->ds->dev);
slave_dev->dev.of_node = port->dn;
slave_dev->vlan_features = master->vlan_features;
@@ -2346,6 +2308,36 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
err = dsa_port_lag_change(dp, info->lower_state_info);
return notifier_from_errno(err);
}
+ case NETDEV_CHANGE:
+ case NETDEV_UP: {
+ /* Track the state of the master port.
+ * A DSA driver may require the master port (and indirectly
+ * the tagger) to be available for some special operations.
+ */
+ if (netdev_uses_dsa(dev)) {
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->ds->dst;
+
+ /* Track when the master port is UP */
+ dsa_tree_master_oper_state_change(dst, dev,
+ netif_oper_up(dev));
+
+ /* Track when the master port is ready and can accept
+ * packets.
+ * A NETDEV_UP event is not enough to flag a port as ready.
+ * We also have to wait for linkwatch_do_dev to call
+ * dev_activate and emit a NETDEV_CHANGE event.
+ * We check whether a master port is ready by checking
+ * that the dev has a qdisc assigned and it is not the
+ * noop qdisc.
+ */
+ dsa_tree_master_admin_state_change(dst, dev,
+ !qdisc_tx_is_noop(dev));
+
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+ }
case NETDEV_GOING_DOWN: {
struct dsa_port *dp, *cpu_dp;
struct dsa_switch_tree *dst;
@@ -2357,6 +2349,8 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
cpu_dp = dev->dsa_ptr;
dst = cpu_dp->ds->dst;
+ dsa_tree_master_admin_state_change(dst, dev, false);
+
list_for_each_entry(dp, &dst->ports, list) {
if (!dsa_port_is_user(dp))
continue;
@@ -2550,14 +2544,16 @@ static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
switch (event) {
case SWITCHDEV_PORT_OBJ_ADD:
- err = switchdev_handle_port_obj_add(dev, ptr,
- dsa_slave_dev_check,
- dsa_slave_port_obj_add);
+ err = switchdev_handle_port_obj_add_foreign(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_foreign_dev_check,
+ dsa_slave_port_obj_add);
return notifier_from_errno(err);
case SWITCHDEV_PORT_OBJ_DEL:
- err = switchdev_handle_port_obj_del(dev, ptr,
- dsa_slave_dev_check,
- dsa_slave_port_obj_del);
+ err = switchdev_handle_port_obj_del_foreign(dev, ptr,
+ dsa_slave_dev_check,
+ dsa_foreign_dev_check,
+ dsa_slave_port_obj_del);
return notifier_from_errno(err);
case SWITCHDEV_PORT_ATTR_SET:
err = switchdev_handle_port_attr_set(dev, ptr,
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index e3c7d2627a61..0bb3987bd4e6 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -113,26 +113,15 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds,
return dsa_tag_8021q_bridge_join(ds, info);
}
-static int dsa_switch_bridge_leave(struct dsa_switch *ds,
- struct dsa_notifier_bridge_info *info)
+static int dsa_switch_sync_vlan_filtering(struct dsa_switch *ds,
+ struct dsa_notifier_bridge_info *info)
{
- struct dsa_switch_tree *dst = ds->dst;
struct netlink_ext_ack extack = {0};
bool change_vlan_filtering = false;
bool vlan_filtering;
struct dsa_port *dp;
int err;
- if (dst->index == info->tree_index && ds->index == info->sw_index &&
- ds->ops->port_bridge_leave)
- ds->ops->port_bridge_leave(ds, info->port, info->bridge);
-
- if ((dst->index != info->tree_index || ds->index != info->sw_index) &&
- ds->ops->crosschip_bridge_leave)
- ds->ops->crosschip_bridge_leave(ds, info->tree_index,
- info->sw_index, info->port,
- info->bridge);
-
if (ds->needs_standalone_vlan_filtering &&
!br_vlan_enabled(info->bridge.dev)) {
change_vlan_filtering = true;
@@ -172,6 +161,31 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
return err;
}
+ return 0;
+}
+
+static int dsa_switch_bridge_leave(struct dsa_switch *ds,
+ struct dsa_notifier_bridge_info *info)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+ int err;
+
+ if (dst->index == info->tree_index && ds->index == info->sw_index &&
+ ds->ops->port_bridge_leave)
+ ds->ops->port_bridge_leave(ds, info->port, info->bridge);
+
+ if ((dst->index != info->tree_index || ds->index != info->sw_index) &&
+ ds->ops->crosschip_bridge_leave)
+ ds->ops->crosschip_bridge_leave(ds, info->tree_index,
+ info->sw_index, info->port,
+ info->bridge);
+
+ if (ds->dst->index == info->tree_index && ds->index == info->sw_index) {
+ err = dsa_switch_sync_vlan_filtering(ds, info);
+ if (err)
+ return err;
+ }
+
return dsa_tag_8021q_bridge_leave(ds, info);
}
@@ -544,6 +558,7 @@ static int dsa_switch_host_mdb_del(struct dsa_switch *ds,
return err;
}
+/* Port VLANs match on the targeted port and on all DSA ports */
static bool dsa_port_vlan_match(struct dsa_port *dp,
struct dsa_notifier_vlan_info *info)
{
@@ -556,6 +571,126 @@ static bool dsa_port_vlan_match(struct dsa_port *dp,
return false;
}
+/* Host VLANs match on the targeted port's CPU port, and on all DSA ports
+ * (upstream and downstream) of that switch and its upstream switches.
+ */
+static bool dsa_port_host_vlan_match(struct dsa_port *dp,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *targeted_dp, *cpu_dp;
+ struct dsa_switch *targeted_ds;
+
+ targeted_ds = dsa_switch_find(dp->ds->dst->index, info->sw_index);
+ targeted_dp = dsa_to_port(targeted_ds, info->port);
+ cpu_dp = targeted_dp->cpu_dp;
+
+ if (dsa_switch_is_upstream_of(dp->ds, targeted_ds))
+ return dsa_port_is_dsa(dp) || dp == cpu_dp;
+
+ return false;
+}
+
+static struct dsa_vlan *dsa_vlan_find(struct list_head *vlan_list,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_vlan *v;
+
+ list_for_each_entry(v, vlan_list, list)
+ if (v->vid == vlan->vid)
+ return v;
+
+ return NULL;
+}
+
+static int dsa_port_do_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct dsa_vlan *v;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports. */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
+ return ds->ops->port_vlan_add(ds, port, vlan, extack);
+
+ /* No need to propagate on shared ports the existing VLANs that were
+ * re-notified after just the flags have changed. This would cause a
+ * refcount bump which we need to avoid, since it unbalances the
+ * additions with the deletions.
+ */
+ if (vlan->changed)
+ return 0;
+
+ mutex_lock(&dp->vlans_lock);
+
+ v = dsa_vlan_find(&dp->vlans, vlan);
+ if (v) {
+ refcount_inc(&v->refcount);
+ goto out;
+ }
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ if (err) {
+ kfree(v);
+ goto out;
+ }
+
+ v->vid = vlan->vid;
+ refcount_set(&v->refcount, 1);
+ list_add_tail(&v->list, &dp->vlans);
+
+out:
+ mutex_unlock(&dp->vlans_lock);
+
+ return err;
+}
+
+static int dsa_port_do_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct dsa_vlan *v;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
+ return ds->ops->port_vlan_del(ds, port, vlan);
+
+ mutex_lock(&dp->vlans_lock);
+
+ v = dsa_vlan_find(&dp->vlans, vlan);
+ if (!v) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&v->refcount))
+ goto out;
+
+ err = ds->ops->port_vlan_del(ds, port, vlan);
+ if (err) {
+ refcount_set(&v->refcount, 1);
+ goto out;
+ }
+
+ list_del(&v->list);
+ kfree(v);
+
+out:
+ mutex_unlock(&dp->vlans_lock);
+
+ return err;
+}
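
The refcounting above exists because several user ports behind the same CPU or DSA port can request the same VID; the shared port must stay programmed until the last user is gone. A minimal userspace sketch of that pattern (plain list plus counter, not the kernel helpers) is:

#include <stdio.h>
#include <stdlib.h>

/* Toy model of the shared-port VLAN refcounting in dsa_port_do_vlan_add/del. */
struct toy_vlan {
	unsigned short vid;
	unsigned int refcount;
	struct toy_vlan *next;
};

static struct toy_vlan *vlans;
static unsigned int hw_writes;	/* how many times "hardware" was programmed */

static struct toy_vlan *toy_find(unsigned short vid)
{
	struct toy_vlan *v;

	for (v = vlans; v; v = v->next)
		if (v->vid == vid)
			return v;
	return NULL;
}

static int toy_vlan_add(unsigned short vid)
{
	struct toy_vlan *v = toy_find(vid);

	if (v) {
		v->refcount++;		/* already in hardware, just bump */
		return 0;
	}
	v = calloc(1, sizeof(*v));
	if (!v)
		return -1;
	v->vid = vid;
	v->refcount = 1;
	v->next = vlans;
	vlans = v;
	hw_writes++;			/* stands in for ds->ops->port_vlan_add() */
	return 0;
}

static void toy_vlan_del(unsigned short vid)
{
	struct toy_vlan **pv, *v;

	for (pv = &vlans; (v = *pv); pv = &v->next) {
		if (v->vid != vid)
			continue;
		if (--v->refcount)
			return;		/* other users remain, keep it */
		*pv = v->next;		/* last user gone: really delete */
		free(v);
		return;
	}
}

int main(void)
{
	toy_vlan_add(100);	/* first user port joins VID 100 */
	toy_vlan_add(100);	/* second user port joins the same VID */
	toy_vlan_del(100);	/* still one user left, stays programmed */
	toy_vlan_del(100);	/* last user gone, entry removed */
	printf("hardware programmed %u time(s)\n", hw_writes); /* prints 1 */
	return 0;
}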
+
static int dsa_switch_vlan_add(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
@@ -567,8 +702,8 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
dsa_switch_for_each_port(dp, ds) {
if (dsa_port_vlan_match(dp, info)) {
- err = ds->ops->port_vlan_add(ds, dp->index, info->vlan,
- info->extack);
+ err = dsa_port_do_vlan_add(dp, info->vlan,
+ info->extack);
if (err)
return err;
}
@@ -580,15 +715,61 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
static int dsa_switch_vlan_del(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
+ struct dsa_port *dp;
+ int err;
+
if (!ds->ops->port_vlan_del)
return -EOPNOTSUPP;
- if (ds->index == info->sw_index)
- return ds->ops->port_vlan_del(ds, info->port, info->vlan);
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_del(dp, info->vlan);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_host_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_add(dp, info->vlan,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_host_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_del(dp, info->vlan);
+ if (err)
+ return err;
+ }
+ }
- /* Do not deprogram the DSA links as they may be used as conduit
- * for other VLAN members in the fabric.
- */
return 0;
}
@@ -683,6 +864,18 @@ dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
return 0;
}
+static int
+dsa_switch_master_state_change(struct dsa_switch *ds,
+ struct dsa_notifier_master_state_info *info)
+{
+ if (!ds->ops->master_state_change)
+ return 0;
+
+ ds->ops->master_state_change(ds, info->master, info->operational);
+
+ return 0;
+}
+
static int dsa_switch_event(struct notifier_block *nb,
unsigned long event, void *info)
{
@@ -738,6 +931,12 @@ static int dsa_switch_event(struct notifier_block *nb,
case DSA_NOTIFIER_VLAN_DEL:
err = dsa_switch_vlan_del(ds, info);
break;
+ case DSA_NOTIFIER_HOST_VLAN_ADD:
+ err = dsa_switch_host_vlan_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_VLAN_DEL:
+ err = dsa_switch_host_vlan_del(ds, info);
+ break;
case DSA_NOTIFIER_MTU:
err = dsa_switch_mtu(ds, info);
break;
@@ -756,6 +955,9 @@ static int dsa_switch_event(struct notifier_block *nb,
case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
err = dsa_switch_tag_8021q_vlan_del(ds, info);
break;
+ case DSA_NOTIFIER_MASTER_STATE_CHANGE:
+ err = dsa_switch_master_state_change(ds, info);
+ break;
default:
err = -EOPNOTSUPP;
break;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 27712a81c967..114f663332d0 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -577,14 +577,14 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id)
{
u16 vid, tci;
- skb_push_rcsum(skb, ETH_HLEN);
if (skb_vlan_tag_present(skb)) {
tci = skb_vlan_tag_get(skb);
__vlan_hwaccel_clear_tag(skb);
} else {
+ skb_push_rcsum(skb, ETH_HLEN);
__skb_vlan_pop(skb, &tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
}
- skb_pull_rcsum(skb, ETH_HLEN);
vid = tci & VLAN_VID_MASK;
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 68982b2789a5..bd6f1d0e5372 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -32,6 +32,13 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
if (!xmit_work_fn || !xmit_worker)
return NULL;
+ /* PTP over IP packets need UDP checksumming. We may have inherited
+ * NETIF_F_HW_CSUM from the DSA master, but these packets are not sent
+ * through the DSA master, so calculate the checksum here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
if (!xmit_work)
return NULL;
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 1ea9401b8ace..57d2e00f1e5d 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -4,30 +4,12 @@
*/
#include <linux/etherdevice.h>
+#include <linux/bitfield.h>
+#include <net/dsa.h>
+#include <linux/dsa/tag_qca.h>
#include "dsa_priv.h"
-#define QCA_HDR_LEN 2
-#define QCA_HDR_VERSION 0x2
-
-#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14)
-#define QCA_HDR_RECV_VERSION_S 14
-#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11)
-#define QCA_HDR_RECV_PRIORITY_S 11
-#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6)
-#define QCA_HDR_RECV_TYPE_S 6
-#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3)
-#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
-
-#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14)
-#define QCA_HDR_XMIT_VERSION_S 14
-#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11)
-#define QCA_HDR_XMIT_PRIORITY_S 11
-#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8)
-#define QCA_HDR_XMIT_CONTROL_S 8
-#define QCA_HDR_XMIT_FROM_CPU BIT(7)
-#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0)
-
static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
@@ -40,8 +22,9 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
phdr = dsa_etype_header_pos_tx(skb);
/* Set the version field, and set destination port information */
- hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
- QCA_HDR_XMIT_FROM_CPU | BIT(dp->index);
+ hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION);
+ hdr |= QCA_HDR_XMIT_FROM_CPU;
+ hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(dp->index));
*phdr = htons(hdr);
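
For readers less familiar with FIELD_PREP()/FIELD_GET(): they derive the shift from the mask, so the new code is equivalent to the removed open-coded shifts. A standalone sketch of the idea, using the bit layout from the old macros (version in bits 15:14, from-CPU in bit 7, destination port bitmap in bits 6:0); field_prep/field_get here are local stand-ins, not the kernel macros:

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the kernel's FIELD_PREP()/FIELD_GET() semantics:
 * the shift is derived from the lowest set bit of the mask.
 */
static uint16_t field_prep(uint16_t mask, uint16_t val)
{
	return (uint16_t)((val << __builtin_ctz(mask)) & mask);
}

static uint16_t field_get(uint16_t mask, uint16_t word)
{
	return (uint16_t)((word & mask) >> __builtin_ctz(mask));
}

#define HDR_VERSION_MASK 0xC000u /* bits 15:14 */
#define HDR_FROM_CPU     0x0080u /* bit 7 */
#define HDR_DP_MASK      0x007Fu /* bits 6:0 */

int main(void)
{
	uint16_t hdr;

	/* Compose a header word for version 2, destined to port 3. */
	hdr = field_prep(HDR_VERSION_MASK, 0x2);
	hdr |= HDR_FROM_CPU;
	hdr |= field_prep(HDR_DP_MASK, 1u << 3);

	printf("hdr=0x%04x version=%u port_bitmap=0x%02x\n",
	       hdr, field_get(HDR_VERSION_MASK, hdr),
	       field_get(HDR_DP_MASK, hdr));
	return 0;
}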
@@ -50,10 +33,17 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
{
- u8 ver;
- u16 hdr;
- int port;
+ struct qca_tagger_data *tagger_data;
+ struct dsa_port *dp = dev->dsa_ptr;
+ struct dsa_switch *ds = dp->ds;
+ u8 ver, pk_type;
__be16 *phdr;
+ int port;
+ u16 hdr;
+
+ BUILD_BUG_ON(sizeof(struct qca_mgmt_ethhdr) != QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN);
+
+ tagger_data = ds->tagger_data;
if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
return NULL;
@@ -62,16 +52,33 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
hdr = ntohs(*phdr);
/* Make sure the version is correct */
- ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+ ver = FIELD_GET(QCA_HDR_RECV_VERSION, hdr);
if (unlikely(ver != QCA_HDR_VERSION))
return NULL;
+ /* Get packet type */
+ pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
+
+ /* Ethernet mgmt read/write packet */
+ if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) {
+ if (likely(tagger_data->rw_reg_ack_handler))
+ tagger_data->rw_reg_ack_handler(ds, skb);
+ return NULL;
+ }
+
+ /* Ethernet MIB counter packet */
+ if (pk_type == QCA_HDR_RECV_TYPE_MIB) {
+ if (likely(tagger_data->mib_autocast_handler))
+ tagger_data->mib_autocast_handler(ds, skb);
+ return NULL;
+ }
+
/* Remove QCA tag and recalculate checksum */
skb_pull_rcsum(skb, QCA_HDR_LEN);
dsa_strip_etype_header(skb, QCA_HDR_LEN);
/* Get source port information */
- port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+ port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
skb->dev = dsa_master_find_slave(dev, 0, port);
if (!skb->dev)
@@ -80,12 +87,34 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
return skb;
}
+static int qca_tag_connect(struct dsa_switch *ds)
+{
+ struct qca_tagger_data *tagger_data;
+
+ tagger_data = kzalloc(sizeof(*tagger_data), GFP_KERNEL);
+ if (!tagger_data)
+ return -ENOMEM;
+
+ ds->tagger_data = tagger_data;
+
+ return 0;
+}
+
+static void qca_tag_disconnect(struct dsa_switch *ds)
+{
+ kfree(ds->tagger_data);
+ ds->tagger_data = NULL;
+}
+
static const struct dsa_device_ops qca_netdev_ops = {
.name = "qca",
.proto = DSA_TAG_PROTO_QCA,
+ .connect = qca_tag_connect,
+ .disconnect = qca_tag_disconnect,
.xmit = qca_tag_xmit,
.rcv = qca_tag_rcv,
.needed_headroom = QCA_HDR_LEN,
+ .promisc_on_master = true,
};
MODULE_LICENSE("GPL");
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
index c1d5f5e0fdc9..18a5035d3bee 100644
--- a/net/ethtool/rings.c
+++ b/net/ethtool/rings.c
@@ -53,7 +53,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base,
nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */
nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */
nla_total_size(sizeof(u32)) + /* _RINGS_TX */
- nla_total_size(sizeof(u32)); /* _RINGS_RX_BUF_LEN */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */
+ nla_total_size(sizeof(u8)); /* _RINGS_TCP_DATA_SPLIT */
}
static int rings_fill_reply(struct sk_buff *skb,
@@ -61,9 +62,11 @@ static int rings_fill_reply(struct sk_buff *skb,
const struct ethnl_reply_data *reply_base)
{
const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
- const struct kernel_ethtool_ringparam *kernel_ringparam = &data->kernel_ringparam;
+ const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
const struct ethtool_ringparam *ringparam = &data->ringparam;
+ WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED);
+
if ((ringparam->rx_max_pending &&
(nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX,
ringparam->rx_max_pending) ||
@@ -84,9 +87,11 @@ static int rings_fill_reply(struct sk_buff *skb,
ringparam->tx_max_pending) ||
nla_put_u32(skb, ETHTOOL_A_RINGS_TX,
ringparam->tx_pending))) ||
- (kernel_ringparam->rx_buf_len &&
- (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN,
- kernel_ringparam->rx_buf_len))))
+ (kr->rx_buf_len &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) ||
+ (kr->tcp_data_split &&
+ (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT,
+ kr->tcp_data_split))))
return -EMSGSIZE;
return 0;
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index 99f3af1a9d4d..fe6094e9a2db 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/debugfs.h>
+#include <linux/jhash.h>
#include "hsr_main.h"
#include "hsr_framereg.h"
@@ -28,6 +29,7 @@ hsr_node_table_show(struct seq_file *sfp, void *data)
{
struct hsr_priv *priv = (struct hsr_priv *)sfp->private;
struct hsr_node *node;
+ int i;
seq_printf(sfp, "Node Table entries for (%s) device\n",
(priv->prot_version == PRP_V1 ? "PRP" : "HSR"));
@@ -39,22 +41,28 @@ hsr_node_table_show(struct seq_file *sfp, void *data)
seq_puts(sfp, "DAN-H\n");
rcu_read_lock();
- list_for_each_entry_rcu(node, &priv->node_db, mac_list) {
- /* skip self node */
- if (hsr_addr_is_self(priv, node->macaddress_A))
- continue;
- seq_printf(sfp, "%pM ", &node->macaddress_A[0]);
- seq_printf(sfp, "%pM ", &node->macaddress_B[0]);
- seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]);
- seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]);
- seq_printf(sfp, "%14x, ", node->addr_B_port);
-
- if (priv->prot_version == PRP_V1)
- seq_printf(sfp, "%5x, %5x, %5x\n",
- node->san_a, node->san_b,
- (node->san_a == 0 && node->san_b == 0));
- else
- seq_printf(sfp, "%5x\n", 1);
+
+ for (i = 0; i < priv->hash_buckets; i++) {
+ hlist_for_each_entry_rcu(node, &priv->node_db[i], mac_list) {
+ /* skip self node */
+ if (hsr_addr_is_self(priv, node->macaddress_A))
+ continue;
+ seq_printf(sfp, "%pM ", &node->macaddress_A[0]);
+ seq_printf(sfp, "%pM ", &node->macaddress_B[0]);
+ seq_printf(sfp, "%10lx, ",
+ node->time_in[HSR_PT_SLAVE_A]);
+ seq_printf(sfp, "%10lx, ",
+ node->time_in[HSR_PT_SLAVE_B]);
+ seq_printf(sfp, "%14x, ", node->addr_B_port);
+
+ if (priv->prot_version == PRP_V1)
+ seq_printf(sfp, "%5x, %5x, %5x\n",
+ node->san_a, node->san_b,
+ (node->san_a == 0 &&
+ node->san_b == 0));
+ else
+ seq_printf(sfp, "%5x\n", 1);
+ }
}
rcu_read_unlock();
return 0;
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index e57fdad9ef94..7f250216433d 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -485,12 +485,16 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
{
bool unregister = false;
struct hsr_priv *hsr;
- int res;
+ int res, i;
hsr = netdev_priv(hsr_dev);
INIT_LIST_HEAD(&hsr->ports);
- INIT_LIST_HEAD(&hsr->node_db);
- INIT_LIST_HEAD(&hsr->self_node_db);
+ INIT_HLIST_HEAD(&hsr->self_node_db);
+ hsr->hash_buckets = HSR_HSIZE;
+ get_random_bytes(&hsr->hash_seed, sizeof(hsr->hash_seed));
+ for (i = 0; i < hsr->hash_buckets; i++)
+ INIT_HLIST_HEAD(&hsr->node_db[i]);
+
spin_lock_init(&hsr->list_lock);
eth_hw_addr_set(hsr_dev, slave[0]->dev_addr);
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index e59cbb4f0cd1..5bf357734b11 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -570,20 +570,23 @@ static int fill_frame_info(struct hsr_frame_info *frame,
struct ethhdr *ethhdr;
__be16 proto;
int ret;
+ u32 hash;
/* Check if skb contains ethhdr */
if (skb->mac_len < sizeof(struct ethhdr))
return -EINVAL;
memset(frame, 0, sizeof(*frame));
+
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ hash = hsr_mac_hash(port->hsr, ethhdr->h_source);
frame->is_supervision = is_supervision_frame(port->hsr, skb);
- frame->node_src = hsr_get_node(port, &hsr->node_db, skb,
+ frame->node_src = hsr_get_node(port, &hsr->node_db[hash], skb,
frame->is_supervision,
port->type);
if (!frame->node_src)
return -1; /* Unknown node and !is_supervision, or no mem */
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
frame->is_vlan = false;
proto = ethhdr->h_proto;
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 0775f0f95dbf..b3c6ffa1894d 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -15,11 +15,28 @@
#include <linux/etherdevice.h>
#include <linux/slab.h>
#include <linux/rculist.h>
+#include <linux/jhash.h>
#include "hsr_main.h"
#include "hsr_framereg.h"
#include "hsr_netlink.h"
-/* TODO: use hash lists for mac addresses (linux/jhash.h)? */
+u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr)
+{
+ u32 hash = jhash(addr, ETH_ALEN, hsr->hash_seed);
+
+ return reciprocal_scale(hash, hsr->hash_buckets);
+}
+
+struct hsr_node *hsr_node_get_first(struct hlist_head *head)
+{
+ struct hlist_node *first;
+
+ first = rcu_dereference(hlist_first_rcu(head));
+ if (first)
+ return hlist_entry(first, struct hsr_node, mac_list);
+
+ return NULL;
+}
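
hsr_mac_hash() above relies on reciprocal_scale() to turn a full 32-bit hash into a bucket index without a modulo. Assuming the usual multiply-and-shift definition of reciprocal_scale(), a userspace sketch of the bucket selection (with a trivial stand-in hash instead of jhash) would be:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for reciprocal_scale(): maps a 32-bit value into [0, ep_ro)
 * with a multiply and shift instead of a division.
 */
static uint32_t reciprocal_scale32(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

/* Trivial FNV-1a over a MAC address; only an illustration, the kernel
 * code uses jhash() with a random per-device seed.
 */
static uint32_t mac_hash(const uint8_t mac[6], uint32_t seed)
{
	uint32_t h = 2166136261u ^ seed;
	int i;

	for (i = 0; i < 6; i++) {
		h ^= mac[i];
		h *= 16777619u;
	}
	return h;
}

int main(void)
{
	const uint8_t mac[6] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
	uint32_t buckets = 1u << 8;	/* HSR_HSIZE = BIT(HSR_HSIZE_SHIFT) */
	uint32_t hash = mac_hash(mac, 0x12345678u);

	printf("bucket = %u of %u\n", reciprocal_scale32(hash, buckets), buckets);
	return 0;
}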
/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
* false otherwise.
@@ -42,8 +59,7 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
{
struct hsr_node *node;
- node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node,
- mac_list);
+ node = hsr_node_get_first(&hsr->self_node_db);
if (!node) {
WARN_ONCE(1, "HSR: No self node\n");
return false;
@@ -59,12 +75,12 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
/* Search for mac entry. Caller must hold rcu read lock.
*/
-static struct hsr_node *find_node_by_addr_A(struct list_head *node_db,
+static struct hsr_node *find_node_by_addr_A(struct hlist_head *node_db,
const unsigned char addr[ETH_ALEN])
{
struct hsr_node *node;
- list_for_each_entry_rcu(node, node_db, mac_list) {
+ hlist_for_each_entry_rcu(node, node_db, mac_list) {
if (ether_addr_equal(node->macaddress_A, addr))
return node;
}
@@ -79,7 +95,7 @@ int hsr_create_self_node(struct hsr_priv *hsr,
const unsigned char addr_a[ETH_ALEN],
const unsigned char addr_b[ETH_ALEN])
{
- struct list_head *self_node_db = &hsr->self_node_db;
+ struct hlist_head *self_node_db = &hsr->self_node_db;
struct hsr_node *node, *oldnode;
node = kmalloc(sizeof(*node), GFP_KERNEL);
@@ -90,14 +106,13 @@ int hsr_create_self_node(struct hsr_priv *hsr,
ether_addr_copy(node->macaddress_B, addr_b);
spin_lock_bh(&hsr->list_lock);
- oldnode = list_first_or_null_rcu(self_node_db,
- struct hsr_node, mac_list);
+ oldnode = hsr_node_get_first(self_node_db);
if (oldnode) {
- list_replace_rcu(&oldnode->mac_list, &node->mac_list);
+ hlist_replace_rcu(&oldnode->mac_list, &node->mac_list);
spin_unlock_bh(&hsr->list_lock);
kfree_rcu(oldnode, rcu_head);
} else {
- list_add_tail_rcu(&node->mac_list, self_node_db);
+ hlist_add_tail_rcu(&node->mac_list, self_node_db);
spin_unlock_bh(&hsr->list_lock);
}
@@ -106,25 +121,25 @@ int hsr_create_self_node(struct hsr_priv *hsr,
void hsr_del_self_node(struct hsr_priv *hsr)
{
- struct list_head *self_node_db = &hsr->self_node_db;
+ struct hlist_head *self_node_db = &hsr->self_node_db;
struct hsr_node *node;
spin_lock_bh(&hsr->list_lock);
- node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list);
+ node = hsr_node_get_first(self_node_db);
if (node) {
- list_del_rcu(&node->mac_list);
+ hlist_del_rcu(&node->mac_list);
kfree_rcu(node, rcu_head);
}
spin_unlock_bh(&hsr->list_lock);
}
-void hsr_del_nodes(struct list_head *node_db)
+void hsr_del_nodes(struct hlist_head *node_db)
{
struct hsr_node *node;
- struct hsr_node *tmp;
+ struct hlist_node *tmp;
- list_for_each_entry_safe(node, tmp, node_db, mac_list)
- kfree(node);
+ hlist_for_each_entry_safe(node, tmp, node_db, mac_list)
+ kfree_rcu(node, rcu_head);
}
void prp_handle_san_frame(bool san, enum hsr_port_type port,
@@ -145,7 +160,7 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port,
* originating from the newly added node.
*/
static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
- struct list_head *node_db,
+ struct hlist_head *node_db,
unsigned char addr[],
u16 seq_out, bool san,
enum hsr_port_type rx_port)
@@ -175,14 +190,14 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
hsr->proto_ops->handle_san_frame(san, rx_port, new_node);
spin_lock_bh(&hsr->list_lock);
- list_for_each_entry_rcu(node, node_db, mac_list,
- lockdep_is_held(&hsr->list_lock)) {
+ hlist_for_each_entry_rcu(node, node_db, mac_list,
+ lockdep_is_held(&hsr->list_lock)) {
if (ether_addr_equal(node->macaddress_A, addr))
goto out;
if (ether_addr_equal(node->macaddress_B, addr))
goto out;
}
- list_add_tail_rcu(&new_node->mac_list, node_db);
+ hlist_add_tail_rcu(&new_node->mac_list, node_db);
spin_unlock_bh(&hsr->list_lock);
return new_node;
out:
@@ -202,7 +217,7 @@ void prp_update_san_info(struct hsr_node *node, bool is_sup)
/* Get the hsr_node from which 'skb' was sent.
*/
-struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db,
struct sk_buff *skb, bool is_sup,
enum hsr_port_type rx_port)
{
@@ -218,7 +233,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
ethhdr = (struct ethhdr *)skb_mac_header(skb);
- list_for_each_entry_rcu(node, node_db, mac_list) {
+ hlist_for_each_entry_rcu(node, node_db, mac_list) {
if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
if (hsr->proto_ops->update_san_info)
hsr->proto_ops->update_san_info(node, is_sup);
@@ -268,11 +283,12 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
struct hsr_sup_tlv *hsr_sup_tlv;
struct hsr_node *node_real;
struct sk_buff *skb = NULL;
- struct list_head *node_db;
+ struct hlist_head *node_db;
struct ethhdr *ethhdr;
int i;
unsigned int pull_size = 0;
unsigned int total_pull_size = 0;
+ u32 hash;
/* Here either frame->skb_hsr or frame->skb_prp should be
* valid as supervision frame always will have protocol
@@ -310,11 +326,13 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
hsr_sp = (struct hsr_sup_payload *)skb->data;
/* Merge node_curr (registered on macaddress_B) into node_real */
- node_db = &port_rcv->hsr->node_db;
- node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A);
+ node_db = port_rcv->hsr->node_db;
+ hash = hsr_mac_hash(hsr, hsr_sp->macaddress_A);
+ node_real = find_node_by_addr_A(&node_db[hash], hsr_sp->macaddress_A);
if (!node_real)
/* No frame received from AddrA of this node yet */
- node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A,
+ node_real = hsr_add_node(hsr, &node_db[hash],
+ hsr_sp->macaddress_A,
HSR_SEQNR_START - 1, true,
port_rcv->type);
if (!node_real)
@@ -348,7 +366,8 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
hsr_sp = (struct hsr_sup_payload *)skb->data;
/* Check if redbox mac and node mac are equal. */
- if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) {
+ if (!ether_addr_equal(node_real->macaddress_A,
+ hsr_sp->macaddress_A)) {
/* This is a redbox supervision frame for a VDAN! */
goto done;
}
@@ -368,7 +387,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
node_real->addr_B_port = port_rcv->type;
spin_lock_bh(&hsr->list_lock);
- list_del_rcu(&node_curr->mac_list);
+ hlist_del_rcu(&node_curr->mac_list);
spin_unlock_bh(&hsr->list_lock);
kfree_rcu(node_curr, rcu_head);
@@ -406,6 +425,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
struct hsr_port *port)
{
struct hsr_node *node_dst;
+ u32 hash;
if (!skb_mac_header_was_set(skb)) {
WARN_ONCE(1, "%s: Mac header not set\n", __func__);
@@ -415,7 +435,8 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest))
return;
- node_dst = find_node_by_addr_A(&port->hsr->node_db,
+ hash = hsr_mac_hash(port->hsr, eth_hdr(skb)->h_dest);
+ node_dst = find_node_by_addr_A(&port->hsr->node_db[hash],
eth_hdr(skb)->h_dest);
if (!node_dst) {
if (net_ratelimit())
@@ -491,59 +512,73 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr,
void hsr_prune_nodes(struct timer_list *t)
{
struct hsr_priv *hsr = from_timer(hsr, t, prune_timer);
+ struct hlist_node *tmp;
struct hsr_node *node;
- struct hsr_node *tmp;
struct hsr_port *port;
unsigned long timestamp;
unsigned long time_a, time_b;
+ int i;
spin_lock_bh(&hsr->list_lock);
- list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) {
- /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A]
- * nor time_in[HSR_PT_SLAVE_B], will ever be updated for
- * the master port. Thus the master node will be repeatedly
- * pruned leading to packet loss.
- */
- if (hsr_addr_is_self(hsr, node->macaddress_A))
- continue;
-
- /* Shorthand */
- time_a = node->time_in[HSR_PT_SLAVE_A];
- time_b = node->time_in[HSR_PT_SLAVE_B];
-
- /* Check for timestamps old enough to risk wrap-around */
- if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2))
- node->time_in_stale[HSR_PT_SLAVE_A] = true;
- if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2))
- node->time_in_stale[HSR_PT_SLAVE_B] = true;
-
- /* Get age of newest frame from node.
- * At least one time_in is OK here; nodes get pruned long
- * before both time_ins can get stale
- */
- timestamp = time_a;
- if (node->time_in_stale[HSR_PT_SLAVE_A] ||
- (!node->time_in_stale[HSR_PT_SLAVE_B] &&
- time_after(time_b, time_a)))
- timestamp = time_b;
-
- /* Warn of ring error only as long as we get frames at all */
- if (time_is_after_jiffies(timestamp +
- msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) {
- rcu_read_lock();
- port = get_late_port(hsr, node);
- if (port)
- hsr_nl_ringerror(hsr, node->macaddress_A, port);
- rcu_read_unlock();
- }
- /* Prune old entries */
- if (time_is_before_jiffies(timestamp +
- msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
- hsr_nl_nodedown(hsr, node->macaddress_A);
- list_del_rcu(&node->mac_list);
- /* Note that we need to free this entry later: */
- kfree_rcu(node, rcu_head);
+ for (i = 0; i < hsr->hash_buckets; i++) {
+ hlist_for_each_entry_safe(node, tmp, &hsr->node_db[i],
+ mac_list) {
+ /* Don't prune own node.
+ * Neither time_in[HSR_PT_SLAVE_A]
+ * nor time_in[HSR_PT_SLAVE_B] will ever be updated
+ * for the master port. Thus the master node will be
+ * repeatedly pruned, leading to packet loss.
+ */
+ if (hsr_addr_is_self(hsr, node->macaddress_A))
+ continue;
+
+ /* Shorthand */
+ time_a = node->time_in[HSR_PT_SLAVE_A];
+ time_b = node->time_in[HSR_PT_SLAVE_B];
+
+ /* Check for timestamps old enough to
+ * risk wrap-around
+ */
+ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2))
+ node->time_in_stale[HSR_PT_SLAVE_A] = true;
+ if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2))
+ node->time_in_stale[HSR_PT_SLAVE_B] = true;
+
+ /* Get age of newest frame from node.
+ * At least one time_in is OK here; nodes get pruned
+ * long before both time_ins can get stale
+ */
+ timestamp = time_a;
+ if (node->time_in_stale[HSR_PT_SLAVE_A] ||
+ (!node->time_in_stale[HSR_PT_SLAVE_B] &&
+ time_after(time_b, time_a)))
+ timestamp = time_b;
+
+ /* Warn of ring error only as long as we get
+ * frames at all
+ */
+ if (time_is_after_jiffies(timestamp +
+ msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) {
+ rcu_read_lock();
+ port = get_late_port(hsr, node);
+ if (port)
+ hsr_nl_ringerror(hsr,
+ node->macaddress_A,
+ port);
+ rcu_read_unlock();
+ }
+
+ /* Prune old entries */
+ if (time_is_before_jiffies(timestamp +
+ msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr, node->macaddress_A);
+ hlist_del_rcu(&node->mac_list);
+ /* Note that we need to free this
+ * entry later:
+ */
+ kfree_rcu(node, rcu_head);
+ }
}
}
spin_unlock_bh(&hsr->list_lock);
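
The time_after()/time_is_before_jiffies() checks above are wraparound-safe because they compare via signed subtraction rather than a plain less-than on the raw counters. A small standalone demonstration of that idiom (after() is a local helper, not the kernel macro):

#include <stdio.h>

/* Wraparound-safe "a is after b" comparison on an unsigned counter,
 * mirroring the idiom behind time_after(): cast the difference to a
 * signed type instead of comparing the raw values.
 */
static int after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long near_wrap = (unsigned long)-10;	/* just before wraparound */
	unsigned long wrapped = 5;			/* 15 ticks later, post-wrap */

	printf("naive (wrapped > near_wrap): %d\n", wrapped > near_wrap);	  /* 0: wrong */
	printf("after(wrapped, near_wrap)  : %d\n", after(wrapped, near_wrap));  /* 1: right */
	return 0;
}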
@@ -557,17 +592,19 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
unsigned char addr[ETH_ALEN])
{
struct hsr_node *node;
+ u32 hash;
+
+ hash = hsr_mac_hash(hsr, addr);
if (!_pos) {
- node = list_first_or_null_rcu(&hsr->node_db,
- struct hsr_node, mac_list);
+ node = hsr_node_get_first(&hsr->node_db[hash]);
if (node)
ether_addr_copy(addr, node->macaddress_A);
return node;
}
node = _pos;
- list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) {
+ hlist_for_each_entry_continue_rcu(node, mac_list) {
ether_addr_copy(addr, node->macaddress_A);
return node;
}
@@ -587,8 +624,11 @@ int hsr_get_node_data(struct hsr_priv *hsr,
struct hsr_node *node;
struct hsr_port *port;
unsigned long tdiff;
+ u32 hash;
+
+ hash = hsr_mac_hash(hsr, addr);
- node = find_node_by_addr_A(&hsr->node_db, addr);
+ node = find_node_by_addr_A(&hsr->node_db[hash], addr);
if (!node)
return -ENOENT;
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index bdbb8c822ba1..d7cce6b161e3 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -28,9 +28,11 @@ struct hsr_frame_info {
bool is_from_san;
};
+u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr);
+struct hsr_node *hsr_node_get_first(struct hlist_head *head);
void hsr_del_self_node(struct hsr_priv *hsr);
-void hsr_del_nodes(struct list_head *node_db);
-struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+void hsr_del_nodes(struct hlist_head *node_db);
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db,
struct sk_buff *skb, bool is_sup,
enum hsr_port_type rx_port);
void hsr_handle_sup_frame(struct hsr_frame_info *frame);
@@ -68,7 +70,7 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port,
void prp_update_san_info(struct hsr_node *node, bool is_sup);
struct hsr_node {
- struct list_head mac_list;
+ struct hlist_node mac_list;
unsigned char macaddress_A[ETH_ALEN];
unsigned char macaddress_B[ETH_ALEN];
/* Local slave through which AddrB frames are received from this node */
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 043e4e9a1694..ca556bda3467 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -63,6 +63,9 @@ struct hsr_tag {
#define HSR_V1_SUP_LSDUSIZE 52
+#define HSR_HSIZE_SHIFT 8
+#define HSR_HSIZE BIT(HSR_HSIZE_SHIFT)
+
/* The helper functions below assumes that 'path' occupies the 4 most
* significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
* equivalently, the 4 most significant bits of HSR tag byte 14).
@@ -201,8 +204,8 @@ struct hsr_proto_ops {
struct hsr_priv {
struct rcu_head rcu_head;
struct list_head ports;
- struct list_head node_db; /* Known HSR nodes */
- struct list_head self_node_db; /* MACs of slaves */
+ struct hlist_head node_db[HSR_HSIZE]; /* Known HSR nodes */
+ struct hlist_head self_node_db; /* MACs of slaves */
struct timer_list announce_timer; /* Supervision frame dispatch */
struct timer_list prune_timer;
int announce_count;
@@ -212,6 +215,8 @@ struct hsr_priv {
spinlock_t seqnr_lock; /* locking for sequence_nr */
spinlock_t list_lock; /* locking for node list */
struct hsr_proto_ops *proto_ops;
+ u32 hash_buckets;
+ u32 hash_seed;
#define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. Bit 0 is set
* based on SLAVE_A or SLAVE_B
*/
@@ -259,11 +264,6 @@ static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct)
return ntohs(rct->sequence_nr);
}
-static inline u16 get_prp_lan_id(struct prp_rct *rct)
-{
- return ntohs(rct->lan_id_and_LSDU_size) >> 12;
-}
-
/* assume there is a valid rct */
static inline bool prp_check_lsdu_size(struct sk_buff *skb,
struct prp_rct *rct,
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index f3c8f91dbe2c..1405c037cf7a 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -105,6 +105,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
static void hsr_dellink(struct net_device *dev, struct list_head *head)
{
struct hsr_priv *hsr = netdev_priv(dev);
+ int i;
del_timer_sync(&hsr->prune_timer);
del_timer_sync(&hsr->announce_timer);
@@ -113,7 +114,8 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head)
hsr_del_ports(hsr);
hsr_del_self_node(hsr);
- hsr_del_nodes(&hsr->node_db);
+ for (i = 0; i < hsr->hash_buckets; i++)
+ hsr_del_nodes(&hsr->node_db[i]);
unregister_netdevice_queue(dev, head);
}
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 2cf62718a282..2c087b7f17c5 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -47,6 +47,7 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/ieee802154.h>
+#include <linux/if_arp.h>
#include <net/ipv6.h>
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index dd5a45f8a78a..359249ab77bf 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -30,7 +30,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
{
void *hdr;
int i, pages = 0;
- uint32_t *buf = kcalloc(32, sizeof(uint32_t), GFP_KERNEL);
+ u32 *buf = kcalloc(IEEE802154_MAX_PAGE + 1, sizeof(u32), GFP_KERNEL);
pr_debug("%s\n", __func__);
@@ -47,7 +47,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
goto nla_put_failure;
- for (i = 0; i < 32; i++) {
+ for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
if (phy->supported.channels[i])
buf[pages++] = phy->supported.channels[i] | (i << 27);
}
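
Each entry written to buf packs the channel page index into the top five bits ("| (i << 27)") with the channel bitmap below it, which is why IEEE802154_MAX_PAGE + 1 (32) entries are enough. A quick userspace sketch of encoding and decoding that layout (the names here are local, not kernel API):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_BITS	27
#define CHANNEL_MASK	((1u << PAGE_SHIFT_BITS) - 1)

/* Pack a channel page index (0..31) above a 27-bit channel bitmap,
 * matching the "channels | (i << 27)" encoding used above.
 */
static uint32_t pack_page(uint32_t page, uint32_t channels)
{
	return (channels & CHANNEL_MASK) | (page << PAGE_SHIFT_BITS);
}

int main(void)
{
	uint32_t word = pack_page(2, 0x07FFF800);	/* page 2, bits 11..26 set */

	printf("page=%u channels=0x%07x\n",
	       word >> PAGE_SHIFT_BITS, word & CHANNEL_MASK);
	return 0;
}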
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index de610cb83694..f79ab942f03b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
+#include <linux/init.h>
#include <linux/types.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
@@ -95,12 +96,14 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
- u32 *next_btf_id)
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
{
size_t end;
if (atype == BPF_READ)
- return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
@@ -212,26 +215,23 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
}
}
-BTF_SET_START(bpf_tcp_ca_kfunc_ids)
+BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
BTF_ID(func, tcp_reno_ssthresh)
BTF_ID(func, tcp_reno_cong_avoid)
BTF_ID(func, tcp_reno_undo_cwnd)
BTF_ID(func, tcp_slow_start)
BTF_ID(func, tcp_cong_avoid_ai)
-BTF_SET_END(bpf_tcp_ca_kfunc_ids)
+BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
-static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner)
-{
- if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id))
- return true;
- return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner);
-}
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &bpf_tcp_ca_check_kfunc_ids,
+};
static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
.get_func_proto = bpf_tcp_ca_get_func_proto,
.is_valid_access = bpf_tcp_ca_is_valid_access,
.btf_struct_access = bpf_tcp_ca_btf_struct_access,
- .check_kfunc_call = bpf_tcp_ca_check_kfunc_call,
};
static int bpf_tcp_ca_init_member(const struct btf_type *t,
@@ -300,3 +300,9 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
.init = bpf_tcp_ca_init,
.name = "tcp_congestion_ops",
};
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+}
+late_initcall(bpf_tcp_ca_kfunc_init);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 85117b45216d..e0730c4d07d6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -738,8 +739,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
memset(cfg, 0, sizeof(*cfg));
rtm = nlmsg_data(nlh);
+
+ if (!inet_validate_dscp(rtm->rtm_tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ err = -EINVAL;
+ goto errout;
+ }
+ cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_tos = rtm->rtm_tos;
cfg->fc_table = rtm->rtm_table;
cfg->fc_protocol = rtm->rtm_protocol;
cfg->fc_scope = rtm->rtm_scope;
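
The inet_validate_dscp()/inet_dsfield_to_dscp() pair above enforces that rtm_tos carries only the DSCP part of the IPv4 dsfield: the two low ECN bits must be zero. A standalone sketch of that layout check (dscp_ok() is a local illustration, not the kernel helper):

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK	0x03u		/* low two bits of the dsfield */

/* Accept a dsfield only when the ECN bits are clear, i.e. the value is
 * a pure DSCP; mirrors the "ECN bits must be 0" error path above.
 */
static int dscp_ok(uint8_t dsfield)
{
	return (dsfield & ECN_MASK) == 0;
}

int main(void)
{
	printf("0x28 (ECN bits 00): %s\n", dscp_ok(0x28) ? "valid" : "rejected");
	printf("0x2a (ECN bits 10): %s\n", dscp_ok(0x2a) ? "valid" : "rejected");
	return 0;
}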
@@ -1550,7 +1559,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
- rtnl_lock();
+ ASSERT_RTNL();
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1575,7 +1584,7 @@ static void ip_fib_net_exit(struct net *net)
#ifdef CONFIG_IP_MULTIPLE_TABLES
fib4_rules_exit(net);
#endif
- rtnl_unlock();
+
kfree(net->ipv4.fib_table_hash);
fib4_notifier_exit(net);
}
@@ -1602,7 +1611,9 @@ out:
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
+ rtnl_lock();
ip_fib_net_exit(net);
+ rtnl_unlock();
goto out;
}
@@ -1610,12 +1621,23 @@ static void __net_exit fib_net_exit(struct net *net)
{
fib_proc_exit(net);
nl_fib_lookup_exit(net);
- ip_fib_net_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip_fib_net_exit(net);
+
+ rtnl_unlock();
}
static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
+ .exit_batch = fib_net_exit_batch,
};
void __init ip_fib_init(void)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 78e40ea42e58..f9b9e26c32c1 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -4,13 +4,14 @@
#include <linux/types.h>
#include <linux/list.h>
+#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
struct fib_alias {
struct hlist_node fa_list;
struct fib_info *fa_info;
- u8 fa_tos;
+ dscp_t fa_dscp;
u8 fa_type;
u8 fa_state;
u8 fa_slen;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index e0b6c8b6de57..001fea394bde 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -35,7 +36,7 @@ struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
- u8 tos;
+ dscp_t dscp;
__be32 src;
__be32 srcmask;
__be32 dst;
@@ -49,7 +50,7 @@ static bool fib4_rule_matchall(const struct fib_rule *rule)
{
struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
- if (r->dst_len || r->src_len || r->tos)
+ if (r->dst_len || r->src_len || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -185,7 +186,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl4->flowi4_tos))
+ if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
@@ -225,10 +226,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ /* IPv4 currently doesn't handle high order DSCP bits correctly */
if (frh->tos & ~IPTOS_TOS_MASK) {
NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
}
+ rule4->dscp = inet_dsfield_to_dscp(frh->tos);
/* split local/main if they are not already split */
err = fib_unmerge(net);
@@ -270,7 +278,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->srcmask = inet_make_mask(rule4->src_len);
rule4->dst_len = frh->dst_len;
rule4->dstmask = inet_make_mask(rule4->dst_len);
- rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
@@ -313,7 +320,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
- if (frh->tos && (rule4->tos != frh->tos))
+ if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos)
return 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -337,7 +344,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
- frh->tos = rule4->tos;
+ frh->tos = inet_dscp_to_dsfield(rule4->dscp);
if ((rule4->dst_len &&
nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2dd375f7407b..c5a29703185a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
#include <linux/hash.h>
#include <net/arp.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -523,7 +524,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
fri.tb_id = tb_id;
fri.dst = key;
fri.dst_len = dst_len;
- fri.tos = fa->fa_tos;
+ fri.tos = inet_dscp_to_dsfield(fa->fa_dscp);
fri.type = fa->fa_type;
fri.offload = READ_ONCE(fa->offload);
fri.trap = READ_ONCE(fa->trap);
@@ -1257,34 +1258,13 @@ fib_info_laddrhash_bucket(const struct net *net, __be32 val)
return &fib_info_laddrhash[slot];
}
-static struct hlist_head *fib_info_hash_alloc(int bytes)
-{
- if (bytes <= PAGE_SIZE)
- return kzalloc(bytes, GFP_KERNEL);
- else
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(bytes));
-}
-
-static void fib_info_hash_free(struct hlist_head *hash, int bytes)
-{
- if (!hash)
- return;
-
- if (bytes <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long) hash, get_order(bytes));
-}
-
static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *new_laddrhash,
unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_info_hash_size;
- unsigned int i, bytes;
+ unsigned int i;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
@@ -1325,9 +1305,8 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
- bytes = old_size * sizeof(struct hlist_head *);
- fib_info_hash_free(old_info_hash, bytes);
- fib_info_hash_free(old_laddrhash, bytes);
+ kvfree(old_info_hash);
+ kvfree(old_laddrhash);
}
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
@@ -1444,19 +1423,19 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
- unsigned int bytes;
+ size_t bytes;
if (!new_size)
new_size = 16;
- bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_info_hash_alloc(bytes);
- new_laddrhash = fib_info_hash_alloc(bytes);
+ bytes = (size_t)new_size * sizeof(struct hlist_head *);
+ new_info_hash = kvzalloc(bytes, GFP_KERNEL);
+ new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
if (!new_info_hash || !new_laddrhash) {
- fib_info_hash_free(new_info_hash, bytes);
- fib_info_hash_free(new_laddrhash, bytes);
- } else
+ kvfree(new_info_hash);
+ kvfree(new_laddrhash);
+ } else {
fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
-
+ }
if (!fib_info_hash_size)
goto failure;
}
@@ -2061,7 +2040,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
int order = -1, last_idx = -1;
struct fib_alias *fa, *fa1 = NULL;
u32 last_prio = res->fi->fib_priority;
- u8 last_tos = 0;
+ dscp_t last_dscp = 0;
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
@@ -2069,19 +2048,20 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
if (fa->fa_slen != slen)
continue;
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp &&
+ fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos))
continue;
if (fa->tb_id != tb->tb_id)
continue;
if (next_fi->fib_priority > last_prio &&
- fa->fa_tos == last_tos) {
- if (last_tos)
+ fa->fa_dscp == last_dscp) {
+ if (last_dscp)
continue;
break;
}
if (next_fi->fib_flags & RTNH_F_DEAD)
continue;
- last_tos = fa->fa_tos;
+ last_dscp = fa->fa_dscp;
last_prio = next_fi->fib_priority;
if (next_fi->fib_scope != res->scope ||
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f7f74d5c14da..2af2b99e0bea 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -61,6 +61,7 @@
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -81,7 +82,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .tos = inet_dscp_to_dsfield(fa->fa_dscp),
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -98,7 +99,7 @@ static int call_fib_entry_notifiers(struct net *net,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .tos = inet_dscp_to_dsfield(fa->fa_dscp),
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -973,13 +974,13 @@ static struct key_vector *fib_find_node(struct trie *t,
return n;
}
-/* Return the first fib alias matching TOS with
+/* Return the first fib alias matching DSCP with
* priority less than or equal to PRIO.
* If 'find_first' is set, return the first matching
- * fib alias, regardless of TOS and priority.
+ * fib alias, regardless of DSCP and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
- u8 tos, u32 prio, u32 tb_id,
+ dscp_t dscp, u32 prio, u32 tb_id,
bool find_first)
{
struct fib_alias *fa;
@@ -988,6 +989,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
return NULL;
hlist_for_each_entry(fa, fah, fa_list) {
+ /* Avoid Sparse warning when using dscp_t in inequalities */
+ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
+ u8 __dscp = inet_dscp_to_dsfield(dscp);
+
if (fa->fa_slen < slen)
continue;
if (fa->fa_slen != slen)
@@ -998,9 +1003,9 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
break;
if (find_first)
return fa;
- if (fa->fa_tos > tos)
+ if (__fa_dscp > __dscp)
continue;
- if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
return fa;
}
@@ -1027,8 +1032,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
- fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
- fa->fa_type == fri->type)
+ fa->fa_dscp == inet_dsfield_to_dscp(fri->tos) &&
+ fa->fa_info == fri->fi && fa->fa_type == fri->type)
return fa;
}
@@ -1214,7 +1219,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
int err;
@@ -1231,12 +1236,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto err;
}
+ dscp = cfg->fc_dscp;
l = fib_find_node(t, &tp, key);
- fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
+ * with the same keys [prefix,dscp,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
@@ -1244,7 +1250,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
* of the new alias.
*/
- if (fa && fa->fa_tos == tos &&
+ if (fa && fa->fa_dscp == dscp &&
fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_first, *fa_match;
@@ -1264,7 +1270,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
hlist_for_each_entry_from(fa, fa_list) {
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1292,7 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
fi_drop = fa->fa_info;
- new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
@@ -1355,7 +1361,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
+ new_fa->fa_dscp = dscp;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
@@ -1571,7 +1577,8 @@ found:
if (index >= (1ul << fa->fa_slen))
continue;
}
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp &&
+ inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
continue;
if (fi->fib_dead)
continue;
@@ -1707,7 +1714,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
struct key_vector *l, *tp;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
key = ntohl(cfg->fc_dst);
@@ -1719,11 +1726,13 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!l)
return -ESRCH;
- fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
+ dscp = cfg->fc_dscp;
+ fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
- pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+ inet_dscp_to_dsfield(dscp), t);
fa_to_delete = NULL;
hlist_for_each_entry_from(fa, fa_list) {
@@ -1731,7 +1740,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -2299,7 +2308,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
fri.tb_id = tb->tb_id;
fri.dst = xkey;
fri.dst_len = KEYLENGTH - fa->fa_slen;
- fri.tos = fa->fa_tos;
+ fri.tos = inet_dscp_to_dsfield(fa->fa_dscp);
fri.type = fa->fa_type;
fri.offload = READ_ONCE(fa->offload);
fri.trap = READ_ONCE(fa->trap);
@@ -2811,8 +2820,9 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
fa->fa_info->fib_scope),
rtn_type(buf2, sizeof(buf2),
fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
+ if (fa->fa_dscp)
+ seq_printf(seq, " tos=%d",
+ inet_dscp_to_dsfield(fa->fa_dscp));
seq_putc(seq, '\n');
}
}
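
An aside on the fib_trie.c hunks above (not part of the patch): fib aliases stop storing the raw TOS byte (fa_tos) and keep a dscp_t (fa_dscp) instead, converting at the edges with inet_dscp_to_dsfield()/inet_dsfield_to_dscp(). A minimal userspace sketch of the idea, assuming the usual IPv4 dsfield layout (upper six bits DSCP, lower two ECN) and simplifying the Sparse-checked dscp_t to a plain integer:

/* Illustrative sketch only; the in-kernel dscp_t is a "__bitwise" typedef. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t dscp_t;              /* kernel: u8 __bitwise */
#define INET_DSCP_MASK 0xfc          /* upper 6 bits: DSCP; lower 2: ECN */

static dscp_t dsfield_to_dscp(uint8_t dsfield)
{
	return dsfield & INET_DSCP_MASK; /* drop the ECN bits */
}

static uint8_t dscp_to_dsfield(dscp_t dscp)
{
	return dscp;                     /* ECN bits are already zero */
}

int main(void)
{
	uint8_t tos = 0xb9;              /* DSCP 46 (EF) with an ECN bit set */
	dscp_t dscp = dsfield_to_dscp(tos);

	assert(dscp == 0xb8);
	/* Inequalities are done on the raw byte, as in fib_find_alias(). */
	printf("dsfield=0x%02x dscp=0x%02x\n", tos, dscp_to_dsfield(dscp));
	return 0;
}
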
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b7e277d8a84d..72a375c7f417 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -192,24 +192,14 @@ struct icmp_control {
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmp_sk(struct net *net)
-{
- return this_cpu_read(*net->ipv4.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmp_sk(net);
+ sk = this_cpu_read(ipv4_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
@@ -217,11 +207,13 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
static inline void icmp_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
@@ -363,14 +355,13 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
return 0;
}
-static void icmp_push_reply(struct icmp_bxm *icmp_param,
+static void icmp_push_reply(struct sock *sk,
+ struct icmp_bxm *icmp_param,
struct flowi4 *fl4,
struct ipcm_cookie *ipc, struct rtable **rt)
{
- struct sock *sk;
struct sk_buff *skb;
- sk = icmp_sk(dev_net((*rt)->dst.dev));
if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
@@ -452,7 +443,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
if (IS_ERR(rt))
goto out_unlock;
if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
@@ -766,7 +757,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);
- icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
@@ -1434,46 +1425,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
},
};
-static void __net_exit icmp_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
- free_percpu(net->ipv4.icmp_sk);
- net->ipv4.icmp_sk = NULL;
-}
-
static int __net_init icmp_sk_init(struct net *net)
{
- int i, err;
-
- net->ipv4.icmp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.icmp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(i) {
- struct sock *sk;
-
- err = inet_ctl_sock_create(&sk, PF_INET,
- SOCK_RAW, IPPROTO_ICMP, net);
- if (err < 0)
- goto fail;
-
- *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
-
- /* Enough space for 2 64K ICMP packets, including
- * sk_buff/skb_shared_info struct overhead.
- */
- sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
-
- /*
- * Speedup sock_wfree()
- */
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
- }
-
/* Control parameters for ECHO replies. */
net->ipv4.sysctl_icmp_echo_ignore_all = 0;
net->ipv4.sysctl_icmp_echo_enable_probe = 0;
@@ -1499,18 +1452,36 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
return 0;
-
-fail:
- icmp_sk_exit(net);
- return err;
}
static struct pernet_operations __net_initdata icmp_sk_ops = {
.init = icmp_sk_init,
- .exit = icmp_sk_exit,
};
int __init icmp_init(void)
{
+ int err, i;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, &init_net);
+ if (err < 0)
+ return err;
+
+ per_cpu(ipv4_icmp_sk, i) = sk;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff/skb_shared_info struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+ }
return register_pernet_subsys(&icmp_sk_ops);
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fc2a985f6064..1e5b53c2bb26 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -866,12 +866,9 @@ static void reqsk_timer_handler(struct timer_list *t)
(!resend ||
!inet_rtx_syn_ack(sk_listener, req) ||
inet_rsk(req)->acked)) {
- unsigned long timeo;
-
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
- timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
- mod_timer(&req->rsk_timer, jiffies + timeo);
+ mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
if (!nreq)
return;
@@ -1046,6 +1043,9 @@ int inet_csk_listen_start(struct sock *sk)
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
+ if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
+ sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 30ab717ff1b8..17440840a791 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
+ local_bh_disable();
inet_ehash_nolisten(sk, osk, NULL);
+ local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
@@ -669,45 +671,54 @@ int inet_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
EXPORT_SYMBOL_GPL(inet_hash);
-void inet_unhash(struct sock *sk)
+static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_listen_hashbucket *ilb = NULL;
- spinlock_t *lock;
-
if (sk_unhashed(sk))
return;
- if (sk->sk_state == TCP_LISTEN) {
- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &ilb->lock;
- } else {
- lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- }
- spin_lock_bh(lock);
- if (sk_unhashed(sk))
- goto unlock;
-
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_stop_listen_sock(sk);
if (ilb) {
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
inet_unhash2(hashinfo, sk);
ilb->count--;
}
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-unlock:
- spin_unlock_bh(lock);
+}
+
+void inet_unhash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+ if (sk_unhashed(sk))
+ return;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ struct inet_listen_hashbucket *ilb;
+
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ /* Don't disable bottom halves while acquiring the lock to
+ * avoid circular locking dependency on PREEMPT_RT.
+ */
+ spin_lock(&ilb->lock);
+ __inet_unhash(sk, ilb);
+ spin_unlock(&ilb->lock);
+ } else {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ __inet_unhash(sk, NULL);
+ spin_unlock_bh(lock);
+ }
}
EXPORT_SYMBOL_GPL(inet_unhash);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 437afe392e66..9e0bbd026560 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -52,14 +52,15 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
spin_unlock(lock);
/* Disassociate with bind bucket. */
- bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
- hashinfo->bhash_size)];
+ bhead = &hashinfo->bhash[tw->tw_bslot];
spin_lock(&bhead->lock);
inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- atomic_dec(&tw->tw_dr->tw_count);
+ if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
+ kfree(tw->tw_dr);
+
inet_twsk_put(tw);
}
@@ -110,8 +111,12 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed.
*/
- bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
- hashinfo->bhash_size)];
+ /* Cache inet_bhashfn(), because 'struct net' might be no longer
+ * available later in inet_twsk_kill().
+ */
+ tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num,
+ hashinfo->bhash_size);
+ bhead = &hashinfo->bhash[tw->tw_bslot];
spin_lock(&bhead->lock);
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
@@ -145,10 +150,6 @@ static void tw_timer_handler(struct timer_list *t)
{
struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
- if (tw->tw_kill)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
- else
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
@@ -158,7 +159,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
{
struct inet_timewait_sock *tw;
- if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ if (refcount_read(&dr->tw_refcount) - 1 >= dr->sysctl_max_tw_buckets)
return NULL;
tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
@@ -244,59 +245,15 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
* of PAWS.
*/
- tw->tw_kill = timeo <= 4*HZ;
if (!rearm) {
+ bool kill = timeo <= 4*HZ;
+
+ __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+ LINUX_MIB_TIMEWAITED);
BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
- atomic_inc(&tw->tw_dr->tw_count);
+ refcount_inc(&tw->tw_dr->tw_refcount);
} else {
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
-{
- struct inet_timewait_sock *tw;
- struct sock *sk;
- struct hlist_nulls_node *node;
- unsigned int slot;
-
- for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-restart_rcu:
- cond_resched();
- rcu_read_lock();
-restart:
- sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (sk->sk_state != TCP_TIME_WAIT)
- continue;
- tw = inet_twsk(sk);
- if ((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->ns.count))
- continue;
-
- if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
- continue;
-
- if (unlikely((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->ns.count))) {
- inet_twsk_put(tw);
- goto restart;
- }
-
- rcu_read_unlock();
- local_bh_disable();
- inet_twsk_deschedule_put(tw);
- local_bh_enable();
- goto restart_rcu;
- }
- /* If the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto restart;
- rcu_read_unlock();
- }
-}
-EXPORT_SYMBOL_GPL(inet_twsk_purge);
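
An aside on the accounting above (a simplified sketch, not the kernel code): tw_refcount starts at 1 for the death row itself and every scheduled timewait socket adds one, which is why inet_twsk_alloc() and the /proc/net/sockstat hunk further down subtract one to get the number of live timewait sockets, and why the row is freed on the final decrement. Hypothetical names, plain C11 atomics standing in for refcount_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct death_row {
	atomic_int refcount;             /* kernel: refcount_t tw_refcount */
	int max_buckets;
};

static struct death_row *row_new(int max)
{
	struct death_row *dr = calloc(1, sizeof(*dr));

	atomic_store(&dr->refcount, 1);  /* self reference held by the owner */
	dr->max_buckets = max;
	return dr;
}

static int row_live(struct death_row *dr)
{
	return atomic_load(&dr->refcount) - 1;   /* as in /proc/net/sockstat */
}

static void row_put(struct death_row *dr)
{
	if (atomic_fetch_sub(&dr->refcount, 1) == 1)
		free(dr);                /* last reference gone */
}

int main(void)
{
	struct death_row *dr = row_new(4096);

	atomic_fetch_add(&dr->refcount, 1);      /* a timewait socket scheduled */
	printf("live timewait sockets: %d\n", row_live(dr));
	row_put(dr);                             /* timewait socket destroyed */
	row_put(dr);                             /* owner drops its reference */
	return 0;
}
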
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3a025c011971..d94f9f7e60c3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -196,7 +196,8 @@ resubmit:
if (ipprot) {
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_XFRM_POLICY);
return;
}
nf_reset_ct(skb);
@@ -215,7 +216,7 @@ resubmit:
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
} else {
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
@@ -318,8 +319,10 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
+ int err, drop_reason;
struct rtable *rt;
- int err;
+
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (ip_can_use_hint(skb, iph, hint)) {
err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
@@ -396,19 +399,23 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
- IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+ drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
goto drop;
+ }
}
return NET_RX_SUCCESS;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return NET_RX_DROP;
drop_error:
- if (err == -EXDEV)
+ if (err == -EXDEV) {
+ drop_reason = SKB_DROP_REASON_IP_RPFILTER;
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+ }
goto drop;
}
@@ -436,13 +443,16 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
+ int drop_reason;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
- if (skb->pkt_type == PACKET_OTHERHOST)
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ drop_reason = SKB_DROP_REASON_OTHERHOST;
goto drop;
+ }
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
@@ -452,6 +462,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
goto out;
}
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
@@ -488,6 +499,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
len = ntohs(iph->tot_len);
if (skb->len < len) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
@@ -516,11 +528,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
return skb;
csum_error:
+ drop_reason = SKB_DROP_REASON_IP_CSUM;
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
+ if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ drop_reason = SKB_DROP_REASON_IP_INHDR;
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out:
return NULL;
}
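
The ip_input.c hunks above (and the udp.c ones further down) replace bare kfree_skb() calls with kfree_skb_reason(), threading a drop_reason variable from each early-exit check to the shared drop label. A minimal userspace sketch of that control-flow pattern, with made-up reason names standing in for the SKB_DROP_REASON_* enum:

#include <stdio.h>

enum drop_reason {
	DROP_NOT_SPECIFIED,
	DROP_OTHERHOST,
	DROP_PKT_TOO_SMALL,
	DROP_CSUM,
};

static const char *reason_str[] = {
	"not specified", "other host", "packet too small", "bad checksum",
};

/* One free call at a single label; every early exit records why. */
static int rcv_packet(int pkt_type, int len, int csum_ok)
{
	enum drop_reason reason = DROP_NOT_SPECIFIED;

	if (pkt_type == 3 /* PACKET_OTHERHOST */) {
		reason = DROP_OTHERHOST;
		goto drop;
	}
	if (len < 20) {
		reason = DROP_PKT_TOO_SMALL;
		goto drop;
	}
	if (!csum_ok) {
		reason = DROP_CSUM;
		goto drop;
	}
	return 0;

drop:
	printf("dropped: %s\n", reason_str[reason]);   /* kfree_skb_reason() */
	return -1;
}

int main(void)
{
	rcv_packet(3, 100, 1);
	rcv_packet(0, 10, 1);
	rcv_packet(0, 100, 0);
	return 0;
}
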
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index da1b5038bdfd..a9e22a098872 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -42,7 +42,7 @@
*/
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
- __be32 daddr, struct rtable *rt, int is_frag)
+ __be32 daddr, struct rtable *rt)
{
unsigned char *iph = skb_network_header(skb);
@@ -53,28 +53,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
if (opt->srr)
memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
- if (!is_frag) {
- if (opt->rr_needaddr)
- ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
- if (opt->ts_needaddr)
- ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
- if (opt->ts_needtime) {
- __be32 midtime;
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+ if (opt->ts_needtime) {
+ __be32 midtime;
- midtime = inet_current_timestamp();
- memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
- }
- return;
- }
- if (opt->rr) {
- memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]);
- opt->rr = 0;
- opt->rr_needaddr = 0;
- }
- if (opt->ts) {
- memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]);
- opt->ts = 0;
- opt->ts_needaddr = opt->ts_needtime = 0;
+ midtime = inet_current_timestamp();
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 139cec29ed06..0c0574eb5f5b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -179,7 +179,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
- ip_options_build(skb, &opt->opt, daddr, rt, 0);
+ ip_options_build(skb, &opt->opt, daddr, rt);
}
skb->priority = sk->sk_priority;
@@ -519,7 +519,7 @@ packet_routed:
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
- ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
}
ip_select_ident_segs(net, skb, sk,
@@ -1541,7 +1541,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (opt) {
iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, cork->addr, rt, 0);
+ ip_options_build(skb, opt, cork->addr, rt);
}
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 29bbe2b08ae9..c860519d57ee 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -268,13 +268,12 @@ static void __net_exit ipmr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
list_del(&mrt->list);
ipmr_free_table(mrt);
}
fib_rules_unregister(net->ipv4.mr_rules_ops);
- rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -330,10 +329,9 @@ static int __net_init ipmr_rules_init(struct net *net)
static void __net_exit ipmr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ipmr_free_table(net->ipv4.mrt);
net->ipv4.mrt = NULL;
- rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -3077,7 +3075,9 @@ static int __net_init ipmr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ipmr_rules_exit(net);
+ rtnl_unlock();
#endif
ipmr_rules_fail:
ipmr_notifier_exit(net);
@@ -3092,12 +3092,22 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_notifier_exit(net);
- ipmr_rules_exit(net);
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ipmr_rules_exit(net);
+ rtnl_unlock();
}
static struct pernet_operations ipmr_net_ops = {
.init = ipmr_net_init,
.exit = ipmr_net_exit,
+ .exit_batch = ipmr_net_exit_batch,
};
int __init ip_mr_init(void)
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 3f248a19faa3..fab357cc8559 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -295,28 +295,24 @@ pptp_inbound_pkt(struct sk_buff *skb,
return NF_ACCEPT;
}
+static const struct nf_nat_pptp_hook pptp_hooks = {
+ .outbound = pptp_outbound_pkt,
+ .inbound = pptp_inbound_pkt,
+ .exp_gre = pptp_exp_gre,
+ .expectfn = pptp_nat_expected,
+};
+
static int __init nf_nat_helper_pptp_init(void)
{
- BUG_ON(nf_nat_pptp_hook_outbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_inbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+ WARN_ON(nf_nat_pptp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
- BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
return 0;
}
static void __exit nf_nat_helper_pptp_fini(void)
{
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);
synchronize_rcu();
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index eeafeccebb8d..e459a391e607 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -3733,12 +3733,16 @@ out:
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
-static void __net_exit nexthop_net_exit(struct net *net)
+static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- flush_all_nexthops(net);
+ list_for_each_entry(net, net_list, exit_list) {
+ flush_all_nexthops(net);
+ kfree(net->nexthop.devhash);
+ }
rtnl_unlock();
- kfree(net->nexthop.devhash);
}
static int __net_init nexthop_net_init(struct net *net)
@@ -3756,7 +3760,7 @@ static int __net_init nexthop_net_init(struct net *net)
static struct pernet_operations nexthop_net_ops = {
.init = nexthop_net_init,
- .exit = nexthop_net_exit,
+ .exit_batch = nexthop_net_exit_batch,
};
static int __init nexthop_init(void)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f30273afb539..28836071f0a6 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,8 +59,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
- proto_memory_allocated(&tcp_prot));
+ refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+ sockets, proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f33ad1f383b6..f444f5983405 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -84,6 +84,7 @@
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
@@ -112,14 +113,13 @@
#define DEFAULT_MIN_PMTU (512 + 20 + 20)
#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
-
+#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_min_advmss __read_mostly = 256;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
@@ -458,7 +458,7 @@ static u32 *ip_tstamps __read_mostly;
* if one generator is seldom used. This makes hard for an attacker
* to infer how many packets were sent between two points in time.
*/
-u32 ip_idents_reserve(u32 hash, int segs)
+static u32 ip_idents_reserve(u32 hash, int segs)
{
u32 bucket, old, now = (u32)jiffies;
atomic_t *p_id;
@@ -479,7 +479,6 @@ u32 ip_idents_reserve(u32 hash, int segs)
*/
return atomic_add_return(segs + delta, p_id) - segs;
}
-EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
@@ -1298,9 +1297,10 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
+ struct net *net = dev_net(dst->dev);
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
- ip_rt_min_advmss);
+ net->ipv4.ip_rt_min_advmss);
return min(advmss, IPV4_MAX_PMTU - header_size);
}
@@ -1485,6 +1485,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
struct uncached_list {
spinlock_t lock;
struct list_head head;
+ struct list_head quarantine;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
@@ -1506,7 +1507,7 @@ void rt_del_uncached_list(struct rtable *rt)
struct uncached_list *ul = rt->rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del(&rt->rt_uncached);
+ list_del_init(&rt->rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
@@ -1521,20 +1522,24 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
void rt_flush_dev(struct net_device *dev)
{
- struct rtable *rt;
+ struct rtable *rt, *safe;
int cpu;
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+ if (list_empty(&ul->head))
+ continue;
+
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
rt->dst.dev = blackhole_netdev;
dev_replace_track(dev, blackhole_netdev,
&rt->dst.dev_tracker,
GFP_ATOMIC);
+ list_move(&rt->rt_uncached, &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
}
@@ -3392,7 +3397,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (fa->fa_slen == slen &&
fa->tb_id == fri.tb_id &&
- fa->fa_tos == fri.tos &&
+ fa->fa_dscp == inet_dsfield_to_dscp(fri.tos) &&
fa->fa_info == res.fi &&
fa->fa_type == fri.type) {
fri.offload = READ_ONCE(fa->offload);
@@ -3535,13 +3540,6 @@ static struct ctl_table ipv4_route_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "min_adv_mss",
- .data = &ip_rt_min_advmss,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
@@ -3569,6 +3567,13 @@ static struct ctl_table ipv4_route_netns_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "min_adv_mss",
+ .data = &init_net.ipv4.ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ },
};
@@ -3631,6 +3636,7 @@ static __net_init int netns_ip_rt_init(struct net *net)
/* Set default value for namespaceified sysctls */
net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+ net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
return 0;
}
@@ -3705,6 +3711,7 @@ int __init ip_rt_init(void)
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
+ INIT_LIST_HEAD(&ul->quarantine);
spin_lock_init(&ul->lock);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 97eb54774924..1cae27b5dcd8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -589,6 +589,14 @@ static struct ctl_table ipv4_table[] = {
};
static struct ctl_table ipv4_net_table[] = {
+ /* tcp_max_tw_buckets must be first in this table. */
+ {
+ .procname = "tcp_max_tw_buckets",
+/* .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{
.procname = "icmp_echo_ignore_all",
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
@@ -1001,13 +1009,6 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &two,
},
{
- .procname = "tcp_max_tw_buckets",
- .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
@@ -1400,7 +1401,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!table)
goto err_alloc;
- for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+ /* skip first entry (sysctl_max_tw_buckets) */
+ for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
if (table[i].data) {
/* Update the variables to point into
* the current struct net
@@ -1415,6 +1417,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
}
}
+ table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
+
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (!net->ipv4.ipv4_hdr)
goto err_reg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 02cb275e5487..760e8221d321 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -894,8 +894,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return mss_now;
/* Note : tcp_tso_autosize() will eventually split this later */
- new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
- new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
/* We try hard to avoid divides here */
size_goal = tp->gso_segs * mss_now;
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index ec5550089b4d..02e8626ccb27 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1154,7 +1154,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.set_state = bbr_set_state,
};
-BTF_SET_START(tcp_bbr_kfunc_ids)
+BTF_SET_START(tcp_bbr_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, bbr_init)
@@ -1167,25 +1167,27 @@ BTF_ID(func, bbr_min_tso_segs)
BTF_ID(func, bbr_set_state)
#endif
#endif
-BTF_SET_END(tcp_bbr_kfunc_ids)
+BTF_SET_END(tcp_bbr_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_bbr_check_kfunc_ids,
+};
static int __init bbr_register(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
- ret = tcp_register_congestion_control(&tcp_bbr_cong_ops);
- if (ret)
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}
static void __exit bbr_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index e07837e23b3f..24d562dd6225 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.name = "cubic",
};
-BTF_SET_START(tcp_cubic_kfunc_ids)
+BTF_SET_START(tcp_cubic_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, cubictcp_init)
@@ -496,9 +496,12 @@ BTF_ID(func, cubictcp_cwnd_event)
BTF_ID(func, cubictcp_acked)
#endif
#endif
-BTF_SET_END(tcp_cubic_kfunc_ids)
+BTF_SET_END(tcp_cubic_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_cubic_check_kfunc_ids,
+};
static int __init cubictcp_register(void)
{
@@ -534,16 +537,14 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
- ret = tcp_register_congestion_control(&cubictcp);
- if (ret)
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&cubictcp);
}
static void __exit cubictcp_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
tcp_unregister_congestion_control(&cubictcp);
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 0d7ab3cc7b61..1943a6630341 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -238,7 +238,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.name = "dctcp-reno",
};
-BTF_SET_START(tcp_dctcp_kfunc_ids)
+BTF_SET_START(tcp_dctcp_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, dctcp_init)
@@ -249,25 +249,27 @@ BTF_ID(func, dctcp_cwnd_undo)
BTF_ID(func, dctcp_state)
#endif
#endif
-BTF_SET_END(tcp_dctcp_kfunc_ids)
+BTF_SET_END(tcp_dctcp_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_dctcp_check_kfunc_ids,
+};
static int __init dctcp_register(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
- ret = tcp_register_congestion_control(&dctcp);
- if (ret)
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&dctcp);
}
static void __exit dctcp_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
tcp_unregister_congestion_control(&dctcp);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bfe4112e000c..92e65d56dc2c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6703,7 +6703,8 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
- ireq->smc_ok = rx_opt->smc_ok;
+ ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+ tcp_sk(sk)->smc_hs_congested(sk));
#endif
}
@@ -6725,6 +6726,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
ireq->ireq_family = sk_listener->sk_family;
+ req->timeout = TCP_TIMEOUT_INIT;
}
return req;
@@ -6941,9 +6943,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
- if (!want_cookie)
- inet_csk_reqsk_queue_hash_add(sk, req,
- tcp_timeout_init((struct sock *)req));
+ if (!want_cookie) {
+ req->timeout = tcp_timeout_init((struct sock *)req);
+ inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+ }
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fec656f5a39e..6873f46fc8ba 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -91,6 +91,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
+static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
return secure_tcp_seq(ip_hdr(skb)->daddr,
@@ -206,7 +208,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+ struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -810,7 +812,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
@@ -825,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
transmit_time);
ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
@@ -908,7 +912,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -921,6 +926,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
transmit_time);
ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
}
@@ -3111,41 +3117,18 @@ EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
- int cpu;
+ struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
if (net->ipv4.tcp_congestion_control)
bpf_module_put(net->ipv4.tcp_congestion_control,
net->ipv4.tcp_congestion_control->owner);
-
- for_each_possible_cpu(cpu)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
- free_percpu(net->ipv4.tcp_sk);
+ if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
+ kfree(tcp_death_row);
}
static int __net_init tcp_sk_init(struct net *net)
{
- int res, cpu, cnt;
-
- net->ipv4.tcp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.tcp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- struct sock *sk;
-
- res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
- IPPROTO_TCP, net);
- if (res)
- goto fail;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
- /* Please enforce IP_DF and IPID==0 for RST and
- * ACK sent in SYN-RECV and TIME-WAIT state.
- */
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
-
- *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
- }
+ int cnt;
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3172,9 +3155,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 2;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
+ net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
+ if (!net->ipv4.tcp_death_row)
+ return -ENOMEM;
+ refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
- net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+ net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
+ net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
@@ -3229,18 +3216,12 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.tcp_congestion_control = &tcp_reno;
return 0;
-fail:
- tcp_sk_exit(net);
-
- return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
- inet_twsk_purge(&tcp_hashinfo, AF_INET);
-
list_for_each_entry(net, net_exit_list, exit_list)
tcp_fastopen_ctx_destroy(net);
}
@@ -3326,6 +3307,24 @@ static void __init bpf_iter_register(void)
void __init tcp_v4_init(void)
{
+ int cpu, res;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, &init_net);
+ if (res)
+ panic("Failed to create the TCP control socket.\n");
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Please enforce IP_DF and IPID==0 for RST and
+ * ACK sent in SYN-RECV and TIME-WAIT state.
+ */
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+ per_cpu(ipv4_tcp_sk, cpu) = sk;
+ }
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c2d3ac2363a..6366df7aaf2a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -248,7 +248,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+ struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
tw = inet_twsk_alloc(sk, tcp_death_row, state);
@@ -583,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -622,8 +622,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
!inet_rtx_syn_ack(sk, req)) {
unsigned long expires = jiffies;
- expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
- TCP_RTO_MAX);
+ expires += reqsk_timeout(req, TCP_RTO_MAX);
if (!fastopen)
mod_timer_pending(&req->rsk_timer, expires);
else
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5079832af5c1..e76bf1e9251e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1960,7 +1960,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
bytes = min_t(unsigned long,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ sk->sk_gso_max_size);
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
@@ -4092,7 +4092,9 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl;
int res;
- tcp_rsk(req)->txhash = net_tx_rndhash();
+ /* Paired with WRITE_ONCE() in sock_setsockopt() */
+ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+ tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL);
if (!res) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 090360939401..6b4d8361560f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2093,16 +2093,20 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
+ int drop_reason;
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
+ if (rc == -ENOMEM) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
- else
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
@@ -2120,14 +2124,17 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
@@ -2204,8 +2211,10 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto drop;
+ }
udp_csum_pull_header(skb);
@@ -2213,11 +2222,12 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
return __udp_queue_rcv_skb(sk, skb);
csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return -1;
}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3f23da8c0b10..2f307da17f21 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -146,18 +146,11 @@ static int ipv6_generate_stable_address(struct in6_addr *addr,
#define IN6_ADDR_HSIZE_SHIFT 8
#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT)
-/*
- * Configured unicast address hash table
- */
-static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
-static DEFINE_SPINLOCK(addrconf_hash_lock);
-static void addrconf_verify(void);
-static void addrconf_verify_rtnl(void);
-static void addrconf_verify_work(struct work_struct *);
+static void addrconf_verify(struct net *net);
+static void addrconf_verify_rtnl(struct net *net);
static struct workqueue_struct *addrconf_wq;
-static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
@@ -379,7 +372,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ASSERT_RTNL();
- if (dev->mtu < IPV6_MIN_MTU)
+ if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
return ERR_PTR(-EINVAL);
ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
@@ -416,12 +409,13 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
return ERR_PTR(err);
}
- if (snmp6_register_dev(ndev) < 0) {
- netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
- __func__, dev->name);
- goto err_release;
+ if (dev != blackhole_netdev) {
+ if (snmp6_register_dev(ndev) < 0) {
+ netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
+ __func__, dev->name);
+ goto err_release;
+ }
}
-
/* One reference from device. */
refcount_set(&ndev->refcnt, 1);
@@ -452,25 +446,28 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ipv6_mc_init_dev(ndev);
ndev->tstamp = jiffies;
- err = addrconf_sysctl_register(ndev);
- if (err) {
- ipv6_mc_destroy_dev(ndev);
- snmp6_unregister_dev(ndev);
- goto err_release;
+ if (dev != blackhole_netdev) {
+ err = addrconf_sysctl_register(ndev);
+ if (err) {
+ ipv6_mc_destroy_dev(ndev);
+ snmp6_unregister_dev(ndev);
+ goto err_release;
+ }
}
/* protected by rtnl_lock */
rcu_assign_pointer(dev->ip6_ptr, ndev);
- /* Join interface-local all-node multicast group */
- ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
-
- /* Join all-node multicast group */
- ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+ if (dev != blackhole_netdev) {
+ /* Join interface-local all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
- /* Join all-router multicast group if forwarding is set */
- if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
- ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ /* Join all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+ /* Join all-router multicast group if forwarding is set */
+ if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ }
return ndev;
err_release:
@@ -554,7 +551,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
#ifdef CONFIG_IPV6_MROUTE
if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
- devconf->mc_forwarding) < 0)
+ atomic_read(&devconf->mc_forwarding)) < 0)
goto nla_put_failure;
#endif
if ((all || type == NETCONFA_PROXY_NEIGH) &&
@@ -1011,9 +1008,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
{
struct inet6_ifaddr *ifp;
- hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
if (!dev || ifp->idev->dev == dev)
return true;
@@ -1024,20 +1019,21 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
{
- unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr);
+ struct net *net = dev_net(dev);
+ unsigned int hash = inet6_addr_hash(net, &ifa->addr);
int err = 0;
- spin_lock(&addrconf_hash_lock);
+ spin_lock(&net->ipv6.addrconf_hash_lock);
/* Ignore adding duplicate addresses on an interface */
- if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) {
+ if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
err = -EEXIST;
} else {
- hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
+ hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
}
- spin_unlock(&addrconf_hash_lock);
+ spin_unlock(&net->ipv6.addrconf_hash_lock);
return err;
}
@@ -1261,9 +1257,10 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
static void ipv6_del_addr(struct inet6_ifaddr *ifp)
{
- int state;
enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
+ struct net *net = dev_net(ifp->idev->dev);
unsigned long expires;
+ int state;
ASSERT_RTNL();
@@ -1275,9 +1272,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (state == INET6_IFADDR_STATE_DEAD)
goto out;
- spin_lock_bh(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
hlist_del_init_rcu(&ifp->addr_lst);
- spin_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
write_lock_bh(&ifp->idev->lock);
@@ -1920,10 +1917,8 @@ __ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
if (skip_dev_check)
dev = NULL;
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
ndev = ifp->idev->dev;
- if (!net_eq(dev_net(ndev), net))
- continue;
if (l3mdev_master_dev_rcu(ndev) != l3mdev)
continue;
@@ -2027,9 +2022,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
struct inet6_ifaddr *ifp, *result = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
if (!dev || ifp->idev->dev == dev ||
!(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
@@ -2096,7 +2089,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
- struct net *net = dev_net(ifp->idev->dev);
+ struct net *net = dev_net(idev->dev);
if (addrconf_dad_end(ifp)) {
in6_ifa_put(ifp);
@@ -2675,7 +2668,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
create, now);
in6_ifa_put(ifp);
- addrconf_verify();
+ addrconf_verify(net);
}
return 0;
@@ -2987,7 +2980,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
manage_tempaddrs(idev, ifp, cfg->valid_lft,
cfg->preferred_lft, true, jiffies);
in6_ifa_put(ifp);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
return 0;
} else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
@@ -3027,7 +3020,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
manage_tempaddrs(idev, ifp, 0, 0, false,
jiffies);
ipv6_del_addr(ifp);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
if (ipv6_addr_is_multicast(pfx)) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk,
false, pfx, dev->ifindex);
@@ -3772,9 +3765,9 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister)
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
- struct hlist_head *h = &inet6_addr_lst[i];
+ struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
- spin_lock_bh(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
restart:
hlist_for_each_entry_rcu(ifa, h, addr_lst) {
if (ifa->idev == idev) {
@@ -3790,7 +3783,7 @@ restart:
}
}
}
- spin_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
}
write_lock_bh(&idev->lock);
@@ -4246,7 +4239,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
* before this temporary address becomes deprecated.
*/
if (ifp->flags & IFA_F_TEMPORARY)
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(dev_net(dev));
}
static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
@@ -4288,10 +4281,8 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
}
for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
- hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket],
+ hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket],
addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
/* sync with offset */
if (p < state->offset) {
p++;
@@ -4314,8 +4305,6 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
state->offset++;
return ifa;
}
@@ -4323,9 +4312,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
state->offset = 0;
while (++state->bucket < IN6_ADDR_HSIZE) {
hlist_for_each_entry_rcu(ifa,
- &inet6_addr_lst[state->bucket], addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
+ &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
return ifa;
}
}
@@ -4413,9 +4400,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
int ret = 0;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr) &&
(ifp->flags & IFA_F_HOMEADDRESS)) {
ret = 1;
@@ -4453,9 +4438,7 @@ int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
hash = inet6_addr_hash(net, addr);
hash_found = false;
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
hash_found = true;
@@ -4484,7 +4467,7 @@ int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
* Periodic address status verification
*/
-static void addrconf_verify_rtnl(void)
+static void addrconf_verify_rtnl(struct net *net)
{
unsigned long now, next, next_sec, next_sched;
struct inet6_ifaddr *ifp;
@@ -4496,11 +4479,11 @@ static void addrconf_verify_rtnl(void)
now = jiffies;
next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
- cancel_delayed_work(&addr_chk_work);
+ cancel_delayed_work(&net->ipv6.addr_chk_work);
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
restart:
- hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) {
+ hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
unsigned long age;
/* When setting preferred_lft to a value not zero or
@@ -4599,20 +4582,23 @@ restart:
pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
now, next, next_sec, next_sched);
- mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
rcu_read_unlock_bh();
}
static void addrconf_verify_work(struct work_struct *w)
{
+ struct net *net = container_of(to_delayed_work(w), struct net,
+ ipv6.addr_chk_work);
+
rtnl_lock();
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
rtnl_unlock();
}
-static void addrconf_verify(void)
+static void addrconf_verify(struct net *net)
{
- mod_delayed_work(addrconf_wq, &addr_chk_work, 0);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
}
static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
@@ -4708,7 +4694,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
return 0;
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
+static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
+ struct ifa6_config *cfg)
{
u32 flags;
clock_t expires;
@@ -4822,7 +4809,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
jiffies);
}
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
return 0;
}
@@ -4909,7 +4896,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
!(nlh->nlmsg_flags & NLM_F_REPLACE))
err = -EEXIST;
else
- err = inet6_addr_modify(ifa, &cfg);
+ err = inet6_addr_modify(net, ifa, &cfg);
in6_ifa_put(ifa);
@@ -5533,7 +5520,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
#endif
#ifdef CONFIG_IPV6_MROUTE
- array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+ array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
#endif
array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
@@ -5794,7 +5781,7 @@ update_lft:
write_unlock_bh(&idev->lock);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(dev_net(dev));
return 0;
}
@@ -7111,6 +7098,14 @@ static int __net_init addrconf_init_net(struct net *net)
int err = -ENOMEM;
struct ipv6_devconf *all, *dflt;
+ spin_lock_init(&net->ipv6.addrconf_hash_lock);
+ INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);
+ net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->ipv6.inet6_addr_lst)
+ goto err_alloc_addr;
+
all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
if (!all)
goto err_alloc_all;
@@ -7172,11 +7167,15 @@ err_reg_all:
err_alloc_dflt:
kfree(all);
err_alloc_all:
+ kfree(net->ipv6.inet6_addr_lst);
+err_alloc_addr:
return err;
}
static void __net_exit addrconf_exit_net(struct net *net)
{
+ int i;
+
#ifdef CONFIG_SYSCTL
__addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
NETCONFA_IFINDEX_DEFAULT);
@@ -7184,7 +7183,19 @@ static void __net_exit addrconf_exit_net(struct net *net)
NETCONFA_IFINDEX_ALL);
#endif
kfree(net->ipv6.devconf_dflt);
+ net->ipv6.devconf_dflt = NULL;
kfree(net->ipv6.devconf_all);
+ net->ipv6.devconf_all = NULL;
+
+ cancel_delayed_work_sync(&net->ipv6.addr_chk_work);
+ /*
+ * Check hash table, then free it.
+ */
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON_ONCE(!hlist_empty(&net->ipv6.inet6_addr_lst[i]));
+
+ kfree(net->ipv6.inet6_addr_lst);
+ net->ipv6.inet6_addr_lst = NULL;
}
static struct pernet_operations addrconf_ops = {
@@ -7207,7 +7218,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
int __init addrconf_init(void)
{
struct inet6_dev *idev;
- int i, err;
+ int err;
err = ipv6_addr_label_init();
if (err < 0) {
@@ -7226,26 +7237,8 @@ int __init addrconf_init(void)
goto out_nowq;
}
- /* The addrconf netdev notifier requires that loopback_dev
- * has it's ipv6 private information allocated and setup
- * before it can bring up and give link-local addresses
- * to other devices which are up.
- *
- * Unfortunately, loopback_dev is not necessarily the first
- * entry in the global dev_base list of net devices. In fact,
- * it is likely to be the very last entry on that list.
- * So this causes the notifier registry below to try and
- * give link-local addresses to all devices besides loopback_dev
- * first, then loopback_dev, which cases all the non-loopback_dev
- * devices to fail to get a link-local address.
- *
- * So, as a temporary fix, allocate the ipv6 structure for
- * loopback_dev first by hand.
- * Longer term, all of the dependencies ipv6 has upon the loopback
- * device and it being up should be removed.
- */
rtnl_lock();
- idev = ipv6_add_dev(init_net.loopback_dev);
+ idev = ipv6_add_dev(blackhole_netdev);
rtnl_unlock();
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
@@ -7254,12 +7247,9 @@ int __init addrconf_init(void)
ip6_route_init_special_entries();
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
- INIT_HLIST_HEAD(&inet6_addr_lst[i]);
-
register_netdevice_notifier(&ipv6_dev_notf);
- addrconf_verify();
+ addrconf_verify(&init_net);
rtnl_af_register(&inet6_ops);
@@ -7317,7 +7307,6 @@ out:
void addrconf_cleanup(void)
{
struct net_device *dev;
- int i;
unregister_netdevice_notifier(&ipv6_dev_notf);
unregister_pernet_subsys(&addrconf_ops);
@@ -7335,14 +7324,6 @@ void addrconf_cleanup(void)
}
addrconf_ifdown(init_net.loopback_dev, true);
- /*
- * Check hash table.
- */
- spin_lock_bh(&addrconf_hash_lock);
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
- WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
- spin_unlock_bh(&addrconf_hash_lock);
- cancel_delayed_work(&addr_chk_work);
rtnl_unlock();
destroy_workqueue(addrconf_wq);
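
The addrconf hunks above move the single global inet6_addr_lst hash into per-namespace storage, which is why the iterators no longer need a net_eq() filter on every entry. A toy userspace sketch of that design change (hypothetical types, not kernel code): each namespace owns its own bucket array, so a lookup in one namespace can never see another namespace's addresses.

/* Toy model: per-namespace bucket arrays instead of one shared table
 * that filters by owner on every lookup. Types and names are invented. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HSIZE 16

struct toy_addr {
	unsigned int hash;
	char addr[40];
	struct toy_addr *next;
};

struct toy_net {
	struct toy_addr *buckets[HSIZE];	/* was: one global array + owner check */
};

static void toy_add(struct toy_net *net, const char *addr, unsigned int hash)
{
	struct toy_addr *a = calloc(1, sizeof(*a));

	a->hash = hash % HSIZE;
	strncpy(a->addr, addr, sizeof(a->addr) - 1);
	a->next = net->buckets[a->hash];
	net->buckets[a->hash] = a;	/* no net_eq() needed later: table is per-net */
}

static struct toy_addr *toy_lookup(struct toy_net *net, const char *addr,
				   unsigned int hash)
{
	struct toy_addr *a;

	for (a = net->buckets[hash % HSIZE]; a; a = a->next)
		if (!strcmp(a->addr, addr))
			return a;
	return NULL;
}

int main(void)
{
	struct toy_net net_a = { 0 }, net_b = { 0 };

	toy_add(&net_a, "fe80::1", 7);
	printf("net_a: %s\n", toy_lookup(&net_a, "fe80::1", 7) ? "hit" : "miss");
	printf("net_b: %s\n", toy_lookup(&net_b, "fe80::1", 7) ? "hit" : "miss");
	return 0;
}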
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 77e34aec7e82..658d5eabaf7e 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1344,14 +1344,14 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return opt2;
}
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
- struct ipv6_txoptions *opt)
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
{
/*
* ignore the dest before srcrt unless srcrt is being included.
* --yoshfuji
*/
- if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt->dst0opt && !opt->srcrt) {
if (opt_space != opt) {
memcpy(opt_space, opt, sizeof(*opt_space));
opt = opt_space;
@@ -1362,7 +1362,7 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
return opt;
}
-EXPORT_SYMBOL_GPL(ipv6_fixup_options);
+EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
/**
* fl6_update_dst - update flowi destination address with info given
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index ec029c86ae06..7c2003833010 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -16,6 +16,7 @@
#include <linux/indirect_call_wrapper.h>
#include <net/fib_rules.h>
+#include <net/inet_dscp.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
@@ -25,14 +26,14 @@ struct fib6_rule {
struct fib_rule common;
struct rt6key src;
struct rt6key dst;
- u8 tclass;
+ dscp_t dscp;
};
static bool fib6_rule_matchall(const struct fib_rule *rule)
{
struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
- if (r->dst.plen || r->src.plen || r->tclass)
+ if (r->dst.plen || r->src.plen || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -323,7 +324,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
return 0;
}
- if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
+ if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
@@ -349,6 +350,13 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ rule6->dscp = inet_dsfield_to_dscp(frh->tos);
+
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid table");
@@ -369,7 +377,6 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule6->src.plen = frh->src_len;
rule6->dst.plen = frh->dst_len;
- rule6->tclass = frh->tos;
if (fib_rule_requires_fldissect(rule))
net->ipv6.fib6_rules_require_fldissect++;
@@ -402,7 +409,7 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
return 0;
- if (frh->tos && (rule6->tclass != frh->tos))
+ if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos)
return 0;
if (frh->src_len &&
@@ -423,7 +430,7 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule6->dst.plen;
frh->src_len = rule6->src.plen;
- frh->tos = rule6->tclass;
+ frh->tos = inet_dscp_to_dsfield(rule6->dscp);
if ((rule6->dst.plen &&
nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
@@ -486,16 +493,21 @@ out_fib6_rules_ops:
goto out;
}
-static void __net_exit fib6_rules_net_exit(struct net *net)
+static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ list_for_each_entry(net, net_list, exit_list) {
+ fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ cond_resched();
+ }
rtnl_unlock();
}
static struct pernet_operations fib6_rules_net_ops = {
.init = fib6_rules_net_init,
- .exit = fib6_rules_net_exit,
+ .exit_batch = fib6_rules_net_exit_batch,
};
int __init fib6_rules_init(void)
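
The dscp_t conversion above rejects rules whose tos value has ECN bits set and stores only the DSCP part of the dsfield. A minimal userspace sketch of that validation and conversion; the mask values mirror the kernel's DSCP/ECN split (high six bits vs. low two) and are assumptions here, not copied from the patch.

/* Sketch: validate and convert a dsfield the way fib6_rule_configure()
 * now does, assuming DSCP occupies the high six bits and ECN the low two. */
#include <stdbool.h>
#include <stdio.h>

#define TOY_ECN_MASK  0x03	/* low two bits: ECN, must be zero in a rule */
#define TOY_DSCP_MASK 0xfc	/* high six bits: DSCP, stored as-is */

static bool toy_validate_dscp(unsigned char dsfield)
{
	return (dsfield & TOY_ECN_MASK) == 0;
}

static unsigned char toy_dsfield_to_dscp(unsigned char dsfield)
{
	return dsfield & TOY_DSCP_MASK;
}

int main(void)
{
	unsigned char ok = 0x28, bad = 0x29;	/* 0x29 has an ECN bit set */

	printf("0x%02x -> %s\n", ok, toy_validate_dscp(ok) ? "valid" : "invalid");
	printf("0x%02x -> %s\n", bad, toy_validate_dscp(bad) ? "valid" : "invalid");
	printf("stored dscp: 0x%02x\n", toy_dsfield_to_dscp(ok));
	return 0;
}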
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 96c5cc0f30ce..e6b978ea0e87 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -69,17 +69,7 @@
#include <linux/uaccess.h>
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmpv6_sk(struct net *net)
-{
- return this_cpu_read(*net->ipv6.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);
static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
@@ -110,11 +100,11 @@ static const struct inet6_protocol icmpv6_protocol = {
};
/* Called with BH disabled */
-static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
+static struct sock *icmpv6_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmpv6_sk(net);
+ sk = this_cpu_read(ipv6_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path (f.e. SIT or
* ip6ip6 tunnel) signals dst_link_failure() for an
@@ -122,11 +112,13 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
-static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
+static void icmpv6_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
@@ -1034,59 +1026,27 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
-static void __net_exit icmpv6_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv6.icmp_sk, i));
- free_percpu(net->ipv6.icmp_sk);
-}
-
-static int __net_init icmpv6_sk_init(struct net *net)
+int __init icmpv6_init(void)
{
struct sock *sk;
int err, i;
- net->ipv6.icmp_sk = alloc_percpu(struct sock *);
- if (!net->ipv6.icmp_sk)
- return -ENOMEM;
-
for_each_possible_cpu(i) {
err = inet_ctl_sock_create(&sk, PF_INET6,
- SOCK_RAW, IPPROTO_ICMPV6, net);
+ SOCK_RAW, IPPROTO_ICMPV6, &init_net);
if (err < 0) {
pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
err);
- goto fail;
+ return err;
}
- *per_cpu_ptr(net->ipv6.icmp_sk, i) = sk;
+ per_cpu(ipv6_icmp_sk, i) = sk;
/* Enough space for 2 64K ICMP packets, including
* sk_buff struct overhead.
*/
sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
}
- return 0;
-
- fail:
- icmpv6_sk_exit(net);
- return err;
-}
-
-static struct pernet_operations icmpv6_sk_ops = {
- .init = icmpv6_sk_init,
- .exit = icmpv6_sk_exit,
-};
-
-int __init icmpv6_init(void)
-{
- int err;
-
- err = register_pernet_subsys(&icmpv6_sk_ops);
- if (err < 0)
- return err;
err = -EAGAIN;
if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
@@ -1101,14 +1061,12 @@ sender_reg_err:
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
fail:
pr_err("Failed to register ICMP6 protocol\n");
- unregister_pernet_subsys(&icmpv6_sk_ops);
return err;
}
void icmpv6_cleanup(void)
{
inet6_unregister_icmp_sender(icmp6_send);
- unregister_pernet_subsys(&icmpv6_sk_ops);
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 4514444e96c8..4740afecf7c6 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
index f90a87389fcc..f6f5b83dd954 100644
--- a/net/ipv6/ioam6_iptunnel.c
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -32,13 +32,25 @@ struct ioam6_lwt_encap {
struct ioam6_trace_hdr traceh;
} __packed;
+struct ioam6_lwt_freq {
+ u32 k;
+ u32 n;
+};
+
struct ioam6_lwt {
struct dst_cache cache;
+ struct ioam6_lwt_freq freq;
+ atomic_t pkt_cnt;
u8 mode;
struct in6_addr tundst;
struct ioam6_lwt_encap tuninfo;
};
+static struct netlink_range_validation freq_range = {
+ .min = IOAM6_IPTUNNEL_FREQ_MIN,
+ .max = IOAM6_IPTUNNEL_FREQ_MAX,
+};
+
static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
{
return (struct ioam6_lwt *)lwt->data;
@@ -55,6 +67,8 @@ static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
}
static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
+ [IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
+ [IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
[IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
IOAM6_IPTUNNEL_MODE_MIN,
IOAM6_IPTUNNEL_MODE_MAX),
@@ -96,6 +110,7 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
struct lwtunnel_state *lwt;
struct ioam6_lwt *ilwt;
int len_aligned, err;
+ u32 freq_k, freq_n;
u8 mode;
if (family != AF_INET6)
@@ -106,6 +121,23 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
if (err < 0)
return err;
+ if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
+ (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
+ NL_SET_ERR_MSG(extack, "freq: missing parameter");
+ return -EINVAL;
+ } else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
+ freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
+ freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
+ } else {
+ freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
+ freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);
+
+ if (freq_k > freq_n) {
+ NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
+ return -EINVAL;
+ }
+ }
+
if (!tb[IOAM6_IPTUNNEL_MODE])
mode = IOAM6_IPTUNNEL_MODE_INLINE;
else
@@ -140,6 +172,10 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
return err;
}
+ atomic_set(&ilwt->pkt_cnt, 0);
+ ilwt->freq.k = freq_k;
+ ilwt->freq.n = freq_n;
+
ilwt->mode = mode;
if (tb[IOAM6_IPTUNNEL_DST])
ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);
@@ -263,11 +299,18 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct in6_addr orig_daddr;
struct ioam6_lwt *ilwt;
int err = -EINVAL;
+ u32 pkt_cnt;
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
ilwt = ioam6_lwt_state(dst->lwtstate);
+
+ /* Check for insertion frequency (i.e., "k over n" insertions) */
+ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
+ if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
+ goto out;
+
orig_daddr = ipv6_hdr(skb)->daddr;
switch (ilwt->mode) {
@@ -358,6 +401,14 @@ static int ioam6_fill_encap_info(struct sk_buff *skb,
struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
int err;
+ err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
+ if (err)
+ goto ret;
+
+ err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
+ if (err)
+ goto ret;
+
err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
if (err)
goto ret;
@@ -379,7 +430,9 @@ static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
int nlsize;
- nlsize = nla_total_size(sizeof(ilwt->mode)) +
+ nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
+ nla_total_size(sizeof(ilwt->freq.n)) +
+ nla_total_size(sizeof(ilwt->mode)) +
nla_total_size(sizeof(ilwt->tuninfo.traceh));
if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE)
@@ -395,7 +448,9 @@ static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);
- return (ilwt_a->mode != ilwt_b->mode ||
+ return (ilwt_a->freq.k != ilwt_b->freq.k ||
+ ilwt_a->freq.n != ilwt_b->freq.n ||
+ ilwt_a->mode != ilwt_b->mode ||
(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
!ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
trace_a->namespace_id != trace_b->namespace_id);
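
The new "k over n" insertion frequency in ioam6_output() adds IOAM data only to the first k packets of every window of n, based on the per-tunnel packet counter. A small userspace sketch of that selection logic:

/* Sketch of the "k over n" sampling used above: with k = 2, n = 5,
 * packets 0,1, 5,6, 10,11, ... get IOAM data, the rest pass through. */
#include <stdbool.h>
#include <stdio.h>

static bool ioam_insert(unsigned int pkt_cnt, unsigned int k, unsigned int n)
{
	return (pkt_cnt % n) < k;
}

int main(void)
{
	unsigned int k = 2, n = 5, cnt;

	for (cnt = 0; cnt < 10; cnt++)
		printf("packet %u: %s\n", cnt,
		       ioam_insert(cnt, k, n) ? "insert IOAM" : "pass through");
	return 0;
}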
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 80256717868e..d4b1e2c5aa76 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -508,7 +508,7 @@ int ip6_mc_input(struct sk_buff *skb)
/*
* IPv6 multicast router mode is now supported ;)
*/
- if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+ if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
!(ipv6_addr_type(&hdr->daddr) &
(IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index b29e9ba5e113..d37a79a8554e 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -249,7 +249,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
if ((first_word & htonl(0xF00FFFFF)) ||
!ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
!ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
- *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) {
+ iph->nexthdr != iph2->nexthdr) {
not_same_flow:
NAPI_GRO_CB(p)->same_flow = 0;
continue;
@@ -260,7 +260,8 @@ not_same_flow:
goto not_same_flow;
}
/* flush if Traffic Class fields are different */
- NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
+ NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
+ (__force __be32)(iph->hop_limit ^ iph2->hop_limit));
NAPI_GRO_CB(p)->flush |= flush;
/* If the previous IP ID value was based on an atomic
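
The GRO change above additionally flushes when hop limits differ: OR-ing the masked Traffic Class bits of the XORed first words with the XOR of the two hop limits is non-zero whenever either field differs. A tiny sketch of that single-expression test, in host byte order for simplicity (the kernel works on the big-endian on-wire words):

/* Sketch: flush if Traffic Class bits differ or hop limits differ,
 * mirroring the combined test in ipv6_gro_receive(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool gro_flush(uint32_t first_word_xor, uint8_t hlim_a, uint8_t hlim_b)
{
	/* first_word_xor is the XOR of the two IPv6 first words (host order
	 * here); 0x0FF00000 covers the Traffic Class field. */
	return !!((first_word_xor & 0x0FF00000u) | (uint32_t)(hlim_a ^ hlim_b));
}

int main(void)
{
	printf("same tclass, same hlim: %d\n", gro_flush(0x00000000, 64, 64));
	printf("same tclass, diff hlim: %d\n", gro_flush(0x00000000, 64, 63));
	printf("diff tclass, same hlim: %d\n", gro_flush(0x00100000, 64, 64));
	return 0;
}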
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2995f8d89e7e..3286b64ec03d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1350,11 +1350,16 @@ static void ip6_append_data_mtu(unsigned int *mtu,
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
- struct rt6_info *rt, struct flowi6 *fl6)
+ struct rt6_info *rt)
{
struct ipv6_pinfo *np = inet6_sk(sk);
unsigned int mtu;
- struct ipv6_txoptions *opt = ipc6->opt;
+ struct ipv6_txoptions *nopt, *opt = ipc6->opt;
+
+ /* callers pass dst together with a reference, set it first so
+ * ip6_cork_release() can put it down even in case of an error.
+ */
+ cork->base.dst = &rt->dst;
/*
* setup for corking
@@ -1363,39 +1368,32 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (WARN_ON(v6_cork->opt))
return -EINVAL;
- v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
- if (unlikely(!v6_cork->opt))
+ nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+ if (unlikely(!nopt))
return -ENOBUFS;
- v6_cork->opt->tot_len = sizeof(*opt);
- v6_cork->opt->opt_flen = opt->opt_flen;
- v6_cork->opt->opt_nflen = opt->opt_nflen;
+ nopt->tot_len = sizeof(*opt);
+ nopt->opt_flen = opt->opt_flen;
+ nopt->opt_nflen = opt->opt_nflen;
- v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
- sk->sk_allocation);
- if (opt->dst0opt && !v6_cork->opt->dst0opt)
+ nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
+ if (opt->dst0opt && !nopt->dst0opt)
return -ENOBUFS;
- v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
- sk->sk_allocation);
- if (opt->dst1opt && !v6_cork->opt->dst1opt)
+ nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
+ if (opt->dst1opt && !nopt->dst1opt)
return -ENOBUFS;
- v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
- sk->sk_allocation);
- if (opt->hopopt && !v6_cork->opt->hopopt)
+ nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
+ if (opt->hopopt && !nopt->hopopt)
return -ENOBUFS;
- v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
- sk->sk_allocation);
- if (opt->srcrt && !v6_cork->opt->srcrt)
+ nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
+ if (opt->srcrt && !nopt->srcrt)
return -ENOBUFS;
/* need source address above miyazawa*/
}
- dst_hold(&rt->dst);
- cork->base.dst = &rt->dst;
- cork->fl.u.ip6 = *fl6;
v6_cork->hop_limit = ipc6->hlimit;
v6_cork->tclass = ipc6->tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
@@ -1426,9 +1424,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
}
static int __ip6_append_data(struct sock *sk,
- struct flowi6 *fl6,
struct sk_buff_head *queue,
- struct inet_cork *cork,
+ struct inet_cork_full *cork_full,
struct inet6_cork *v6_cork,
struct page_frag *pfrag,
int getfrag(void *from, char *to, int offset,
@@ -1437,6 +1434,8 @@ static int __ip6_append_data(struct sock *sk,
unsigned int flags, struct ipcm6_cookie *ipc6)
{
struct sk_buff *skb, *skb_prev = NULL;
+ struct inet_cork *cork = &cork_full->base;
+ struct flowi6 *fl6 = &cork_full->fl.u.ip6;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
@@ -1488,6 +1487,7 @@ static int __ip6_append_data(struct sock *sk,
if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
(sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_ICMPV6 ||
sk->sk_protocol == IPPROTO_RAW)) {
ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
sizeof(struct ipv6hdr));
@@ -1788,34 +1788,46 @@ int ip6_append_data(struct sock *sk,
/*
* setup for corking
*/
+ dst_hold(&rt->dst);
err = ip6_setup_cork(sk, &inet->cork, &np->cork,
- ipc6, rt, fl6);
+ ipc6, rt);
if (err)
return err;
+ inet->cork.fl.u.ip6 = *fl6;
exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
length += exthdrlen;
transhdrlen += exthdrlen;
} else {
- fl6 = &inet->cork.fl.u.ip6;
transhdrlen = 0;
}
- return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
+ return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
&np->cork, sk_page_frag(sk), getfrag,
from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
+static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
+{
+ struct dst_entry *dst = cork->base.dst;
+
+ cork->base.dst = NULL;
+ cork->base.flags &= ~IPCORK_ALLFRAG;
+ skb_dst_set(skb, dst);
+}
+
static void ip6_cork_release(struct inet_cork_full *cork,
struct inet6_cork *v6_cork)
{
if (v6_cork->opt) {
- kfree(v6_cork->opt->dst0opt);
- kfree(v6_cork->opt->dst1opt);
- kfree(v6_cork->opt->hopopt);
- kfree(v6_cork->opt->srcrt);
- kfree(v6_cork->opt);
+ struct ipv6_txoptions *opt = v6_cork->opt;
+
+ kfree(opt->dst0opt);
+ kfree(opt->dst1opt);
+ kfree(opt->hopopt);
+ kfree(opt->srcrt);
+ kfree(opt);
v6_cork->opt = NULL;
}
@@ -1824,7 +1836,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
cork->base.dst = NULL;
cork->base.flags &= ~IPCORK_ALLFRAG;
}
- memset(&cork->fl, 0, sizeof(cork->fl));
}
struct sk_buff *__ip6_make_skb(struct sock *sk,
@@ -1834,7 +1845,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
- struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+ struct in6_addr *final_dst;
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
@@ -1864,9 +1875,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
/* Allow local fragmentation. */
skb->ignore_df = ip6_sk_ignore_df(sk);
-
- *final_dst = fl6->daddr;
__skb_pull(skb, skb_network_header_len(skb));
+
+ final_dst = &fl6->daddr;
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
@@ -1886,10 +1897,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
skb->priority = sk->sk_priority;
skb->mark = cork->base.mark;
-
skb->tstamp = cork->base.transmit_time;
- skb_dst_set(skb, dst_clone(&rt->dst));
+ ip6_cork_steal_dst(skb, cork);
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if (proto == IPPROTO_ICMPV6) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
@@ -1961,26 +1971,26 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags,
- struct inet_cork_full *cork)
+ struct ipcm6_cookie *ipc6, struct rt6_info *rt,
+ unsigned int flags, struct inet_cork_full *cork)
{
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
int err;
- if (flags & MSG_PROBE)
+ if (flags & MSG_PROBE) {
+ dst_release(&rt->dst);
return NULL;
+ }
__skb_queue_head_init(&queue);
cork->base.flags = 0;
cork->base.addr = 0;
cork->base.opt = NULL;
- cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
if (err) {
ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
@@ -1988,7 +1998,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
+ err = __ip6_append_data(sk, &queue, cork, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 97ade833f58c..53f632a560ec 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1121,6 +1121,14 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
+ } else if (skb->protocol == htons(ETH_P_IP)) {
+ const struct rtable *rt = skb_rtable(skb);
+
+ if (!rt)
+ goto tx_err_link_failure;
+
+ if (rt->rt_gw_family == AF_INET6)
+ memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr));
}
} else if (t->parms.proto != 0 && !(t->parms.flags &
(IP6_TNL_F_USE_ORIG_TCLASS |
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 8a2db926b5eb..0ebaaec3faf9 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -255,13 +255,12 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
list_del(&mrt->list);
ip6mr_free_table(mrt);
}
fib_rules_unregister(net->ipv6.mr6_rules_ops);
- rtnl_unlock();
}
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -318,10 +317,9 @@ static int __net_init ip6mr_rules_init(struct net *net)
static void __net_exit ip6mr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ip6mr_free_table(net->ipv6.mrt6);
net->ipv6.mrt6 = NULL;
- rtnl_unlock();
}
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -734,7 +732,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
in6_dev = __in6_dev_get(dev);
if (in6_dev) {
- in6_dev->cnf.mc_forwarding--;
+ atomic_dec(&in6_dev->cnf.mc_forwarding);
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in6_dev->cnf);
@@ -902,7 +900,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
in6_dev = __in6_dev_get(dev);
if (in6_dev) {
- in6_dev->cnf.mc_forwarding++;
+ atomic_inc(&in6_dev->cnf.mc_forwarding);
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in6_dev->cnf);
@@ -1325,7 +1323,9 @@ static int __net_init ip6mr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip6_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ip6mr_rules_exit(net);
+ rtnl_unlock();
#endif
ip6mr_rules_fail:
ip6mr_notifier_exit(net);
@@ -1338,13 +1338,23 @@ static void __net_exit ip6mr_net_exit(struct net *net)
remove_proc_entry("ip6_mr_cache", net->proc_net);
remove_proc_entry("ip6_mr_vif", net->proc_net);
#endif
- ip6mr_rules_exit(net);
ip6mr_notifier_exit(net);
}
+static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip6mr_rules_exit(net);
+ rtnl_unlock();
+}
+
static struct pernet_operations ip6mr_net_ops = {
.init = ip6mr_net_init,
.exit = ip6mr_net_exit,
+ .exit_batch = ip6mr_net_exit_batch,
};
int __init ip6_mr_init(void)
@@ -1553,7 +1563,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
} else {
rcu_assign_pointer(mrt->mroute_sk, sk);
sock_set_flag(sk, SOCK_RCU_FREE);
- net->ipv6.devconf_all->mc_forwarding++;
+ atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
}
write_unlock_bh(&mrt_lock);
@@ -1569,14 +1579,19 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
int ip6mr_sk_done(struct sock *sk)
{
- int err = -EACCES;
struct net *net = sock_net(sk);
+ struct ipv6_devconf *devconf;
struct mr_table *mrt;
+ int err = -EACCES;
if (sk->sk_type != SOCK_RAW ||
inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return err;
+ devconf = net->ipv6.devconf_all;
+ if (!devconf || !atomic_read(&devconf->mc_forwarding))
+ return err;
+
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
@@ -1586,7 +1601,7 @@ int ip6mr_sk_done(struct sock *sk)
* so the RCU grace period before sk freeing
* is guaranteed by sk_destruct()
*/
- net->ipv6.devconf_all->mc_forwarding--;
+ atomic_dec(&devconf->mc_forwarding);
write_unlock_bh(&mrt_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f03b597e4121..1c06d0cd02f7 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1337,8 +1337,12 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
neigh->flags |= NTF_ROUTER;
- } else if (rt) {
+ } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
+ struct nl_info nlinfo = {
+ .nl_net = net,
+ };
rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
}
if (rt)
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 9256f6ba87ef..ff033d16549e 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -59,8 +59,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct pingfakehdr pfh;
struct ipcm6_cookie ipc6;
- pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
-
err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
sizeof(user_icmph));
if (err)
@@ -99,7 +97,25 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
return -EINVAL;
- /* TODO: use ip6_datagram_send_ctl to get options from cmsg */
+ ipcm6_init_sk(&ipc6, np);
+ ipc6.sockc.tsflags = sk->sk_tsflags;
+ ipc6.sockc.mark = sk->sk_mark;
+
+ if (msg->msg_controllen) {
+ struct ipv6_txoptions opt = {};
+
+ opt.tot_len = sizeof(opt);
+ ipc6.opt = &opt;
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
+ if (err < 0)
+ return err;
+
+ /* Changes to txoptions and flow info are not implemented, yet.
+ * Drop the options, fl6 is wiped below.
+ */
+ ipc6.opt = NULL;
+ }
memset(&fl6, 0, sizeof(fl6));
@@ -107,14 +123,12 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.saddr = np->saddr;
fl6.daddr = *daddr;
fl6.flowi6_oif = oif;
- fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_mark = ipc6.sockc.mark;
fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- ipcm6_init_sk(&ipc6, np);
- ipc6.sockc.mark = sk->sk_mark;
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
@@ -136,7 +150,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.wcheck = 0;
pfh.family = AF_INET6;
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
lock_sock(sk);
err = ip6_append_data(sk, ping_getfrag, &pfh, len,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ea1cf414a92e..6188712f24b0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -130,6 +130,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
struct uncached_list {
spinlock_t lock;
struct list_head head;
+ struct list_head quarantine;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
@@ -149,35 +150,34 @@ void rt6_uncached_list_del(struct rt6_info *rt)
{
if (!list_empty(&rt->rt6i_uncached)) {
struct uncached_list *ul = rt->rt6i_uncached_list;
- struct net *net = dev_net(rt->dst.dev);
spin_lock_bh(&ul->lock);
- list_del(&rt->rt6i_uncached);
- atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
+ list_del_init(&rt->rt6i_uncached);
spin_unlock_bh(&ul->lock);
}
}
-static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
+static void rt6_uncached_list_flush_dev(struct net_device *dev)
{
- struct net_device *loopback_dev = net->loopback_dev;
int cpu;
- if (dev == loopback_dev)
- return;
-
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
- struct rt6_info *rt;
+ struct rt6_info *rt, *safe;
+
+ if (list_empty(&ul->head))
+ continue;
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
struct inet6_dev *rt_idev = rt->rt6i_idev;
struct net_device *rt_dev = rt->dst.dev;
+ bool handled = false;
if (rt_idev->dev == dev) {
- rt->rt6i_idev = in6_dev_get(loopback_dev);
+ rt->rt6i_idev = in6_dev_get(blackhole_netdev);
in6_dev_put(rt_idev);
+ handled = true;
}
if (rt_dev == dev) {
@@ -185,7 +185,11 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
dev_replace_track(rt_dev, blackhole_netdev,
&rt->dst.dev_tracker,
GFP_ATOMIC);
+ handled = true;
}
+ if (handled)
+ list_move(&rt->rt6i_uncached,
+ &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
}
@@ -373,13 +377,12 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
struct rt6_info *rt = (struct rt6_info *)dst;
struct inet6_dev *idev = rt->rt6i_idev;
- struct net_device *loopback_dev =
- dev_net(dev)->loopback_dev;
- if (idev && idev->dev != loopback_dev) {
- struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
- if (loopback_idev) {
- rt->rt6i_idev = loopback_idev;
+ if (idev && idev->dev != blackhole_netdev) {
+ struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
+
+ if (blackhole_idev) {
+ rt->rt6i_idev = blackhole_idev;
in6_dev_put(idev);
}
}
@@ -2244,7 +2247,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
* if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
rt6_uncached_list_add(rt);
- atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
rcu_read_unlock();
return rt;
@@ -3287,7 +3289,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
* do proper release of the net_device
*/
rt6_uncached_list_add(rt);
- atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
@@ -4896,7 +4897,7 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
rt6_sync_down_dev(dev, event);
- rt6_uncached_list_flush_dev(dev_net(dev), dev);
+ rt6_uncached_list_flush_dev(dev);
neigh_ifdown(&nd_tbl, dev);
}
@@ -5009,6 +5010,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
err = -EINVAL;
rtm = nlmsg_data(nlh);
+ if (rtm->rtm_tos) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): option not available for IPv6");
+ goto errout;
+ }
+
*cfg = (struct fib6_config){
.fc_table = rtm->rtm_table,
.fc_dst_len = rtm->rtm_dst_len,
@@ -6731,6 +6738,7 @@ int __init ip6_route_init(void)
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
+ INIT_LIST_HEAD(&ul->quarantine);
spin_lock_init(&ul->lock);
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 075ee8a2df3b..0c648bf07f39 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -148,6 +148,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_timewait_death_row *tcp_death_row;
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
@@ -156,7 +157,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct dst_entry *dst;
int addr_type;
int err;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -308,6 +308,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
+ tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
err = inet6_hash_connect(tcp_death_row, sk);
if (err)
goto late_failure;
@@ -2237,15 +2238,9 @@ static void __net_exit tcpv6_net_exit(struct net *net)
inet_ctl_sock_destroy(net->ipv6.tcp_sk);
}
-static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&tcp_hashinfo, AF_INET6);
-}
-
static struct pernet_operations tcpv6_net_ops = {
.init = tcpv6_net_init,
.exit = tcpv6_net_exit,
- .exit_batch = tcpv6_net_exit_batch,
};
int __init tcpv6_init(void)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 528b81ef19c9..7f0fa9bd9ffe 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -912,6 +912,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
@@ -988,6 +989,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
return udp6_unicast_rcv_skb(sk, skb, uh);
}
+ reason = SKB_DROP_REASON_NO_SOCKET;
+
if (!uh->check)
goto report_csum_error;
@@ -1000,10 +1003,12 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
__UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
short_packet:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
proto == IPPROTO_UDPLITE ? "-Lite" : "",
saddr, ntohs(uh->source),
@@ -1014,10 +1019,12 @@ short_packet:
report_csum_error:
udp6_csum_zero_error(skb);
csum_error:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_UDP_CSUM;
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
discard:
__UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
}
@@ -1266,23 +1273,17 @@ static int udp_v6_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb;
struct udp_sock *up = udp_sk(sk);
- struct flowi6 fl6;
int err = 0;
if (up->pending == AF_INET)
return udp_push_pending_frames(sk);
- /* ip6_finish_skb will release the cork, so make a copy of
- * fl6 here.
- */
- fl6 = inet_sk(sk)->cork.fl.u.ip6;
-
skb = ip6_finish_skb(sk);
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
-
+ err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
+ &inet_sk(sk)->cork.base);
out:
up->len = 0;
up->pending = 0;
@@ -1300,7 +1301,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipv6_txoptions *opt = NULL;
struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
- struct flowi6 fl6;
+ struct inet_cork_full cork;
+ struct flowi6 *fl6 = &cork.fl.u.ip6;
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
@@ -1363,9 +1365,6 @@ do_udp_sendmsg:
}
}
- if (up->pending == AF_INET)
- return udp_sendmsg(sk, msg, len);
-
/* Rough check on arithmetic overflow,
better check is made in ip6_append_data().
*/
@@ -1374,6 +1373,8 @@ do_udp_sendmsg:
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
if (up->pending) {
+ if (up->pending == AF_INET)
+ return udp_sendmsg(sk, msg, len);
/*
* There are pending frames.
* The socket lock must be held while it's corked.
@@ -1391,19 +1392,19 @@ do_udp_sendmsg:
}
ulen += sizeof(struct udphdr);
- memset(&fl6, 0, sizeof(fl6));
+ memset(fl6, 0, sizeof(*fl6));
if (sin6) {
if (sin6->sin6_port == 0)
return -EINVAL;
- fl6.fl6_dport = sin6->sin6_port;
+ fl6->fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
if (np->sndflow) {
- fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
}
@@ -1420,24 +1421,24 @@ do_udp_sendmsg:
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
__ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
- fl6.flowi6_oif = sin6->sin6_scope_id;
+ fl6->flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- fl6.fl6_dport = inet->inet_dport;
+ fl6->fl6_dport = inet->inet_dport;
daddr = &sk->sk_v6_daddr;
- fl6.flowlabel = np->flow_label;
+ fl6->flowlabel = np->flow_label;
connected = true;
}
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = sk->sk_bound_dev_if;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = sk->sk_bound_dev_if;
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
- fl6.flowi6_uid = sk->sk_uid;
+ fl6->flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
@@ -1447,14 +1448,14 @@ do_udp_sendmsg:
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
if (err > 0)
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
&ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
}
@@ -1471,16 +1472,17 @@ do_udp_sendmsg:
opt = ipv6_fixup_options(&opt_space, opt);
ipc6.opt = opt;
- fl6.flowi6_proto = sk->sk_protocol;
- fl6.flowi6_mark = ipc6.sockc.mark;
- fl6.daddr = *daddr;
- if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
- fl6.saddr = np->saddr;
- fl6.fl6_sport = inet->inet_sport;
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->flowi6_mark = ipc6.sockc.mark;
+ fl6->daddr = *daddr;
+ if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
+ fl6->saddr = np->saddr;
+ fl6->fl6_sport = inet->inet_sport;
if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
- (struct sockaddr *)sin6, &fl6.saddr);
+ (struct sockaddr *)sin6,
+ &fl6->saddr);
if (err)
goto out_no_dst;
if (sin6) {
@@ -1496,32 +1498,32 @@ do_udp_sendmsg:
err = -EINVAL;
goto out_no_dst;
}
- fl6.fl6_dport = sin6->sin6_port;
- fl6.daddr = sin6->sin6_addr;
+ fl6->fl6_dport = sin6->sin6_port;
+ fl6->daddr = sin6->sin6_addr;
}
}
- if (ipv6_addr_any(&fl6.daddr))
- fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6->daddr))
+ fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- final_p = fl6_update_dst(&fl6, opt, &final);
+ final_p = fl6_update_dst(fl6, opt, &final);
if (final_p)
connected = false;
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
- fl6.flowi6_oif = np->mcast_oif;
+ if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
+ fl6->flowi6_oif = np->mcast_oif;
connected = false;
- } else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ } else if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
- fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+ fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
- dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
+ dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
@@ -1529,7 +1531,7 @@ do_udp_sendmsg:
}
if (ipc6.hlimit < 0)
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -1537,17 +1539,17 @@ back_from_confirm:
/* Lockless fast path for the non-corking case */
if (!corkreq) {
- struct inet_cork_full cork;
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
- &fl6, (struct rt6_info *)dst,
+ (struct rt6_info *)dst,
msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6, &cork.base);
- goto out;
+ err = udp_v6_send_skb(skb, fl6, &cork.base);
+ /* ip6_make_skb steals dst reference */
+ goto out_no_dst;
}
lock_sock(sk);
@@ -1568,7 +1570,7 @@ do_append_data:
ipc6.dontfrag = np->dontfrag;
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
- &ipc6, &fl6, (struct rt6_info *)dst,
+ &ipc6, fl6, (struct rt6_info *)dst,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
@@ -1603,7 +1605,7 @@ out_no_dst:
do_confirm:
if (msg->msg_flags & MSG_PROBE)
- dst_confirm_neigh(dst, &fl6.daddr);
+ dst_confirm_neigh(dst, &fl6->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index fad687ee6dd8..55bb2cbae13d 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -92,7 +92,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt6.rt6i_src = rt->rt6i_src;
INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
rt6_uncached_list_add(&xdst->u.rt6);
- atomic_inc(&dev_net(dev)->ipv6.rt6_stats->fib_rt_uncache);
return 0;
}
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
index 26d2f8ba7029..64e8ce9760b9 100644
--- a/net/mac80211/airtime.c
+++ b/net/mac80211/airtime.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: ISC
/*
* Copyright (C) 2019 Felix Fietkau <nbd@nbd.name>
+ * Copyright (C) 2021 Intel Corporation
*/
#include <net/mac80211.h>
@@ -67,17 +68,11 @@
#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */
#define IEEE80211_HE_MAX_STREAMS 8
-#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */
#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
IEEE80211_HT_STREAM_GROUPS)
#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
IEEE80211_VHT_STREAM_GROUPS)
-#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \
- IEEE80211_HE_STREAM_GROUPS)
-#define IEEE80211_GROUPS_NB (IEEE80211_HT_GROUPS_NB + \
- IEEE80211_VHT_GROUPS_NB + \
- IEEE80211_HE_GROUPS_NB)
#define IEEE80211_HT_GROUP_0 0
#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB)
@@ -650,12 +645,12 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
struct sta_info *sta = container_of(pubsta, struct sta_info,
sta);
struct ieee80211_rx_status stat;
- struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate;
+ struct ieee80211_tx_rate *tx_rate = &sta->tx_stats.last_rate;
struct rate_info *ri = &sta->tx_stats.last_rate_info;
u32 duration, overhead;
u8 agg_shift;
- if (ieee80211_fill_rx_status(&stat, hw, rate, ri, band, len))
+ if (ieee80211_fill_rx_status(&stat, hw, tx_rate, ri, band, len))
return 0;
if (stat.encoding == RX_ENC_LEGACY || !ampdu)
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 8dbfe325ee66..f4c9a92f50f9 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -634,8 +634,10 @@ static const struct file_operations stats_ ##name## _ops = { \
.llseek = generic_file_llseek, \
};
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
#define DEBUGFS_STATS_ADD(name) \
debugfs_create_u32(#name, 0400, statsd, &local->name);
+#endif
#define DEBUGFS_DEVSTATS_ADD(name) \
debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops);
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index f53dec8a3d5c..edc7792e1361 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -4,6 +4,7 @@
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2021 Intel Corporation
*/
#include <linux/kobject.h>
@@ -22,7 +23,6 @@ static ssize_t key_##name##_read(struct file *file, \
return mac80211_format_buffer(userbuf, count, ppos, \
format_string, key->prop); \
}
-#define KEY_READ_D(name) KEY_READ(name, name, "%d\n")
#define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n")
#define KEY_OPS(name) \
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index db724fc10a5f..e490c3da3aca 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -2,7 +2,7 @@
/*
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2020 Intel Corporation
+ * Copyright (C) 2020-2021 Intel Corporation
*/
#include <linux/kernel.h>
@@ -77,8 +77,6 @@ static ssize_t ieee80211_if_fmt_##name( \
IEEE80211_IF_FMT(name, field, "%#x\n")
#define IEEE80211_IF_FMT_LHEX(name, field) \
IEEE80211_IF_FMT(name, field, "%#lx\n")
-#define IEEE80211_IF_FMT_SIZE(name, field) \
- IEEE80211_IF_FMT(name, field, "%zd\n")
#define IEEE80211_IF_FMT_HEXARRAY(name, field) \
static ssize_t ieee80211_if_fmt_##name( \
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 330ea62231fa..da35791b8378 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2380,7 +2380,7 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
const struct cfg80211_chan_def *chandef);
u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype);
-u8 *ieee80211_ie_build_he_cap(u8 *pos,
+u8 *ieee80211_ie_build_he_cap(u32 disable_flags, u8 *pos,
const struct ieee80211_sta_he_cap *he_cap,
u8 *end);
void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 15ac08d111ea..6847fdf93439 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -580,7 +580,7 @@ int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata,
return -ENOMEM;
pos = skb_put(skb, ie_len);
- ieee80211_ie_build_he_cap(pos, he_cap, pos + ie_len);
+ ieee80211_ie_build_he_cap(0, pos, he_cap, pos + ie_len);
return 0;
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e5ccf17618ab..20b57ddf149c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -635,7 +635,7 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb,
struct ieee80211_supported_band *sband)
{
- u8 *pos;
+ u8 *pos, *pre_he_pos;
const struct ieee80211_sta_he_cap *he_cap = NULL;
struct ieee80211_chanctx_conf *chanctx_conf;
u8 he_cap_size;
@@ -652,16 +652,21 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
he_cap = ieee80211_get_he_iftype_cap(sband,
ieee80211_vif_type_p2p(&sdata->vif));
- if (!he_cap || !reg_cap)
+ if (!he_cap || !chanctx_conf || !reg_cap)
return;
+ /* get a max size estimate */
he_cap_size =
2 + 1 + sizeof(he_cap->he_cap_elem) +
ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) +
ieee80211_he_ppe_size(he_cap->ppe_thres[0],
he_cap->he_cap_elem.phy_cap_info);
pos = skb_put(skb, he_cap_size);
- ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size);
+ pre_he_pos = pos;
+ pos = ieee80211_ie_build_he_cap(sdata->u.mgd.flags,
+ pos, he_cap, pos + he_cap_size);
+ /* trim excess if any */
+ skb_trim(skb, skb->len - (pre_he_pos + he_cap_size - pos));
ieee80211_ie_build_he_6ghz_cap(sdata, skb);
}
@@ -4915,13 +4920,20 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
rcu_read_unlock();
}
-static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
- struct cfg80211_bss *cbss)
+static u8 ieee80211_max_rx_chains(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_bss *cbss)
{
+ struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
const struct element *ht_cap_elem, *vht_cap_elem;
+ const struct cfg80211_bss_ies *ies;
const struct ieee80211_ht_cap *ht_cap;
const struct ieee80211_vht_cap *vht_cap;
+ const struct ieee80211_he_cap_elem *he_cap;
+ const struct element *he_cap_elem;
+ u16 mcs_80_map, mcs_160_map;
+ int i, mcs_nss_size;
+ bool support_160;
u8 chains = 1;
if (ifmgd->flags & IEEE80211_STA_DISABLE_HT)
@@ -4956,10 +4968,156 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
chains = max(chains, nss);
}
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HE)
+ return chains;
+
+ ies = rcu_dereference(cbss->ies);
+ he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY,
+ ies->data, ies->len);
+
+ if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap))
+ return chains;
+
+ /* skip one byte ext_tag_id */
+ he_cap = (void *)(he_cap_elem->data + 1);
+ mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap);
+
+ /* invalid HE IE */
+ if (he_cap_elem->datalen < 1 + mcs_nss_size + sizeof(*he_cap))
+ return chains;
+
+ /* mcs_nss is right after he_cap info */
+ he_mcs_nss_supp = (void *)(he_cap + 1);
+
+ mcs_80_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80);
+
+ for (i = 7; i >= 0; i--) {
+ u8 mcs_80 = mcs_80_map >> (2 * i) & 3;
+
+ if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ chains = max_t(u8, chains, i + 1);
+ break;
+ }
+ }
+
+ support_160 = he_cap->phy_cap_info[0] &
+ IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G;
+
+ if (!support_160)
+ return chains;
+
+ mcs_160_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_160);
+ for (i = 7; i >= 0; i--) {
+ u8 mcs_160 = mcs_160_map >> (2 * i) & 3;
+
+ if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ chains = max_t(u8, chains, i + 1);
+ break;
+ }
+ }
+
return chains;
}
static bool
+ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_bss_ies *ies,
+ const struct ieee80211_he_operation *he_op)
+{
+ const struct element *he_cap_elem;
+ const struct ieee80211_he_cap_elem *he_cap;
+ struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp;
+ u16 mcs_80_map_tx, mcs_80_map_rx;
+ u16 ap_min_req_set;
+ int mcs_nss_size;
+ int nss;
+
+ he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY,
+ ies->data, ies->len);
+
+ /* invalid HE IE */
+ if (!he_cap_elem || he_cap_elem->datalen < 1 + sizeof(*he_cap)) {
+ sdata_info(sdata,
+ "Invalid HE elem, Disable HE\n");
+ return false;
+ }
+
+ /* skip one byte ext_tag_id */
+ he_cap = (void *)(he_cap_elem->data + 1);
+ mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap);
+
+ /* invalid HE IE */
+ if (he_cap_elem->datalen < 1 + sizeof(*he_cap) + mcs_nss_size) {
+ sdata_info(sdata,
+ "Invalid HE elem with nss size, Disable HE\n");
+ return false;
+ }
+
+ /* mcs_nss is right after he_cap info */
+ he_mcs_nss_supp = (void *)(he_cap + 1);
+
+ mcs_80_map_tx = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80);
+ mcs_80_map_rx = le16_to_cpu(he_mcs_nss_supp->rx_mcs_80);
+
+ /* P802.11-REVme/D0.3
+ * 27.1.1 Introduction to the HE PHY
+ * ...
+ * An HE STA shall support the following features:
+ * ...
+ * Single spatial stream HE-MCSs 0 to 7 (transmit and receive) in all
+ * supported channel widths for HE SU PPDUs
+ */
+ if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) {
+ sdata_info(sdata,
+ "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n",
+			   mcs_80_map_rx, mcs_80_map_tx);
+ return false;
+ }
+
+ if (!he_op)
+ return true;
+
+ ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set);
+
+ /* make sure the AP is consistent with itself
+ *
+ * P802.11-REVme/D0.3
+ * 26.17.1 Basic HE BSS operation
+ *
+ * A STA that is operating in an HE BSS shall be able to receive and
+ * transmit at each of the <HE-MCS, NSS> tuple values indicated by the
+ * Basic HE-MCS And NSS Set field of the HE Operation parameter of the
+ * MLME-START.request primitive and shall be able to receive at each of
+ * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and
+ * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and
+ * NSS Set field in the HE Capabilities parameter of the MLME-START.request
+ * primitive
+ */
+ for (nss = 8; nss > 0; nss--) {
+ u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3;
+ u8 ap_rx_val;
+ u8 ap_tx_val;
+
+ if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED)
+ continue;
+
+ ap_rx_val = (mcs_80_map_rx >> (2 * (nss - 1))) & 3;
+ ap_tx_val = (mcs_80_map_tx >> (2 * (nss - 1))) & 3;
+
+ if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) {
+ sdata_info(sdata,
+ "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n",
+ nss, ap_rx_val, ap_rx_val, ap_op_val);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool
ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata,
struct ieee80211_supported_band *sband,
const struct ieee80211_he_operation *he_op)
@@ -5005,7 +5163,15 @@ ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata,
/*
* Make sure the HE AP doesn't require MCSs that aren't
- * supported by the client
+ * supported by the client, as required by the spec
+ *
+ * P802.11-REVme/D0.3
+ * 26.17.1 Basic HE BSS operation
+ *
+ * An HE STA shall not attempt to join (MLME-JOIN.request primitive)
+ * a BSS, unless it supports (i.e., is able to both transmit and
+ * receive using) all of the <HE-MCS, NSS> tuples in the basic
+ * HE-MCS and NSS set.
*/
if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
@@ -5136,7 +5302,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
- if (!ieee80211_verify_sta_he_mcs_support(sdata, sband, he_oper))
+ if (!ieee80211_verify_peer_he_mcs_support(sdata, ies, he_oper) ||
+ !ieee80211_verify_sta_he_mcs_support(sdata, sband, he_oper))
ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
}
@@ -5170,7 +5337,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
s1g_oper,
&chandef, false);
- sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
+ sdata->needed_rx_chains = min(ieee80211_max_rx_chains(sdata, cbss),
local->rx_chains);
rcu_read_unlock();
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 9c3b7fc377c1..9c6ace858107 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org>
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
*/
#include <linux/netdevice.h>
#include <linux/types.h>
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 93680af62c47..d54a4d98c648 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4629,6 +4629,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
/* do the header conversion - first grab the addresses */
ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs);
ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs);
+ skb_postpull_rcsum(skb, skb->data + snap_offs,
+ sizeof(rfc1042_header) + 2);
/* remove the SNAP but leave the ethertype */
skb_pull(skb, snap_offs + sizeof(rfc1042_header));
/* push the addresses in front */
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 537535a88990..91fbb1ee5c38 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -364,8 +364,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
goto free;
sta->mesh->plink_sta = sta;
spin_lock_init(&sta->mesh->plink_lock);
- if (ieee80211_vif_is_mesh(&sdata->vif) &&
- !sdata->u.mesh.user_mpm)
+ if (!sdata->u.mesh.user_mpm)
timer_setup(&sta->mesh->plink_timer, mesh_plink_timer,
0);
sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index f6f63a0b1b72..e81e8a5bb774 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -5,6 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2021-2022 Intel Corporation
*/
#include <linux/export.h>
@@ -628,6 +629,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie;
struct ieee80211_sub_if_data *sdata;
struct ieee80211_hdr *hdr = (void *)skb->data;
+ bool is_valid_ack_signal =
+ !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID);
rcu_read_lock();
sdata = ieee80211_sdata_from_skb(local, skb);
@@ -644,7 +647,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
cfg80211_probe_status(sdata->dev, hdr->addr1,
cookie, acked,
info->status.ack_signal,
- info->status.is_valid_ack_signal,
+ is_valid_ack_signal,
GFP_ATOMIC);
else if (ieee80211_is_mgmt(hdr->frame_control))
cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
@@ -754,7 +757,6 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
*/
#define STA_LOST_PKT_THRESHOLD 50
#define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */
-#define STA_LOST_TDLS_PKT_THRESHOLD 10
#define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */
static void ieee80211_lost_packet(struct sta_info *sta,
@@ -781,7 +783,7 @@ static void ieee80211_lost_packet(struct sta_info *sta,
}
/*
- * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD
+ * If we're in TDLS mode, make sure that all STA_LOST_PKT_THRESHOLD
* of the last packets were lost, and that no ACK was received in the
* last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss
* mechanism.
@@ -1102,7 +1104,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
struct ieee80211_supported_band *sband;
struct sta_info *sta = NULL;
int rates_idx, retry_count;
- bool acked, noack_success;
+ bool acked, noack_success, ack_signal_valid;
u16 tx_time_est;
if (pubsta) {
@@ -1133,6 +1135,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
+ ack_signal_valid =
+ !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID);
if (pubsta) {
struct ieee80211_sub_if_data *sdata = sta->sdata;
@@ -1161,7 +1165,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
unlikely(sdata->u.mgd.probe_send_count > 0))
sdata->u.mgd.probe_send_count = 0;
- if (info->status.is_valid_ack_signal) {
+ if (ack_signal_valid) {
sta->status_stats.last_ack_signal =
(s8)info->status.ack_signal;
sta->status_stats.ack_signal_filled = true;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f71b042a5c8b..abc29df6834c 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1974,7 +1974,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata,
if (he_cap &&
cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band),
IEEE80211_CHAN_NO_HE)) {
- pos = ieee80211_ie_build_he_cap(pos, he_cap, end);
+ pos = ieee80211_ie_build_he_cap(0, pos, he_cap, end);
if (!pos)
goto out_err;
}
@@ -2918,10 +2918,11 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype)
he_cap->he_cap_elem.phy_cap_info);
}
-u8 *ieee80211_ie_build_he_cap(u8 *pos,
+u8 *ieee80211_ie_build_he_cap(u32 disable_flags, u8 *pos,
const struct ieee80211_sta_he_cap *he_cap,
u8 *end)
{
+ struct ieee80211_he_cap_elem elem;
u8 n;
u8 ie_len;
u8 *orig_pos = pos;
@@ -2934,7 +2935,23 @@ u8 *ieee80211_ie_build_he_cap(u8 *pos,
if (!he_cap)
return orig_pos;
- n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem);
+ /* modify on stack first to calculate 'n' and 'ie_len' correctly */
+ elem = he_cap->he_cap_elem;
+
+ if (disable_flags & IEEE80211_STA_DISABLE_40MHZ)
+ elem.phy_cap_info[0] &=
+ ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
+ IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G);
+
+ if (disable_flags & IEEE80211_STA_DISABLE_160MHZ)
+ elem.phy_cap_info[0] &=
+ ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G;
+
+ if (disable_flags & IEEE80211_STA_DISABLE_80P80MHZ)
+ elem.phy_cap_info[0] &=
+ ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G;
+
+ n = ieee80211_he_mcs_nss_size(&elem);
ie_len = 2 + 1 +
sizeof(he_cap->he_cap_elem) + n +
ieee80211_he_ppe_size(he_cap->ppe_thres[0],
@@ -2948,8 +2965,8 @@ u8 *ieee80211_ie_build_he_cap(u8 *pos,
*pos++ = WLAN_EID_EXT_HE_CAPABILITY;
/* Fixed data */
- memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem));
- pos += sizeof(he_cap->he_cap_elem);
+ memcpy(pos, &elem, sizeof(elem));
+ pos += sizeof(elem);
memcpy(pos, &he_cap->he_mcs_nss_supp, n);
pos += n;
@@ -3262,7 +3279,6 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
channel_type = NL80211_CHAN_HT40MINUS;
break;
default:
- channel_type = NL80211_CHAN_NO_HT;
return false;
}
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index e856f9092137..a45dacde96f7 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -501,7 +501,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
for (i = 7; i >= 0; i--) {
u8 mcs_160 = (mcs_160_map >> (2 * i)) & 3;
- if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ if (mcs_160 != IEEE80211_HE_MCS_NOT_SUPPORTED) {
rx_mcs_160 = i + 1;
break;
}
@@ -509,7 +509,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
for (i = 7; i >= 0; i--) {
u8 mcs_80 = (mcs_80_map >> (2 * i)) & 3;
- if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ if (mcs_80 != IEEE80211_HE_MCS_NOT_SUPPORTED) {
rx_mcs_80 = i + 1;
break;
}
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index c921de63b494..f0702d920d8d 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -6,6 +6,7 @@
* Copyright (c) 2021 Google
*/
+#include <linux/compat.h>
#include <linux/if_arp.h>
#include <linux/net.h>
#include <linux/mctp.h>
@@ -21,6 +22,8 @@
/* socket implementation */
+static void mctp_sk_expire_keys(struct timer_list *timer);
+
static int mctp_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -99,13 +102,20 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
struct sk_buff *skb;
if (addr) {
+ const u8 tagbits = MCTP_TAG_MASK | MCTP_TAG_OWNER |
+ MCTP_TAG_PREALLOC;
+
if (addrlen < sizeof(struct sockaddr_mctp))
return -EINVAL;
if (addr->smctp_family != AF_MCTP)
return -EINVAL;
if (!mctp_sockaddr_is_ok(addr))
return -EINVAL;
- if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER))
+ if (addr->smctp_tag & ~tagbits)
+ return -EINVAL;
+ /* can't preallocate a non-owned tag */
+ if (addr->smctp_tag & MCTP_TAG_PREALLOC &&
+ !(addr->smctp_tag & MCTP_TAG_OWNER))
return -EINVAL;
} else {
@@ -248,6 +258,32 @@ out_free:
return rc;
}
+/* We're done with the key; invalidate, stop reassembly, and remove from lists.
+ */
+static void __mctp_key_remove(struct mctp_sk_key *key, struct net *net,
+ unsigned long flags, unsigned long reason)
+__releases(&key->lock)
+__must_hold(&net->mctp.keys_lock)
+{
+ struct sk_buff *skb;
+
+ trace_mctp_key_release(key, reason);
+ skb = key->reasm_head;
+ key->reasm_head = NULL;
+ key->reasm_dead = true;
+ key->valid = false;
+ mctp_dev_release_key(key->dev, key);
+ spin_unlock_irqrestore(&key->lock, flags);
+
+ hlist_del(&key->hlist);
+ hlist_del(&key->sklist);
+
+ /* unref for the lists */
+ mctp_key_unref(key);
+
+ kfree_skb(skb);
+}
+
static int mctp_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -293,6 +329,115 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname,
return -EINVAL;
}
+static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg)
+{
+ struct net *net = sock_net(&msk->sk);
+ struct mctp_sk_key *key = NULL;
+ struct mctp_ioc_tag_ctl ctl;
+ unsigned long flags;
+ u8 tag;
+
+ if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl)))
+ return -EFAULT;
+
+ if (ctl.tag)
+ return -EINVAL;
+
+ if (ctl.flags)
+ return -EINVAL;
+
+ key = mctp_alloc_local_tag(msk, ctl.peer_addr, MCTP_ADDR_ANY,
+ true, &tag);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC;
+ if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) {
+ unsigned long fl2;
+
+ /* __mctp_key_remove() expects the net keys_lock to be held, then the
+ * individual key lock; take both before unwinding the allocation.
+ */
+ spin_lock_irqsave(&net->mctp.keys_lock, flags);
+ spin_lock_irqsave(&key->lock, fl2);
+ __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_DROPPED);
+ spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+ mctp_key_unref(key);
+ return -EFAULT;
+ }
+
+ mctp_key_unref(key);
+ return 0;
+}
+
+static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg)
+{
+ struct net *net = sock_net(&msk->sk);
+ struct mctp_ioc_tag_ctl ctl;
+ unsigned long flags, fl2;
+ struct mctp_sk_key *key;
+ struct hlist_node *tmp;
+ int rc;
+ u8 tag;
+
+ if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl)))
+ return -EFAULT;
+
+ if (ctl.flags)
+ return -EINVAL;
+
+ /* Must be a local tag, TO set, preallocated */
+ if ((ctl.tag & ~MCTP_TAG_MASK) != (MCTP_TAG_OWNER | MCTP_TAG_PREALLOC))
+ return -EINVAL;
+
+ tag = ctl.tag & MCTP_TAG_MASK;
+ rc = -EINVAL;
+
+ spin_lock_irqsave(&net->mctp.keys_lock, flags);
+ hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
+ /* we do an irqsave here, even though we know the irq state,
+ * so we have the flags to pass to __mctp_key_remove
+ */
+ spin_lock_irqsave(&key->lock, fl2);
+ if (key->manual_alloc &&
+ ctl.peer_addr == key->peer_addr &&
+ tag == key->tag) {
+ __mctp_key_remove(key, net, fl2,
+ MCTP_TRACE_KEY_DROPPED);
+ rc = 0;
+ } else {
+ spin_unlock_irqrestore(&key->lock, fl2);
+ }
+ }
+ spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+
+ return rc;
+}
+
+static int mctp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk);
+
+ switch (cmd) {
+ case SIOCMCTPALLOCTAG:
+ return mctp_ioctl_alloctag(msk, arg);
+ case SIOCMCTPDROPTAG:
+ return mctp_ioctl_droptag(msk, arg);
+ }
+
+ return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+static int mctp_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+
+ switch (cmd) {
+ /* These have compatible ptr layouts */
+ case SIOCMCTPALLOCTAG:
+ case SIOCMCTPDROPTAG:
+ return mctp_ioctl(sock, cmd, (unsigned long)argp);
+ }
+
+ return -ENOIOCTLCMD;
+}
+#endif
+
static const struct proto_ops mctp_dgram_ops = {
.family = PF_MCTP,
.release = mctp_release,
@@ -302,7 +447,7 @@ static const struct proto_ops mctp_dgram_ops = {
.accept = sock_no_accept,
.getname = sock_no_getname,
.poll = datagram_poll,
- .ioctl = sock_no_ioctl,
+ .ioctl = mctp_ioctl,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -312,6 +457,9 @@ static const struct proto_ops mctp_dgram_ops = {
.recvmsg = mctp_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = mctp_compat_ioctl,
+#endif
};
static void mctp_sk_expire_keys(struct timer_list *timer)
@@ -319,7 +467,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer)
struct mctp_sock *msk = container_of(timer, struct mctp_sock,
key_expiry);
struct net *net = sock_net(&msk->sk);
- unsigned long next_expiry, flags;
+ unsigned long next_expiry, flags, fl2;
struct mctp_sk_key *key;
struct hlist_node *tmp;
bool next_expiry_valid = false;
@@ -327,15 +475,16 @@ static void mctp_sk_expire_keys(struct timer_list *timer)
spin_lock_irqsave(&net->mctp.keys_lock, flags);
hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
- spin_lock(&key->lock);
+ /* don't expire. manual_alloc is immutable, no locking
+ * required.
+ */
+ if (key->manual_alloc)
+ continue;
+ spin_lock_irqsave(&key->lock, fl2);
if (!time_after_eq(key->expiry, jiffies)) {
- trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT);
- key->valid = false;
- hlist_del_rcu(&key->hlist);
- hlist_del_rcu(&key->sklist);
- spin_unlock(&key->lock);
- mctp_key_unref(key);
+ __mctp_key_remove(key, net, fl2,
+ MCTP_TRACE_KEY_TIMEOUT);
continue;
}
@@ -346,7 +495,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer)
next_expiry = key->expiry;
next_expiry_valid = true;
}
- spin_unlock(&key->lock);
+ spin_unlock_irqrestore(&key->lock, fl2);
}
spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
@@ -387,9 +536,9 @@ static void mctp_sk_unhash(struct sock *sk)
{
struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
struct net *net = sock_net(sk);
+ unsigned long flags, fl2;
struct mctp_sk_key *key;
struct hlist_node *tmp;
- unsigned long flags;
/* remove from any type-based binds */
mutex_lock(&net->mctp.bind_lock);
@@ -399,20 +548,8 @@ static void mctp_sk_unhash(struct sock *sk)
/* remove tag allocations */
spin_lock_irqsave(&net->mctp.keys_lock, flags);
hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) {
- hlist_del(&key->sklist);
- hlist_del(&key->hlist);
-
- trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED);
-
- spin_lock(&key->lock);
- kfree_skb(key->reasm_head);
- key->reasm_head = NULL;
- key->reasm_dead = true;
- key->valid = false;
- spin_unlock(&key->lock);
-
- /* key is no longer on the lookup lists, unref */
- mctp_key_unref(key);
+ spin_lock_irqsave(&key->lock, fl2);
+ __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_CLOSED);
}
spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
}
diff --git a/net/mctp/device.c b/net/mctp/device.c
index ef2755f82f87..02ddc0f1bd3e 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -6,6 +6,7 @@
* Copyright (c) 2021 Google
*/
+#include <linux/if_arp.h>
#include <linux/if_link.h>
#include <linux/mctp.h>
#include <linux/netdevice.h>
diff --git a/net/mctp/route.c b/net/mctp/route.c
index e52cef750500..0c4c56e1bd6e 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -64,8 +64,7 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
if (msk->bind_type != type)
continue;
- if (msk->bind_addr != MCTP_ADDR_ANY &&
- msk->bind_addr != mh->dest)
+ if (!mctp_address_matches(msk->bind_addr, mh->dest))
continue;
return msk;
@@ -77,7 +76,7 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
mctp_eid_t peer, u8 tag)
{
- if (key->local_addr != local)
+ if (!mctp_address_matches(key->local_addr, local))
return false;
if (key->peer_addr != peer)
@@ -204,29 +203,38 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
return rc;
}
-/* We're done with the key; unset valid and remove from lists. There may still
- * be outstanding refs on the key though...
+/* Helper for mctp_route_input().
+ * We're done with the key; unlock and unref the key.
+ * For the usual case of automatic expiry we remove the key from lists.
+ * In the case that manual allocation is set on a key we release the lock
+ * and local ref, reset reassembly, but don't remove from lists.
*/
-static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net,
- unsigned long flags)
- __releases(&key->lock)
+static void __mctp_key_done_in(struct mctp_sk_key *key, struct net *net,
+ unsigned long flags, unsigned long reason)
+__releases(&key->lock)
{
struct sk_buff *skb;
+ trace_mctp_key_release(key, reason);
skb = key->reasm_head;
key->reasm_head = NULL;
- key->reasm_dead = true;
- key->valid = false;
- mctp_dev_release_key(key->dev, key);
+
+ if (!key->manual_alloc) {
+ key->reasm_dead = true;
+ key->valid = false;
+ mctp_dev_release_key(key->dev, key);
+ }
spin_unlock_irqrestore(&key->lock, flags);
- spin_lock_irqsave(&net->mctp.keys_lock, flags);
- hlist_del(&key->hlist);
- hlist_del(&key->sklist);
- spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+ if (!key->manual_alloc) {
+ spin_lock_irqsave(&net->mctp.keys_lock, flags);
+ hlist_del(&key->hlist);
+ hlist_del(&key->sklist);
+ spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
- /* one unref for the lists */
- mctp_key_unref(key);
+ /* unref for the lists */
+ mctp_key_unref(key);
+ }
/* and one for the local reference */
mctp_key_unref(key);
@@ -380,9 +388,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
/* we've hit a pending reassembly; not much we
* can do but drop it
*/
- trace_mctp_key_release(key,
- MCTP_TRACE_KEY_REPLIED);
- __mctp_key_unlock_drop(key, net, f);
+ __mctp_key_done_in(key, net, f,
+ MCTP_TRACE_KEY_REPLIED);
key = NULL;
}
rc = 0;
@@ -425,9 +432,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
} else {
if (key->reasm_head || key->reasm_dead) {
/* duplicate start? drop everything */
- trace_mctp_key_release(key,
- MCTP_TRACE_KEY_INVALIDATED);
- __mctp_key_unlock_drop(key, net, f);
+ __mctp_key_done_in(key, net, f,
+ MCTP_TRACE_KEY_INVALIDATED);
rc = -EEXIST;
key = NULL;
} else {
@@ -450,10 +456,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
* the reassembly/response key
*/
if (!rc && flags & MCTP_HDR_FLAG_EOM) {
+ msk = container_of(key->sk, struct mctp_sock, sk);
sock_queue_rcv_skb(key->sk, key->reasm_head);
key->reasm_head = NULL;
- trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED);
- __mctp_key_unlock_drop(key, net, f);
+ __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED);
key = NULL;
}
@@ -581,9 +587,9 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
/* Allocate a locally-owned tag value for (saddr, daddr), and reserve
* it for the socket msk
*/
-static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
- mctp_eid_t saddr,
- mctp_eid_t daddr, u8 *tagp)
+struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
+ mctp_eid_t daddr, mctp_eid_t saddr,
+ bool manual, u8 *tagp)
{
struct net *net = sock_net(&msk->sk);
struct netns_mctp *mns = &net->mctp;
@@ -617,9 +623,8 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
if (tmp->tag & MCTP_HDR_FLAG_TO)
continue;
- if (!((tmp->peer_addr == daddr ||
- tmp->peer_addr == MCTP_ADDR_ANY) &&
- tmp->local_addr == saddr))
+ if (!(mctp_address_matches(tmp->peer_addr, daddr) &&
+ mctp_address_matches(tmp->local_addr, saddr)))
continue;
spin_lock(&tmp->lock);
@@ -639,6 +644,7 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
mctp_reserve_tag(net, key, msk);
trace_mctp_key_acquire(key);
+ key->manual_alloc = manual;
*tagp = key->tag;
}
@@ -652,6 +658,50 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
return key;
}
+static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
+ mctp_eid_t daddr,
+ u8 req_tag, u8 *tagp)
+{
+ struct net *net = sock_net(&msk->sk);
+ struct netns_mctp *mns = &net->mctp;
+ struct mctp_sk_key *key, *tmp;
+ unsigned long flags;
+
+ req_tag &= ~(MCTP_TAG_PREALLOC | MCTP_TAG_OWNER);
+ key = NULL;
+
+ spin_lock_irqsave(&mns->keys_lock, flags);
+
+ hlist_for_each_entry(tmp, &mns->keys, hlist) {
+ if (tmp->tag != req_tag)
+ continue;
+
+ if (!mctp_address_matches(tmp->peer_addr, daddr))
+ continue;
+
+ if (!tmp->manual_alloc)
+ continue;
+
+ spin_lock(&tmp->lock);
+ if (tmp->valid) {
+ key = tmp;
+ refcount_inc(&key->refs);
+ spin_unlock(&tmp->lock);
+ break;
+ }
+ spin_unlock(&tmp->lock);
+ }
+ spin_unlock_irqrestore(&mns->keys_lock, flags);
+
+ if (!key)
+ return ERR_PTR(-ENOENT);
+
+ if (tagp)
+ *tagp = key->tag;
+
+ return key;
+}
+
/* routing lookups */
static bool mctp_rt_match_eid(struct mctp_route *rt,
unsigned int net, mctp_eid_t eid)
@@ -846,8 +896,14 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
if (rc)
goto out_release;
- if (req_tag & MCTP_HDR_FLAG_TO) {
- key = mctp_alloc_local_tag(msk, saddr, daddr, &tag);
+ if (req_tag & MCTP_TAG_OWNER) {
+ if (req_tag & MCTP_TAG_PREALLOC)
+ key = mctp_lookup_prealloc_tag(msk, daddr,
+ req_tag, &tag);
+ else
+ key = mctp_alloc_local_tag(msk, daddr, saddr,
+ false, &tag);
+
if (IS_ERR(key)) {
rc = PTR_ERR(key);
goto out_release;
@@ -858,7 +914,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
tag |= MCTP_HDR_FLAG_TO;
} else {
key = NULL;
- tag = req_tag;
+ tag = req_tag & MCTP_TAG_MASK;
}
skb->protocol = htons(ETH_P_MCTP);
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index 750f9f9b4daf..61205cf40074 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -369,14 +369,15 @@ static void mctp_test_route_input_sk(struct kunit *test)
#define FL_S (MCTP_HDR_FLAG_SOM)
#define FL_E (MCTP_HDR_FLAG_EOM)
-#define FL_T (MCTP_HDR_FLAG_TO)
+#define FL_TO (MCTP_HDR_FLAG_TO)
+#define FL_T(t) ((t) & MCTP_HDR_TAG_MASK)
static const struct mctp_route_input_sk_test mctp_route_input_sk_tests[] = {
- { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 0, .deliver = true },
- { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 1, .deliver = false },
+ { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO), .type = 0, .deliver = true },
+ { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO), .type = 1, .deliver = false },
{ .hdr = RX_HDR(1, 10, 8, FL_S | FL_E), .type = 0, .deliver = false },
- { .hdr = RX_HDR(1, 10, 8, FL_E | FL_T), .type = 0, .deliver = false },
- { .hdr = RX_HDR(1, 10, 8, FL_T), .type = 0, .deliver = false },
+ { .hdr = RX_HDR(1, 10, 8, FL_E | FL_TO), .type = 0, .deliver = false },
+ { .hdr = RX_HDR(1, 10, 8, FL_TO), .type = 0, .deliver = false },
{ .hdr = RX_HDR(1, 10, 8, 0), .type = 0, .deliver = false },
};
@@ -436,7 +437,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test)
__mctp_route_test_fini(test, dev, rt, sock);
}
-#define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_T | (f) | ((s) << MCTP_HDR_SEQ_SHIFT))
+#define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT))
static const struct mctp_route_input_sk_reasm_test mctp_route_input_sk_reasm_tests[] = {
{
@@ -522,12 +523,156 @@ static void mctp_route_input_sk_reasm_to_desc(
KUNIT_ARRAY_PARAM(mctp_route_input_sk_reasm, mctp_route_input_sk_reasm_tests,
mctp_route_input_sk_reasm_to_desc);
+struct mctp_route_input_sk_keys_test {
+ const char *name;
+ mctp_eid_t key_peer_addr;
+ mctp_eid_t key_local_addr;
+ u8 key_tag;
+ struct mctp_hdr hdr;
+ bool deliver;
+};
+
+/* test packet rx in the presence of various key configurations */
+static void mctp_test_route_input_sk_keys(struct kunit *test)
+{
+ const struct mctp_route_input_sk_keys_test *params;
+ struct mctp_test_route *rt;
+ struct sk_buff *skb, *skb2;
+ struct mctp_test_dev *dev;
+ struct mctp_sk_key *key;
+ struct netns_mctp *mns;
+ struct mctp_sock *msk;
+ struct socket *sock;
+ unsigned long flags;
+ int rc;
+ u8 c;
+
+ params = test->param_value;
+
+ dev = mctp_test_create_dev();
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
+
+ rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt);
+
+ rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock);
+ KUNIT_ASSERT_EQ(test, rc, 0);
+
+ msk = container_of(sock->sk, struct mctp_sock, sk);
+ mns = &sock_net(sock->sk)->mctp;
+
+ /* set the incoming tag according to test params */
+ key = mctp_key_alloc(msk, params->key_local_addr, params->key_peer_addr,
+ params->key_tag, GFP_KERNEL);
+
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, key);
+
+ spin_lock_irqsave(&mns->keys_lock, flags);
+ mctp_reserve_tag(&init_net, key, msk);
+ spin_unlock_irqrestore(&mns->keys_lock, flags);
+
+ /* create packet and route */
+ c = 0;
+ skb = mctp_test_create_skb_data(&params->hdr, &c);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb);
+
+ skb->dev = dev->ndev;
+ __mctp_cb(skb);
+
+ rc = mctp_route_input(&rt->rt, skb);
+
+ /* (potentially) receive message */
+ skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc);
+
+ if (params->deliver)
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2);
+ else
+ KUNIT_EXPECT_PTR_EQ(test, skb2, NULL);
+
+ if (skb2)
+ skb_free_datagram(sock->sk, skb2);
+
+ mctp_key_unref(key);
+ __mctp_route_test_fini(test, dev, rt, sock);
+}
+
+static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = {
+ {
+ .name = "direct match",
+ .key_peer_addr = 9,
+ .key_local_addr = 8,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1)),
+ .deliver = true,
+ },
+ {
+ .name = "flipped src/dest",
+ .key_peer_addr = 8,
+ .key_local_addr = 9,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1)),
+ .deliver = false,
+ },
+ {
+ .name = "peer addr mismatch",
+ .key_peer_addr = 9,
+ .key_local_addr = 8,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T(1)),
+ .deliver = false,
+ },
+ {
+ .name = "tag value mismatch",
+ .key_peer_addr = 9,
+ .key_local_addr = 8,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(2)),
+ .deliver = false,
+ },
+ {
+ .name = "TO mismatch",
+ .key_peer_addr = 9,
+ .key_local_addr = 8,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1) | FL_TO),
+ .deliver = false,
+ },
+ {
+ .name = "broadcast response",
+ .key_peer_addr = MCTP_ADDR_ANY,
+ .key_local_addr = 8,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 11, 8, FL_S | FL_E | FL_T(1)),
+ .deliver = true,
+ },
+ {
+ .name = "any local match",
+ .key_peer_addr = 12,
+ .key_local_addr = MCTP_ADDR_ANY,
+ .key_tag = 1,
+ .hdr = RX_HDR(1, 12, 8, FL_S | FL_E | FL_T(1)),
+ .deliver = true,
+ },
+};
+
+static void mctp_route_input_sk_keys_to_desc(
+ const struct mctp_route_input_sk_keys_test *t,
+ char *desc)
+{
+ sprintf(desc, "%s", t->name);
+}
+
+KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests,
+ mctp_route_input_sk_keys_to_desc);
+
static struct kunit_case mctp_test_cases[] = {
KUNIT_CASE_PARAM(mctp_test_fragment, mctp_frag_gen_params),
KUNIT_CASE_PARAM(mctp_test_rx_input, mctp_rx_input_gen_params),
KUNIT_CASE_PARAM(mctp_test_route_input_sk, mctp_route_input_sk_gen_params),
KUNIT_CASE_PARAM(mctp_test_route_input_sk_reasm,
mctp_route_input_sk_reasm_gen_params),
+ KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys,
+ mctp_route_input_sk_keys_gen_params),
{}
};
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 645dd984fef0..ac10a04ccd7c 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -336,6 +336,8 @@ static void mptcp_parse_option(const struct sk_buff *skb,
flags = *ptr++;
mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
mp_opt->reset_reason = *ptr;
+ pr_debug("MP_RST: transient=%u reason=%u",
+ mp_opt->reset_transient, mp_opt->reset_reason);
break;
case MPTCPOPT_MP_FAIL:
@@ -353,8 +355,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
}
}
-void mptcp_get_options(const struct sock *sk,
- const struct sk_buff *skb,
+void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt)
{
const struct tcphdr *th = tcp_hdr(skb);
@@ -651,7 +652,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
bool drop_other_suboptions = false;
unsigned int opt_size = *size;
bool echo;
- bool port;
int len;
/* add addr will strip the existing options, be sure to avoid breaking
@@ -660,12 +660,12 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
!mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
- &echo, &port, &drop_other_suboptions))
+ &echo, &drop_other_suboptions))
return false;
if (drop_other_suboptions)
remaining += opt_size;
- len = mptcp_add_addr_len(opts->addr.family, echo, port);
+ len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
if (remaining < len)
return false;
@@ -1084,8 +1084,7 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
&mp_opt->addr);
pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
- msk, (unsigned long long)hmac,
- (unsigned long long)mp_opt->ahmac);
+ msk, hmac, mp_opt->ahmac);
return hmac == mp_opt->ahmac;
}
@@ -1112,7 +1111,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true;
}
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
/* The subflow can be in close state only if check_fully_established()
* just sent a reset. If so, tell the caller to ignore the current packet.
@@ -1264,22 +1263,30 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
struct mptcp_out_options *opts)
{
- if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
- const struct sock *ssk = (const struct sock *)tp;
- struct mptcp_subflow_context *subflow;
-
- subflow = mptcp_subflow_ctx(ssk);
- subflow->send_mp_fail = 0;
-
- *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
- TCPOLEN_MPTCP_FAIL,
- 0, 0);
- put_unaligned_be64(opts->fail_seq, ptr);
- ptr += 2;
- }
-
- /* DSS, MPC, MPJ, ADD_ADDR, FASTCLOSE and RST are mutually exclusive,
- * see mptcp_established_options*()
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+
+ /* Which options can be used together?
+ *
+ * X: mutually exclusive
+ * O: often used together
+ * C: can be used together in some cases
+ * P: could be used together but we prefer not to (optimisations)
+ *
+ * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC |
+ * ------|------|------|------|------|------|------|------|------|
+ * MPC |------|------|------|------|------|------|------|------|
+ * MPJ | X |------|------|------|------|------|------|------|
+ * DSS | X | X |------|------|------|------|------|------|
+ * ADD | X | X | P |------|------|------|------|------|
+ * RM | C | C | C | P |------|------|------|------|
+ * PRIO | X | C | C | C | C |------|------|------|
+ * FAIL | X | X | C | X | X | X |------|------|
+ * FC | X | X | X | X | X | X | X |------|
+ * RST | X | X | X | X | X | X | O | O |
+ * ------|------|------|------|------|------|------|------|------|
+ *
+ * The same applies in mptcp_established_options() function.
*/
if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
struct mptcp_ext *mpext = &opts->ext_copy;
@@ -1336,6 +1343,10 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
}
ptr += 1;
}
+
+ /* We might need to add MP_FAIL options in rare cases */
+ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions))
+ goto mp_fail;
} else if (OPTIONS_MPTCP_MPC & opts->suboptions) {
u8 len, flag = MPTCP_CAP_HMAC_SHA256;
@@ -1479,6 +1490,21 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
if (OPTION_MPTCP_RST & opts->suboptions)
goto mp_rst;
return;
+ } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
+mp_fail:
+ /* MP_FAIL is mutually exclusive with others except RST */
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->send_mp_fail = 0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
+ TCPOLEN_MPTCP_FAIL,
+ 0, 0);
+ put_unaligned_be64(opts->fail_seq, ptr);
+ ptr += 2;
+
+ if (OPTION_MPTCP_RST & opts->suboptions)
+ goto mp_rst;
+ return;
} else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
mp_rst:
*ptr++ = mptcp_option(MPTCPOPT_RST,
@@ -1489,9 +1515,6 @@ mp_rst:
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
- const struct sock *ssk = (const struct sock *)tp;
- struct mptcp_subflow_context *subflow;
-
subflow = mptcp_subflow_ctx(ssk);
subflow->send_mp_prio = 0;
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 696b2c4613a7..8755b81896de 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -219,7 +219,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
}
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_data *pm = &msk->pm;
@@ -275,14 +275,15 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
/* path manager helpers */
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
unsigned int opt_size, unsigned int remaining,
struct mptcp_addr_info *addr, bool *echo,
- bool *port, bool *drop_other_suboptions)
+ bool *drop_other_suboptions)
{
int ret = false;
u8 add_addr;
u8 family;
+ bool port;
spin_lock_bh(&msk->pm.lock);
@@ -300,10 +301,10 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
}
*echo = mptcp_pm_should_add_signal_echo(msk);
- *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
+ port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
family = *echo ? msk->pm.remote.family : msk->pm.local.family;
- if (remaining < mptcp_add_addr_len(family, *echo, *port))
+ if (remaining < mptcp_add_addr_len(family, *echo, port))
goto out_unlock;
if (*echo) {
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 356f596e2032..9aeee30e50ba 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -120,7 +120,7 @@ static void remote_address(const struct sock_common *skc,
}
static bool lookup_subflow_by_saddr(const struct list_head *list,
- struct mptcp_addr_info *saddr)
+ const struct mptcp_addr_info *saddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
@@ -138,7 +138,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list,
}
static bool lookup_subflow_by_daddr(const struct list_head *list,
- struct mptcp_addr_info *daddr)
+ const struct mptcp_addr_info *daddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
@@ -157,10 +157,10 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
static struct mptcp_pm_addr_entry *
select_local_address(const struct pm_nl_pernet *pernet,
- struct mptcp_sock *msk)
+ const struct mptcp_sock *msk)
{
+ const struct sock *sk = (const struct sock *)msk;
struct mptcp_pm_addr_entry *entry, *ret = NULL;
- struct sock *sk = (struct sock *)msk;
msk_owned_by_me(msk);
@@ -190,7 +190,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
}
static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
+select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *ret = NULL;
@@ -214,16 +214,16 @@ select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
return ret;
}
-unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
{
- struct pm_nl_pernet *pernet;
+ const struct pm_nl_pernet *pernet;
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ pernet = net_generic(sock_net((const struct sock *)msk), pm_nl_pernet_id);
return READ_ONCE(pernet->add_addr_signal_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
-unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet;
@@ -232,7 +232,7 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk)
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
-unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet;
@@ -241,7 +241,7 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk)
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
-unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet;
@@ -264,8 +264,8 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
}
struct mptcp_pm_add_entry *
-mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
@@ -346,7 +346,7 @@ out:
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr, bool check_id)
+ const struct mptcp_addr_info *addr, bool check_id)
{
struct mptcp_pm_add_entry *entry;
struct sock *sk = (struct sock *)msk;
@@ -364,7 +364,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
}
static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry)
+ const struct mptcp_pm_addr_entry *entry)
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
@@ -410,8 +410,8 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
}
}
-static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr,
- struct mptcp_addr_info *addr)
+static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr,
+ const struct mptcp_addr_info *addr)
{
int i;
@@ -493,9 +493,9 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info,
}
static int
-lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr)
+lookup_id_by_addr(const struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr)
{
- struct mptcp_pm_addr_entry *entry;
+ const struct mptcp_pm_addr_entry *entry;
int ret = -1;
rcu_read_lock();
@@ -1178,14 +1178,8 @@ skip_family:
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
- if (tb[MPTCP_PM_ADDR_ATTR_PORT]) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
- NL_SET_ERR_MSG_ATTR(info->extack, attr,
- "flags must have signal when using port");
- return -EINVAL;
- }
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
- }
return 0;
}
@@ -1231,6 +1225,11 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
+ if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ GENL_SET_ERR_MSG(info, "flags must have signal when using port");
+ return -EINVAL;
+ }
+
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
@@ -1282,7 +1281,7 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
}
static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
@@ -1297,7 +1296,7 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
}
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr,
+ const struct mptcp_addr_info *addr,
bool force)
{
struct mptcp_rm_list list = { .nr = 0 };
@@ -1732,9 +1731,20 @@ fail:
return -EMSGSIZE;
}
-static int mptcp_nl_addr_backup(struct net *net,
- struct mptcp_addr_info *addr,
- u8 bkup)
+static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+
+ list.ids[list.nr++] = addr->id;
+
+ mptcp_pm_nl_rm_subflow_received(msk, &list);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+static int mptcp_nl_set_flags(struct net *net,
+ struct mptcp_addr_info *addr,
+ u8 bkup, u8 changed)
{
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
@@ -1748,7 +1758,10 @@ static int mptcp_nl_addr_backup(struct net *net,
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
- ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+ if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+ if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
+ mptcp_pm_nl_fullmesh(msk, addr);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
@@ -1765,6 +1778,8 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry;
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
+ MPTCP_PM_ADDR_FLAG_FULLMESH;
struct net *net = sock_net(skb->sk);
u8 bkup = 0, lookup_by_id = 0;
int ret;
@@ -1787,15 +1802,18 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
spin_unlock_bh(&pernet->lock);
return -EINVAL;
}
+ if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
+ (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
+ }
- if (bkup)
- entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
- else
- entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ changed = (addr.flags ^ entry->flags) & mask;
+ entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
addr = *entry;
spin_unlock_bh(&pernet->lock);
- mptcp_nl_addr_backup(net, &addr.addr, bkup);
+ mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
return 0;
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 85317ce38e3f..9d0ee6cee07f 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -468,9 +468,7 @@ struct mptcp_subflow_context {
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
const struct inet_connection_sock_af_ops *icsk_af_ops;
- void (*tcp_data_ready)(struct sock *sk);
void (*tcp_state_change)(struct sock *sk);
- void (*tcp_write_space)(struct sock *sk);
void (*tcp_error_report)(struct sock *sk);
struct rcu_head rcu;
@@ -614,9 +612,9 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
{
- sk->sk_data_ready = ctx->tcp_data_ready;
+ sk->sk_data_ready = sock_def_readable;
sk->sk_state_change = ctx->tcp_state_change;
- sk->sk_write_space = ctx->tcp_write_space;
+ sk->sk_write_space = sk_stream_write_space;
sk->sk_error_report = ctx->tcp_error_report;
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
@@ -643,8 +641,7 @@ int __init mptcp_proto_v6_init(void);
struct sock *mptcp_sk_clone(const struct sock *sk,
const struct mptcp_options_received *mp_opt,
struct request_sock *req);
-void mptcp_get_options(const struct sock *sk,
- const struct sk_buff *skb,
+void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
@@ -743,7 +740,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr);
+ const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
@@ -754,10 +751,10 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr, bool check_id);
+ const struct mptcp_addr_info *addr, bool check_id);
struct mptcp_pm_add_entry *
-mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr);
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
u8 *flags, int *ifindex);
@@ -816,10 +813,10 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
}
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
unsigned int opt_size, unsigned int remaining,
struct mptcp_addr_info *addr, bool *echo,
- bool *port, bool *drop_other_suboptions);
+ bool *drop_other_suboptions);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
@@ -830,10 +827,10 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
-unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index dacf3cee0027..f949d22f52bd 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -343,6 +343,8 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_RCVLOWAT:
case SO_RCVTIMEO_OLD:
case SO_RCVTIMEO_NEW:
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
case SO_BUSY_POLL:
case SO_PREFER_BUSY_POLL:
case SO_BUSY_POLL_BUDGET:
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index bea47a1180dc..45c004f87f5a 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -153,7 +153,7 @@ static int subflow_check_req(struct request_sock *req,
return -EINVAL;
#endif
- mptcp_get_options(sk_listener, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
@@ -250,7 +250,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
int err;
subflow_init_req(req, sk_listener);
- mptcp_get_options(sk_listener, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
@@ -344,9 +344,7 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
thmac = get_unaligned_be64(hmac);
pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
- subflow, subflow->token,
- (unsigned long long)thmac,
- (unsigned long long)subflow->thmac);
+ subflow, subflow->token, thmac, subflow->thmac);
return thmac == subflow->thmac;
}
@@ -410,7 +408,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp) {
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
MPTCP_INC_STATS(sock_net(sk),
@@ -484,8 +482,7 @@ do_reset:
}
struct request_sock_ops mptcp_subflow_request_sock_ops;
-EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
-static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
@@ -506,9 +503,9 @@ drop:
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
-static struct inet_connection_sock_af_ops subflow_v6_specific;
-static struct inet_connection_sock_af_ops subflow_v6m_specific;
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
+static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
+static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -663,7 +660,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
* reordered MPC will cause fallback, but we don't have other
* options.
*/
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
fallback = true;
goto create_child;
@@ -673,7 +670,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
if (!new_msk)
fallback = true;
} else if (subflow_req->mp_join) {
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) ||
!subflow_hmac_valid(req, &mp_opt) ||
!mptcp_can_accept_new_subflow(subflow_req->msk)) {
@@ -790,7 +787,7 @@ dispose_child:
return child;
}
-static struct inet_connection_sock_af_ops subflow_specific;
+static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
static struct proto tcp_prot_override;
enum mapping_status {
@@ -1311,7 +1308,7 @@ static void subflow_write_space(struct sock *ssk)
mptcp_write_space(sk);
}
-static struct inet_connection_sock_af_ops *
+static const struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1326,7 +1323,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_connection_sock_af_ops *target;
+ const struct inet_connection_sock_af_ops *target;
target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
@@ -1657,10 +1654,12 @@ static int subflow_ulp_init(struct sock *sk)
tp->is_mptcp = 1;
ctx->icsk_af_ops = icsk->icsk_af_ops;
icsk->icsk_af_ops = subflow_default_af_ops(sk);
- ctx->tcp_data_ready = sk->sk_data_ready;
ctx->tcp_state_change = sk->sk_state_change;
- ctx->tcp_write_space = sk->sk_write_space;
ctx->tcp_error_report = sk->sk_error_report;
+
+ WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
+ WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);
+
sk->sk_data_ready = subflow_data_ready;
sk->sk_write_space = subflow_write_space;
sk->sk_state_change = subflow_state_change;
@@ -1715,9 +1714,7 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
- new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
- new_ctx->tcp_write_space = old_ctx->tcp_write_space;
new_ctx->tcp_error_report = old_ctx->tcp_error_report;
new_ctx->rel_write_seq = 1;
new_ctx->tcp_sock = newsk;
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a135b1a46014..238b6a620e88 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -14,6 +14,11 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
obj-$(CONFIG_NETFILTER) = netfilter.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 354cb472f386..d1c9dfbb11fa 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -621,7 +621,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
case NF_ACCEPT:
break;
case NF_DROP:
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_NETFILTER_DROP);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 91bc8df3e4b0..385a5f458aba 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -22,26 +22,7 @@ static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
-static const struct nf_ct_ext_type acct_extend = {
- .len = sizeof(struct nf_conn_acct),
- .align = __alignof__(struct nf_conn_acct),
- .id = NF_CT_EXT_ACCT,
-};
-
void nf_conntrack_acct_pernet_init(struct net *net)
{
net->ct.sysctl_acct = nf_ct_acct;
}
-
-int nf_conntrack_acct_init(void)
-{
- int ret = nf_ct_extend_register(&acct_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_acct_fini(void)
-{
- nf_ct_extend_unregister(&acct_extend);
-}
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644
index 000000000000..8ad3f52579f3
--- /dev/null
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx (xdp_md, __sk_buff)
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - Passed NULL for bpf_tuple pointer
+ * -EINVAL - opts->reserved is not 0
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
+ * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ * -ENONET - No network namespace found for netns_id
+ * -ENOENT - Conntrack lookup could not find entry for tuple
+ * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ * or sizeof(tuple->ipv6)
+ * @l4proto - Layer 4 protocol
+ * Values:
+ * IPPROTO_TCP, IPPROTO_UDP
+ * @reserved - Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ */
+struct bpf_ct_opts {
+ s32 netns_id;
+ s32 error;
+ u8 l4proto;
+ u8 reserved[3];
+};
+
+enum {
+ NF_BPF_CT_OPTS_SZ = 12,
+};
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+ struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, u8 protonum,
+ s32 netns_id)
+{
+ struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple tuple;
+
+ if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+ return ERR_PTR(-EPROTO);
+ if (unlikely(netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ memset(&tuple, 0, sizeof(tuple));
+ switch (tuple_len) {
+ case sizeof(bpf_tuple->ipv4):
+ tuple.src.l3num = AF_INET;
+ tuple.src.u3.ip = bpf_tuple->ipv4.saddr;
+ tuple.src.u.tcp.port = bpf_tuple->ipv4.sport;
+ tuple.dst.u3.ip = bpf_tuple->ipv4.daddr;
+ tuple.dst.u.tcp.port = bpf_tuple->ipv4.dport;
+ break;
+ case sizeof(bpf_tuple->ipv6):
+ tuple.src.l3num = AF_INET6;
+ memcpy(tuple.src.u3.ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+ tuple.src.u.tcp.port = bpf_tuple->ipv6.sport;
+ memcpy(tuple.dst.u3.ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+ tuple.dst.u.tcp.port = bpf_tuple->ipv6.dport;
+ break;
+ default:
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ tuple.dst.protonum = protonum;
+
+ if (netns_id >= 0) {
+ net = get_net_ns_by_id(net, netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+ if (netns_id >= 0)
+ put_net(net);
+ if (!hash)
+ return ERR_PTR(-ENOENT);
+ return nf_ct_tuplehash_to_ctrack(hash);
+}
+
+__diag_push();
+__diag_ignore(GCC, 8, "-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_conntrack BTF");
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+ if (!opts)
+ return NULL;
+ if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+ caller_net = dev_net(ctx->rxq->dev);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
+ opts->netns_id);
+ if (IS_ERR(nfct)) {
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ BUILD_BUG_ON(sizeof(struct bpf_ct_opts) != NF_BPF_CT_OPTS_SZ);
+
+ if (!opts)
+ return NULL;
+ if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+ caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto,
+ opts->netns_id);
+ if (IS_ERR(nfct)) {
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+void bpf_ct_release(struct nf_conn *nfct)
+{
+ if (!nfct)
+ return;
+ nf_ct_put(nfct);
+}
+
+__diag_pop()
+
+BTF_SET_START(nf_ct_xdp_check_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_xdp_check_kfunc_ids)
+
+BTF_SET_START(nf_ct_tc_check_kfunc_ids)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_tc_check_kfunc_ids)
+
+BTF_SET_START(nf_ct_acquire_kfunc_ids)
+BTF_ID(func, bpf_xdp_ct_lookup)
+BTF_ID(func, bpf_skb_ct_lookup)
+BTF_SET_END(nf_ct_acquire_kfunc_ids)
+
+BTF_SET_START(nf_ct_release_kfunc_ids)
+BTF_ID(func, bpf_ct_release)
+BTF_SET_END(nf_ct_release_kfunc_ids)
+
+/* Both sets are identical */
+#define nf_ct_ret_null_kfunc_ids nf_ct_acquire_kfunc_ids
+
+static const struct btf_kfunc_id_set nf_conntrack_xdp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &nf_ct_xdp_check_kfunc_ids,
+ .acquire_set = &nf_ct_acquire_kfunc_ids,
+ .release_set = &nf_ct_release_kfunc_ids,
+ .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+};
+
+static const struct btf_kfunc_id_set nf_conntrack_tc_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &nf_ct_tc_check_kfunc_ids,
+ .acquire_set = &nf_ct_acquire_kfunc_ids,
+ .release_set = &nf_ct_release_kfunc_ids,
+ .ret_null_set = &nf_ct_ret_null_kfunc_ids,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_xdp_kfunc_set);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_tc_kfunc_set);
+}
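A minimal, hypothetical BPF-side sketch of the contract the kfunc comments above describe, assuming the usual libbpf/vmlinux.h workflow; the local options struct, section name and addresses are illustrative only and not part of this patch:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Local 12-byte mirror of the kernel's bpf_ct_opts (NF_BPF_CT_OPTS_SZ). */
struct bpf_ct_opts___local {
	__s32 netns_id;
	__s32 error;
	__u8 l4proto;
	__u8 reserved[3];
};

/* kfuncs resolved against kernel BTF at load time. */
extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
					 struct bpf_sock_tuple *bpf_tuple,
					 __u32 tuple__sz,
					 struct bpf_ct_opts___local *opts,
					 __u32 opts__sz) __ksym;
extern void bpf_ct_release(struct nf_conn *ct) __ksym;

SEC("xdp")
int ct_lookup_sketch(struct xdp_md *ctx)
{
	struct bpf_ct_opts___local opts = {
		.netns_id = -1,	/* BPF_F_CURRENT_NETNS */
		.l4proto  = 6,	/* IPPROTO_TCP */
	};
	struct bpf_sock_tuple tup = {};
	struct nf_conn *ct;

	tup.ipv4.saddr = bpf_htonl(0x0a000001);	/* 10.0.0.1, made up */
	tup.ipv4.daddr = bpf_htonl(0x0a000002);	/* 10.0.0.2, made up */
	tup.ipv4.sport = bpf_htons(12345);
	tup.ipv4.dport = bpf_htons(80);

	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (ct)
		bpf_ct_release(ct);	/* acquired references must be released */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";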
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d6aa5b47031e..9b7f9c966f73 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -34,10 +34,10 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -47,7 +47,6 @@
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_act_ct.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
@@ -594,7 +593,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
- nf_ct_ext_destroy(tmpl);
+ kfree(tmpl->ext);
if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -1597,7 +1596,17 @@ void nf_conntrack_free(struct nf_conn *ct)
*/
WARN_ON(refcount_read(&ct->ct_general.use) != 0);
- nf_ct_ext_destroy(ct);
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ const struct nf_nat_hook *nat_hook;
+
+ rcu_read_lock();
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (nat_hook)
+ nat_hook->remove_nat_bysrc(ct);
+ rcu_read_unlock();
+ }
+
+ kfree(ct->ext);
kmem_cache_free(nf_conntrack_cachep, ct);
cnet = nf_ct_pernet(net);
@@ -2467,13 +2476,7 @@ void nf_conntrack_cleanup_end(void)
kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
- nf_conntrack_seqadj_fini();
- nf_conntrack_labels_fini();
nf_conntrack_helper_fini();
- nf_conntrack_timeout_fini();
- nf_conntrack_ecache_fini();
- nf_conntrack_tstamp_fini();
- nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2628,39 +2631,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return nf_conntrack_hash_resize(hashsize);
}
-static __always_inline unsigned int total_extension_size(void)
-{
- /* remember to add new extensions below */
- BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
-
- return sizeof(struct nf_ct_ext) +
- sizeof(struct nf_conn_help)
-#if IS_ENABLED(CONFIG_NF_NAT)
- + sizeof(struct nf_conn_nat)
-#endif
- + sizeof(struct nf_conn_seqadj)
- + sizeof(struct nf_conn_acct)
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- + sizeof(struct nf_conntrack_ecache)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- + sizeof(struct nf_conn_tstamp)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- + sizeof(struct nf_conn_timeout)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
- + sizeof(struct nf_conn_labels)
-#endif
-#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
- + sizeof(struct nf_conn_synproxy)
-#endif
-#if IS_ENABLED(CONFIG_NET_ACT_CT)
- + sizeof(struct nf_conn_act_ct_ext)
-#endif
- ;
-};
-
int nf_conntrack_init_start(void)
{
unsigned long nr_pages = totalram_pages();
@@ -2668,9 +2638,6 @@ int nf_conntrack_init_start(void)
int ret = -ENOMEM;
int i;
- /* struct nf_ct_ext uses u8 to store offsets/size */
- BUILD_BUG_ON(total_extension_size() > 255u);
-
seqcount_spinlock_init(&nf_conntrack_generation,
&nf_conntrack_locks_all_lock);
@@ -2715,34 +2682,10 @@ int nf_conntrack_init_start(void)
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_init();
- if (ret < 0)
- goto err_acct;
-
- ret = nf_conntrack_tstamp_init();
- if (ret < 0)
- goto err_tstamp;
-
- ret = nf_conntrack_ecache_init();
- if (ret < 0)
- goto err_ecache;
-
- ret = nf_conntrack_timeout_init();
- if (ret < 0)
- goto err_timeout;
-
ret = nf_conntrack_helper_init();
if (ret < 0)
goto err_helper;
- ret = nf_conntrack_labels_init();
- if (ret < 0)
- goto err_labels;
-
- ret = nf_conntrack_seqadj_init();
- if (ret < 0)
- goto err_seqadj;
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -2750,23 +2693,18 @@ int nf_conntrack_init_start(void)
conntrack_gc_work_init(&conntrack_gc_work);
queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
+ ret = register_nf_conntrack_bpf();
+ if (ret < 0)
+ goto err_kfunc;
+
return 0;
+err_kfunc:
+ cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+ nf_conntrack_proto_fini();
err_proto:
- nf_conntrack_seqadj_fini();
-err_seqadj:
- nf_conntrack_labels_fini();
-err_labels:
nf_conntrack_helper_fini();
err_helper:
- nf_conntrack_timeout_fini();
-err_timeout:
- nf_conntrack_ecache_fini();
-err_ecache:
- nf_conntrack_tstamp_fini();
-err_tstamp:
- nf_conntrack_acct_fini();
-err_acct:
nf_conntrack_expect_fini();
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 41768ff19464..07e65b4e92f8 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -131,13 +131,13 @@ static void ecache_work(struct work_struct *work)
}
static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
- const unsigned int events,
- const unsigned long missed,
+ const u32 events,
+ const u32 missed,
const struct nf_ct_event *item)
{
- struct nf_conn *ct = item->ct;
struct net *net = nf_ct_net(item->ct);
struct nf_ct_event_notifier *notify;
+ u32 old, want;
int ret;
if (!((events | missed) & e->ctmask))
@@ -157,12 +157,13 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
if (likely(ret >= 0 && missed == 0))
return 0;
- spin_lock_bh(&ct->lock);
- if (ret < 0)
- e->missed |= events;
- else
- e->missed &= ~missed;
- spin_unlock_bh(&ct->lock);
+ do {
+ old = READ_ONCE(e->missed);
+ if (ret < 0)
+ want = old | events;
+ else
+ want = old & ~missed;
+ } while (cmpxchg(&e->missed, old, want) != old);
return ret;
}
@@ -172,7 +173,7 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
{
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- unsigned long missed;
+ unsigned int missed;
int ret;
if (!nf_ct_is_confirmed(ct))
@@ -211,7 +212,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- unsigned long events;
+ unsigned int events;
if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
return;
@@ -304,12 +305,6 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
-static const struct nf_ct_ext_type event_extend = {
- .len = sizeof(struct nf_conntrack_ecache),
- .align = __alignof__(struct nf_conntrack_ecache),
- .id = NF_CT_EXT_ECACHE,
-};
-
void nf_conntrack_ecache_pernet_init(struct net *net)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
@@ -317,6 +312,8 @@ void nf_conntrack_ecache_pernet_init(struct net *net)
net->ct.sysctl_events = nf_ct_events;
cnet->ct_net = &net->ct;
INIT_DELAYED_WORK(&cnet->ecache_dwork, ecache_work);
+
+ BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
void nf_conntrack_ecache_pernet_fini(struct net *net)
@@ -325,19 +322,3 @@ void nf_conntrack_ecache_pernet_fini(struct net *net)
cancel_delayed_work_sync(&cnet->ecache_dwork);
}
-
-int nf_conntrack_ecache_init(void)
-{
- int ret = nf_ct_extend_register(&event_extend);
- if (ret < 0)
- pr_err("Unable to register event extension\n");
-
- BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */
-
- return ret;
-}
-
-void nf_conntrack_ecache_fini(void)
-{
- nf_ct_extend_unregister(&event_extend);
-}
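The ecache hunk above replaces the ct->lock critical section around e->missed with a lock-free read-modify-write. A standalone sketch of the same retry pattern, using C11 atomics in place of the kernel's cmpxchg(), purely for illustration:

#include <stdatomic.h>

/* On failed delivery (ret < 0) the undelivered events are OR-ed into the
 * missed mask; on success the previously-missed bits are cleared.  The CAS
 * loop retries if another CPU updated the word in the meantime.
 */
static void update_missed(_Atomic unsigned int *missed,
			  unsigned int events, unsigned int cleared, int ret)
{
	unsigned int old = atomic_load(missed);
	unsigned int want;

	do {
		want = (ret < 0) ? (old | events) : (old & ~cleared);
	} while (!atomic_compare_exchange_weak(missed, &old, want));
}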
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 3dbe2329c3f1..1296fda54ac6 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -13,40 +13,90 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
-static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_nat.h>
+
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
-void nf_ct_ext_destroy(struct nf_conn *ct)
+static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
+ [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
+#if IS_ENABLED(CONFIG_NF_NAT)
+ [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
+#endif
+ [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
+ [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
+#endif
+};
+
+static __always_inline unsigned int total_extension_size(void)
{
- unsigned int i;
- struct nf_ct_ext_type *t;
-
- for (i = 0; i < NF_CT_EXT_NUM; i++) {
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[i]);
-
- /* Here the nf_ct_ext_type might have been unregisterd.
- * I.e., it has responsible to cleanup private
- * area in all conntracks when it is unregisterd.
- */
- if (t && t->destroy)
- t->destroy(ct);
- rcu_read_unlock();
- }
-
- kfree(ct->ext);
+ /* remember to add new extensions below */
+ BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
+
+ return sizeof(struct nf_ct_ext) +
+ sizeof(struct nf_conn_help)
+#if IS_ENABLED(CONFIG_NF_NAT)
+ + sizeof(struct nf_conn_nat)
+#endif
+ + sizeof(struct nf_conn_seqadj)
+ + sizeof(struct nf_conn_acct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ + sizeof(struct nf_conntrack_ecache)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + sizeof(struct nf_conn_tstamp)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ + sizeof(struct nf_conn_timeout)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ + sizeof(struct nf_conn_labels)
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ + sizeof(struct nf_conn_act_ct_ext)
+#endif
+ ;
}
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int newlen, newoff, oldlen, alloc;
- struct nf_ct_ext_type *t;
struct nf_ct_ext *new;
/* Conntrack must not be confirmed to avoid races on reallocation. */
WARN_ON(nf_ct_is_confirmed(ct));
+ /* struct nf_ct_ext uses u8 to store offsets/size */
+ BUILD_BUG_ON(total_extension_size() > 255u);
if (ct->ext) {
const struct nf_ct_ext *old = ct->ext;
@@ -58,16 +108,8 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
oldlen = sizeof(*new);
}
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[id]);
- if (!t) {
- rcu_read_unlock();
- return NULL;
- }
-
- newoff = ALIGN(oldlen, t->align);
- newlen = newoff + t->len;
- rcu_read_unlock();
+ newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext));
+ newlen = newoff + nf_ct_ext_type_len[id];
alloc = max(newlen, NF_CT_EXT_PREALLOC);
new = krealloc(ct->ext, alloc, gfp);
@@ -85,31 +127,3 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
return (void *)new + newoff;
}
EXPORT_SYMBOL(nf_ct_ext_add);
-
-/* This MUST be called in process context. */
-int nf_ct_extend_register(const struct nf_ct_ext_type *type)
-{
- int ret = 0;
-
- mutex_lock(&nf_ct_ext_type_mutex);
- if (nf_ct_ext_types[type->id]) {
- ret = -EBUSY;
- goto out;
- }
-
- rcu_assign_pointer(nf_ct_ext_types[type->id], type);
-out:
- mutex_unlock(&nf_ct_ext_type_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_extend_register);
-
-/* This MUST be called in process context. */
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type)
-{
- mutex_lock(&nf_ct_ext_type_mutex);
- RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
- mutex_unlock(&nf_ct_ext_type_mutex);
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
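With the registration API gone, nf_ct_ext_add() now derives the new extension offset from the static size table and a fixed alignment. A tiny worked example of that arithmetic, with made-up numbers standing in for the kernel values:

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int oldlen = 42;	/* hypothetical current end of ct->ext */
	unsigned int align  = 8;	/* stand-in for __alignof__(struct nf_ct_ext) */
	unsigned int len    = 16;	/* stand-in for nf_ct_ext_type_len[id] */
	unsigned int newoff = ALIGN_UP(oldlen, align);	/* 42 -> 48 */
	unsigned int newlen = newoff + len;		/* 48 + 16 = 64 */

	printf("newoff=%u newlen=%u\n", newoff, newlen);
	return 0;
}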
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index ae4488a13c70..a97ddb1497aa 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -550,12 +550,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
}
EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
-static const struct nf_ct_ext_type helper_extend = {
- .len = sizeof(struct nf_conn_help),
- .align = __alignof__(struct nf_conn_help),
- .id = NF_CT_EXT_HELPER,
-};
-
void nf_conntrack_helper_pernet_init(struct net *net)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
@@ -565,28 +559,17 @@ void nf_conntrack_helper_pernet_init(struct net *net)
int nf_conntrack_helper_init(void)
{
- int ret;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
nf_ct_helper_hash =
nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
- ret = nf_ct_extend_register(&helper_extend);
- if (ret < 0) {
- pr_err("nf_ct_helper: Unable to register helper extension.\n");
- goto out_extend;
- }
-
INIT_LIST_HEAD(&nf_ct_nat_helpers);
return 0;
-out_extend:
- kvfree(nf_ct_helper_hash);
- return ret;
}
void nf_conntrack_helper_fini(void)
{
- nf_ct_extend_unregister(&helper_extend);
kvfree(nf_ct_helper_hash);
}
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 522792556632..6e70e137a0a6 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -67,6 +67,8 @@ int nf_connlabels_get(struct net *net, unsigned int bits)
net->ct.labels_used++;
spin_unlock(&nf_connlabels_lock);
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_get);
@@ -78,21 +80,3 @@ void nf_connlabels_put(struct net *net)
spin_unlock(&nf_connlabels_lock);
}
EXPORT_SYMBOL_GPL(nf_connlabels_put);
-
-static const struct nf_ct_ext_type labels_extend = {
- .len = sizeof(struct nf_conn_labels),
- .align = __alignof__(struct nf_conn_labels),
- .id = NF_CT_EXT_LABELS,
-};
-
-int nf_conntrack_labels_init(void)
-{
- BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
-
- return nf_ct_extend_register(&labels_extend);
-}
-
-void nf_conntrack_labels_fini(void)
-{
- nf_ct_extend_unregister(&labels_extend);
-}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7032402ffd33..1ea2ad732d57 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -58,6 +58,12 @@
MODULE_LICENSE("GPL");
+struct ctnetlink_list_dump_ctx {
+ struct nf_conn *last;
+ unsigned int cpu;
+ bool done;
+};
+
static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l4proto *l4proto)
@@ -1694,14 +1700,18 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
static int ctnetlink_done_list(struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+
+ if (ctx->last)
+ nf_ct_put(ctx->last);
+
return 0;
}
static int
ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
{
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
struct nf_conn *ct, *last;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
@@ -1712,12 +1722,12 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying
struct hlist_nulls_head *list;
struct net *net = sock_net(skb->sk);
- if (cb->args[2])
+ if (ctx->done)
return 0;
- last = (struct nf_conn *)cb->args[1];
+ last = ctx->last;
- for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ for (cpu = ctx->cpu; cpu < nr_cpu_ids; cpu++) {
struct ct_pcpu *pcpu;
if (!cpu_possible(cpu))
@@ -1731,10 +1741,10 @@ restart:
ct = nf_ct_tuplehash_to_ctrack(h);
if (l3proto && nf_ct_l3num(ct) != l3proto)
continue;
- if (cb->args[1]) {
+ if (ctx->last) {
if (ct != last)
continue;
- cb->args[1] = 0;
+ ctx->last = NULL;
}
/* We can't dump extension info for the unconfirmed
@@ -1751,19 +1761,19 @@ restart:
if (res < 0) {
if (!refcount_inc_not_zero(&ct->ct_general.use))
continue;
- cb->args[0] = cpu;
- cb->args[1] = (unsigned long)ct;
+ ctx->cpu = cpu;
+ ctx->last = ct;
spin_unlock_bh(&pcpu->lock);
goto out;
}
}
- if (cb->args[1]) {
- cb->args[1] = 0;
+ if (ctx->last) {
+ ctx->last = NULL;
goto restart;
}
spin_unlock_bh(&pcpu->lock);
}
- cb->args[2] = 1;
+ ctx->done = true;
out:
if (last)
nf_ct_put(last);
@@ -3878,6 +3888,8 @@ static int __init ctnetlink_init(void)
{
int ret;
+ BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx));
+
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 7d5708b92138..f3fa367b455f 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -45,30 +45,8 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
static DEFINE_SPINLOCK(nf_pptp_lock);
-int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
-
-int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
-
-void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
- struct nf_conntrack_expect *expect_reply)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
-
-void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
- struct nf_conntrack_expect *exp) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+const struct nf_nat_pptp_hook *nf_nat_pptp_hook;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
/* PptpControlMessageType names */
@@ -111,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name);
static void pptp_expectfn(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ const struct nf_nat_pptp_hook *hook;
struct net *net = nf_ct_net(ct);
- typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
pr_debug("increasing timeouts\n");
/* increase timeout of GRE data channel conntrack entry */
@@ -122,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct,
/* Can you see how rusty this code is, compared with the pre-2.6.11
* one? That's what happened to my shiny newnat of 2002 ;( -HW */
- nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
- if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
- nf_nat_pptp_expectfn(ct, exp);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->master->status & IPS_NAT_MASK)
+ hook->expectfn(ct, exp);
else {
struct nf_conntrack_tuple inv_t;
struct nf_conntrack_expect *exp_other;
@@ -209,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
{
struct nf_conntrack_expect *exp_orig, *exp_reply;
+ const struct nf_nat_pptp_hook *hook;
enum ip_conntrack_dir dir;
int ret = 1;
- typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
exp_orig = nf_ct_expect_alloc(ct);
if (exp_orig == NULL)
@@ -239,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
IPPROTO_GRE, &callid, &peer_callid);
exp_reply->expectfn = pptp_expectfn;
- nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
- if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
- nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ hook->exp_gre(exp_orig, exp_reply);
if (nf_ct_expect_related(exp_orig, 0) != 0)
goto out_put_both;
if (nf_ct_expect_related(exp_reply, 0) != 0)
@@ -279,9 +257,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
msg = ntohs(ctlh->messageType);
pr_debug("inbound control message %s\n", pptp_msg_name(msg));
@@ -383,10 +361,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
- if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_inbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
@@ -407,9 +384,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
msg = ntohs(ctlh->messageType);
pr_debug("outbound control message %s\n", pptp_msg_name(msg));
@@ -479,10 +456,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
- if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_outbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
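The four separate PPTP NAT hook pointers are folded into one callback structure; its definition lives in include/linux/netfilter/nf_conntrack_pptp.h, which is outside this diffstat. Inferred from the removed declarations and the new call sites, it presumably looks roughly like this sketch (member order is an assumption):

/* Sketch only - the authoritative definition is in the header. */
struct nf_nat_pptp_hook {
	int  (*outbound)(struct sk_buff *skb, struct nf_conn *ct,
			 enum ip_conntrack_info ctinfo, unsigned int protoff,
			 struct PptpControlHeader *ctlh,
			 union pptp_ctrl_union *pptpReq);
	int  (*inbound)(struct sk_buff *skb, struct nf_conn *ct,
			enum ip_conntrack_info ctinfo, unsigned int protoff,
			struct PptpControlHeader *ctlh,
			union pptp_ctrl_union *pptpReq);
	void (*exp_gre)(struct nf_conntrack_expect *expect_orig,
			struct nf_conntrack_expect *expect_reply);
	void (*expectfn)(struct nf_conn *ct, struct nf_conntrack_expect *exp);
};

/* The NAT module would then publish a single instance, e.g.
 *	RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_nat_hooks);
 * mirroring how nf_ct_timeout_hook is wired up later in this series.
 */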
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3b516cffc779..12f793d8fe0c 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -63,8 +63,10 @@ static bool udp_error(struct sk_buff *skb,
}
/* Packet with no checksum */
- if (!hdr->check)
+ if (!hdr->check) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return false;
+ }
/* Checksum invalid? Ignore.
* We skip checking packets on the outgoing path
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 3066449f8bd8..7ab2b25b57bc 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -232,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct,
this_way->offset_after : this_way->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
-
-static const struct nf_ct_ext_type nf_ct_seqadj_extend = {
- .len = sizeof(struct nf_conn_seqadj),
- .align = __alignof__(struct nf_conn_seqadj),
- .id = NF_CT_EXT_SEQADJ,
-};
-
-int nf_conntrack_seqadj_init(void)
-{
- return nf_ct_extend_register(&nf_ct_seqadj_extend);
-}
-
-void nf_conntrack_seqadj_fini(void)
-{
- nf_ct_extend_unregister(&nf_ct_seqadj_extend);
-}
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 14387e0b8008..cec166ecba77 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -22,12 +22,8 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-struct nf_ct_timeout *
-(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-
-void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+const struct nf_ct_timeout_hooks *nf_ct_timeout_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
static int untimeout(struct nf_conn *ct, void *timeout)
{
@@ -48,31 +44,30 @@ EXPORT_SYMBOL_GPL(nf_ct_untimeout);
static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout)
{
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook);
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
- if (timeout_put)
- timeout_put(timeout);
+ if (h)
+ h->timeout_put(timeout);
}
int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
u8 l3num, u8 l4num, const char *timeout_name)
{
- typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ const struct nf_ct_timeout_hooks *h;
struct nf_ct_timeout *timeout;
struct nf_conn_timeout *timeout_ext;
const char *errmsg = NULL;
int ret = 0;
rcu_read_lock();
- timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
- if (!timeout_find_get) {
+ h = rcu_dereference(nf_ct_timeout_hook);
+ if (!h) {
ret = -ENOENT;
errmsg = "Timeout policy base is empty";
goto out;
}
- timeout = timeout_find_get(net, timeout_name);
+ timeout = h->timeout_find_get(net, timeout_name);
if (!timeout) {
ret = -ENOENT;
pr_info_ratelimited("No such timeout policy \"%s\"\n",
@@ -119,37 +114,18 @@ EXPORT_SYMBOL_GPL(nf_ct_set_timeout);
void nf_ct_destroy_timeout(struct nf_conn *ct)
{
struct nf_conn_timeout *timeout_ext;
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h;
rcu_read_lock();
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ h = rcu_dereference(nf_ct_timeout_hook);
- if (timeout_put) {
+ if (h) {
timeout_ext = nf_ct_timeout_find(ct);
if (timeout_ext) {
- timeout_put(timeout_ext->timeout);
+ h->timeout_put(timeout_ext->timeout);
RCU_INIT_POINTER(timeout_ext->timeout, NULL);
}
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
-
-static const struct nf_ct_ext_type timeout_extend = {
- .len = sizeof(struct nf_conn_timeout),
- .align = __alignof__(struct nf_conn_timeout),
- .id = NF_CT_EXT_TIMEOUT,
-};
-
-int nf_conntrack_timeout_init(void)
-{
- int ret = nf_ct_extend_register(&timeout_extend);
- if (ret < 0)
- pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
- return ret;
-}
-
-void nf_conntrack_timeout_fini(void)
-{
- nf_ct_extend_unregister(&timeout_extend);
-}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index f656d393fa92..9e43a0a59e73 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -19,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly;
module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
-static const struct nf_ct_ext_type tstamp_extend = {
- .len = sizeof(struct nf_conn_tstamp),
- .align = __alignof__(struct nf_conn_tstamp),
- .id = NF_CT_EXT_TSTAMP,
-};
-
void nf_conntrack_tstamp_pernet_init(struct net *net)
{
net->ct.sysctl_tstamp = nf_ct_tstamp;
}
-
-int nf_conntrack_tstamp_init(void)
-{
- int ret;
- ret = nf_ct_extend_register(&tstamp_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_tstamp_fini(void)
-{
- nf_ct_extend_unregister(&tstamp_extend);
-}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 2d06a66899b2..58c06ac10179 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -838,7 +838,7 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0;
}
-static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
unsigned int h;
@@ -860,7 +860,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
- __nf_nat_cleanup_conntrack(ct);
+ nf_nat_cleanup_conntrack(ct);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -868,20 +868,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
return 0;
}
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
- if (ct->status & IPS_SRC_NAT_DONE)
- __nf_nat_cleanup_conntrack(ct);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
- .len = sizeof(struct nf_conn_nat),
- .align = __alignof__(struct nf_conn_nat),
- .destroy = nf_nat_cleanup_conntrack,
- .id = NF_CT_EXT_NAT,
-};
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -1173,6 +1159,7 @@ static const struct nf_nat_hook nat_hook = {
.decode_session = __nf_nat_decode_session,
#endif
.manip_pkt = nf_nat_manip_pkt,
+ .remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
static int __init nf_nat_init(void)
@@ -1188,19 +1175,11 @@ static int __init nf_nat_init(void)
if (!nf_nat_bysource)
return -ENOMEM;
- ret = nf_ct_extend_register(&nat_extend);
- if (ret < 0) {
- kvfree(nf_nat_bysource);
- pr_err("Unable to register extension\n");
- return ret;
- }
-
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
- nf_ct_extend_unregister(&nat_extend);
kvfree(nf_nat_bysource);
return ret;
}
@@ -1219,7 +1198,6 @@ static void __exit nf_nat_cleanup(void)
nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
- nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_hook, NULL);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 2dfc5dae0656..e479dd0561c5 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -236,12 +236,6 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
return 1;
}
-static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
- .len = sizeof(struct nf_conn_synproxy),
- .align = __alignof__(struct nf_conn_synproxy),
- .id = NF_CT_EXT_SYNPROXY,
-};
-
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -387,28 +381,12 @@ static struct pernet_operations synproxy_net_ops = {
static int __init synproxy_core_init(void)
{
- int err;
-
- err = nf_ct_extend_register(&nf_ct_synproxy_extend);
- if (err < 0)
- goto err1;
-
- err = register_pernet_subsys(&synproxy_net_ops);
- if (err < 0)
- goto err2;
-
- return 0;
-
-err2:
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
-err1:
- return err;
+ return register_pernet_subsys(&synproxy_net_ops);
}
static void __exit synproxy_core_exit(void)
{
unregister_pernet_subsys(&synproxy_net_ops);
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
}
module_init(synproxy_core_init);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 36e73f9828c5..c6c05b2412c4 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -67,6 +67,20 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static void nft_cmp16_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ const u64 *reg_data = (const u64 *)&regs->data[priv->sreg];
+ const u64 *mask = (const u64 *)&priv->mask;
+ const u64 *data = (const u64 *)&priv->data;
+
+ if (((reg_data[0] & mask[0]) == data[0] &&
+ ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv)
+ return;
+ regs->verdict.code = NFT_BREAK;
+}
+
static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
const struct nft_chain *chain,
const struct nft_regs *regs)
@@ -225,6 +239,8 @@ next_rule:
nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_cmp16_fast_ops)
+ nft_cmp16_fast_eval(expr, &regs);
else if (expr->ops == &nft_bitwise_fast_ops)
nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index c57673d499be..b0d8888a539b 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -605,6 +605,11 @@ static struct pernet_operations cttimeout_ops = {
.size = sizeof(struct nfct_timeout_pernet),
};
+static const struct nf_ct_timeout_hooks hooks = {
+ .timeout_find_get = ctnl_timeout_find_get,
+ .timeout_put = ctnl_timeout_put,
+};
+
static int __init cttimeout_init(void)
{
int ret;
@@ -619,8 +624,7 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+ RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks);
return 0;
err_out:
@@ -633,8 +637,7 @@ static void __exit cttimeout_exit(void)
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+ RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
synchronize_rcu();
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea2d9c2a44cf..8c15978d9258 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -402,6 +402,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* priority */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
@@ -559,6 +560,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
goto nla_put_failure;
+ if (entskb->priority &&
+ nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
+ goto nla_put_failure;
+
if (indev && entskb->dev &&
skb_mac_header_was_set(entskb) &&
skb_mac_header_len(entskb) != 0) {
@@ -1014,11 +1019,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
[NFQA_VLAN] = { .type = NLA_NESTED },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static struct nfqnl_instance *
@@ -1099,6 +1106,9 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
}
return 0;
@@ -1225,6 +1235,9 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
return 0;
}
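With NFQA_PRIORITY accepted in verdicts, userspace can rewrite skb->priority the same way it already rewrites skb->mark. A hedged libmnl/libnetfilter_queue sketch (standard APIs, but the socket/callback scaffolding is omitted and uapi headers providing NFQA_PRIORITY are assumed):

#include <stdint.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <libnetfilter_queue/libnetfilter_queue.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink_queue.h>

static void verdict_with_priority(struct nlmsghdr *nlh,
				  uint32_t packet_id, uint32_t prio)
{
	nfq_nlmsg_verdict_put(nlh, packet_id, NF_ACCEPT);
	/* Both attributes are plain u32s carried in network byte order. */
	mnl_attr_put_u32(nlh, NFQA_MARK, htonl(42));
	mnl_attr_put_u32(nlh, NFQA_PRIORITY, htonl(prio));
}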
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 47b6d05f1ae6..917072af09df 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -272,12 +272,103 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.offload = nft_cmp_fast_offload,
};
+static u32 nft_cmp_mask(u32 bitlen)
+{
+ return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen));
+}
+
+static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen)
+{
+ int len = bitlen / BITS_PER_BYTE;
+ int i, words = len / sizeof(u32);
+
+ for (i = 0; i < words; i++) {
+ data->data[i] = 0xffffffff;
+ bitlen -= sizeof(u32) * BITS_PER_BYTE;
+ }
+
+ if (len % sizeof(u32))
+ data->data[i++] = nft_cmp_mask(bitlen);
+
+ for (; i < 4; i++)
+ data->data[i] = 0;
+}
+
+static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ int err;
+
+ err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
+ tb[NFTA_CMP_DATA]);
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ if (err < 0)
+ return err;
+
+ nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE);
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
+ priv->len = desc.len;
+
+ return 0;
+}
+
+static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_cmp_expr cmp = {
+ .data = priv->data,
+ .sreg = priv->sreg,
+ .len = priv->len,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
+ };
+
+ return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+
+ if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+
+const struct nft_expr_ops nft_cmp16_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp16_fast_init,
+ .dump = nft_cmp16_fast_dump,
+ .offload = nft_cmp16_fast_offload,
+};
+
static const struct nft_expr_ops *
nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
struct nft_data_desc desc;
struct nft_data data;
enum nft_cmp_ops op;
+ u8 sreg;
int err;
if (tb[NFTA_CMP_SREG] == NULL ||
@@ -306,9 +397,16 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
if (desc.type != NFT_DATA_VALUE)
goto err1;
- if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ))
- return &nft_cmp_fast_ops;
+ sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
+ if (desc.len <= sizeof(u32))
+ return &nft_cmp_fast_ops;
+ else if (desc.len <= sizeof(data) &&
+ ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) ||
+ (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0)))
+ return &nft_cmp16_fast_ops;
+ }
return &nft_cmp_ops;
err1:
nft_data_release(&data, desc.type);
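nft_cmp16_fast_mask() above fills the leading desc.len bytes of a 16-byte mask with ones and zeroes the remainder. A standalone reproduction of that construction for a 6-byte compare, assuming a little-endian host (where the kernel's cpu_to_le32() is a no-op):

#include <stdint.h>
#include <stdio.h>

static uint32_t cmp_mask(uint32_t bitlen)
{
	return ~0U >> (32 - bitlen);
}

int main(void)
{
	uint32_t mask[4] = { 0 }, bitlen = 48;	/* 6-byte compare */
	unsigned int i, len = bitlen / 8, words = len / 4;

	for (i = 0; i < words; i++, bitlen -= 32)
		mask[i] = 0xffffffff;
	if (len % 4)
		mask[i++] = cmp_mask(bitlen);
	for (; i < 4; i++)
		mask[i] = 0;

	/* prints: ffffffff 0000ffff 00000000 00000000 */
	for (i = 0; i < 4; i++)
		printf("%08x%c", mask[i], i == 3 ? '\n' : ' ');
	return 0;
}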
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index f69cc73c5813..5a46d8289d1d 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -731,6 +731,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
static struct nft_expr_type nft_match_type;
+static bool nft_match_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct xt_match *match = expr->ops->data;
+
+ return strcmp(match->name, "comment") == 0;
+}
+
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -773,6 +781,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_match_dump;
ops->validate = nft_match_validate;
ops->data = match;
+ ops->reduce = nft_match_reduce;
matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
if (matchsize > NFT_MATCH_LARGE_THRESH) {
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 9e927ab4df15..d2b9378164bb 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -308,6 +308,63 @@ err:
regs->verdict.code = NFT_BREAK;
}
+static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int i, tcphdr_len, optl;
+ struct tcphdr *tcph;
+ u8 *opt;
+
+ tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!tcph)
+ goto err;
+
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto drop;
+
+ opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!opt)
+ goto err;
+ for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+ unsigned int j;
+
+ optl = optlen(opt, i);
+ if (priv->type != opt[i])
+ continue;
+
+ if (i + optl > tcphdr_len)
+ goto drop;
+
+ for (j = 0; j < optl; ++j) {
+ u16 n = TCPOPT_NOP;
+ u16 o = opt[i+j];
+
+ if ((i + j) % 2 == 0) {
+ o <<= 8;
+ n <<= 8;
+ }
+ inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o),
+ htons(n), false);
+ }
+ memset(opt + i, TCPOPT_NOP, optl);
+ return;
+ }
+
+ /* option not found, continue. This allows multiple
+ * option removals per rule.
+ */
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+ return;
+drop:
+ /* can't remove, no choice but to drop */
+ regs->verdict.code = NF_DROP;
+}
+
static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -457,6 +514,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
priv->len);
}
+static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_EXTHDR_SREG] ||
+ tb[NFTA_EXTHDR_DREG] ||
+ tb[NFTA_EXTHDR_FLAGS] ||
+ tb[NFTA_EXTHDR_OFFSET] ||
+ tb[NFTA_EXTHDR_LEN])
+ return -EINVAL;
+
+ if (!tb[NFTA_EXTHDR_TYPE])
+ return -EINVAL;
+
+ priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+ priv->op = NFT_EXTHDR_OP_TCPOPT;
+
+ return 0;
+}
+
static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -517,6 +596,13 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
+static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ return nft_exthdr_dump_common(skb, priv);
+}
+
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -549,6 +635,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
.dump = nft_exthdr_dump_set,
};
+static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_tcp_strip_eval,
+ .init = nft_exthdr_tcp_strip_init,
+ .dump = nft_exthdr_dump_strip,
+};
+
static const struct nft_expr_ops nft_exthdr_sctp_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -576,7 +670,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_tcp_set_ops;
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_tcp_ops;
- break;
+ return &nft_exthdr_tcp_strip_ops;
case NFT_EXTHDR_OP_IPV6:
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 67ad08320886..7e8a39a35627 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -37,6 +37,7 @@
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/pkt_cls.h>
#include "datapath.h"
#include "flow.h"
@@ -1601,8 +1602,6 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,
dp->user_features = 0;
}
-DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
-
static int ovs_dp_set_upcall_portids(struct datapath *dp,
const struct nlattr *ids)
{
@@ -1657,7 +1656,7 @@ u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
- u32 user_features = 0;
+ u32 user_features = 0, old_features = dp->user_features;
int err;
if (a[OVS_DP_ATTR_USER_FEATURES]) {
@@ -1696,10 +1695,12 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
return err;
}
- if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
- static_branch_enable(&tc_recirc_sharing_support);
- else
- static_branch_disable(&tc_recirc_sharing_support);
+ if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) &&
+ !(old_features & OVS_DP_F_TC_RECIRC_SHARING))
+ tc_skb_ext_tc_enable();
+ else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) &&
+ (old_features & OVS_DP_F_TC_RECIRC_SHARING))
+ tc_skb_ext_tc_disable();
return 0;
}
@@ -1839,6 +1840,9 @@ static void __dp_destroy(struct datapath *dp)
struct flow_table *table = &dp->table;
int i;
+ if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
+ tc_skb_ext_tc_disable();
+
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
struct vport *vport;
struct hlist_node *n;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index fcfe6cb46441..0cd29971a907 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -253,8 +253,6 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
extern struct notifier_block ovs_dp_device_notifier;
extern struct genl_family dp_vport_genl_family;
-DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
-
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key);
void ovs_dp_detach_port(struct vport *);
int ovs_dp_upcall(struct datapath *, struct sk_buff *,
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 02096f2ec678..f6cd24fd530c 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -34,6 +34,7 @@
#include <net/mpls.h>
#include <net/ndisc.h>
#include <net/nsh.h>
+#include <net/pkt_cls.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include "conntrack.h"
@@ -895,7 +896,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->mac_proto = res;
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- if (static_branch_unlikely(&tc_recirc_sharing_support)) {
+ if (tc_skb_ext_tc_enabled()) {
tc_ext = skb_ext_find(skb, TC_SKB_EXT);
key->recirc_id = tc_ext ? tc_ext->chain : 0;
OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index f99247fc6468..7108e71ce4db 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -57,12 +57,6 @@ static const struct rhashtable_params zones_params = {
.automatic_shrinking = true,
};
-static struct nf_ct_ext_type act_ct_extend __read_mostly = {
- .len = sizeof(struct nf_conn_act_ct_ext),
- .align = __alignof__(struct nf_conn_act_ct_ext),
- .id = NF_CT_EXT_ACT_CT,
-};
-
static struct flow_action_entry *
tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
{
@@ -1608,16 +1602,10 @@ static int __init ct_init_module(void)
if (err)
goto err_register;
- err = nf_ct_extend_register(&act_ct_extend);
- if (err)
- goto err_register_extend;
-
static_branch_inc(&tcf_frag_xmit_count);
return 0;
-err_register_extend:
- tcf_unregister_action(&act_ct_ops, &ct_net_ops);
err_register:
tcf_ct_flow_tables_uninit();
err_tbl_init:
@@ -1628,7 +1616,6 @@ err_tbl_init:
static void __exit ct_cleanup_module(void)
{
static_branch_dec(&tcf_frag_xmit_count);
- nf_ct_extend_unregister(&act_ct_extend);
tcf_unregister_action(&act_ct_ops, &ct_net_ops);
tcf_ct_flow_tables_uninit();
destroy_workqueue(act_ct_wq);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 0923aa2b8f8a..899fe025df77 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -239,6 +239,20 @@ release_idr:
return err;
}
+static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit)
+{
+ u32 len;
+
+ if (skb_is_gso(skb))
+ return skb_gso_validate_mac_len(skb, limit);
+
+ len = qdisc_pkt_len(skb);
+ if (skb_at_tc_ingress(skb))
+ len += skb->mac_len;
+
+ return len <= limit;
+}
+
static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -261,7 +275,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
goto inc_overlimits;
}
- if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
+ if (tcf_police_mtu_check(skb, p->tcfp_mtu)) {
if (!p->rate_present && !p->pps_present) {
ret = p->tcfp_result;
goto end;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5ce1208a6ea3..2957f8f5cea7 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -49,6 +49,23 @@ static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc);
+EXPORT_SYMBOL(tc_skb_ext_tc);
+
+void tc_skb_ext_tc_enable(void)
+{
+ static_branch_inc(&tc_skb_ext_tc);
+}
+EXPORT_SYMBOL(tc_skb_ext_tc_enable);
+
+void tc_skb_ext_tc_disable(void)
+{
+ static_branch_dec(&tc_skb_ext_tc);
+}
+EXPORT_SYMBOL(tc_skb_ext_tc_disable);
+#endif
+
static u32 destroy_obj_hashfn(const struct tcf_proto *tp)
{
return jhash_3words(tp->chain->index, tp->prio,
@@ -1615,19 +1632,21 @@ int tcf_classify(struct sk_buff *skb,
ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
&last_executed_chain);
- /* If we missed on some chain */
- if (ret == TC_ACT_UNSPEC && last_executed_chain) {
- struct tc_skb_cb *cb = tc_skb_cb(skb);
-
- ext = tc_skb_ext_alloc(skb);
- if (WARN_ON_ONCE(!ext))
- return TC_ACT_SHOT;
- ext->chain = last_executed_chain;
- ext->mru = cb->mru;
- ext->post_ct = cb->post_ct;
- ext->post_ct_snat = cb->post_ct_snat;
- ext->post_ct_dnat = cb->post_ct_dnat;
- ext->zone = cb->zone;
+ if (tc_skb_ext_tc_enabled()) {
+ /* If we missed on some chain */
+ if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+ struct tc_skb_cb *cb = tc_skb_cb(skb);
+
+ ext = tc_skb_ext_alloc(skb);
+ if (WARN_ON_ONCE(!ext))
+ return TC_ACT_SHOT;
+ ext->chain = last_executed_chain;
+ ext->mru = cb->mru;
+ ext->post_ct = cb->post_ct;
+ ext->post_ct_snat = cb->post_ct_snat;
+ ext->post_ct_dnat = cb->post_ct_dnat;
+ ext->zone = cb->zone;
+ }
}
return ret;
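
The three helpers exported here follow the usual static-key consumer pattern: a subsystem that wants tcf_classify() to record the last executed chain in a TC_SKB_EXT extension on a chain miss bumps the key while it needs that metadata and drops it again afterwards, so unrelated workloads skip the extension allocation entirely. A minimal sketch with a hypothetical module as the consumer; only tc_skb_ext_tc_enable()/tc_skb_ext_tc_disable() (and the tc_skb_ext_tc_enabled() check used in the OVS hunk above, via <net/pkt_cls.h>) come from this patch:

    #include <linux/module.h>
    #include <net/pkt_cls.h>

    static int __init tc_ext_consumer_init(void)
    {
        /* turn the key on: tcf_classify() now records the last executed
         * chain in a TC_SKB_EXT extension when a chain miss occurs
         */
        tc_skb_ext_tc_enable();
        return 0;
    }

    static void __exit tc_ext_consumer_exit(void)
    {
        /* balance the enable so the fast path goes back to a no-op */
        tc_skb_ext_tc_disable();
    }

    module_init(tc_ext_consumer_init);
    module_exit(tc_ext_consumer_exit);
    MODULE_LICENSE("GPL");

On the read side, the openvswitch hunk at the top of this section shows the matching check: skb_ext_find(skb, TC_SKB_EXT) is only attempted when tc_skb_ext_tc_enabled() is true.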
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 306d9e8cd1dd..b40ae4d28cbf 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -59,12 +59,52 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
+static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */
struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
struct workqueue_struct *smc_close_wq; /* wq for close work */
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ void *hdr;
+
+ if (cb_ctx->pos[0])
+ goto out;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_DUMP_HS_LIMITATION);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
+ sock_net(skb->sk)->smc.limit_smc_hs))
+ goto err;
+
+ genlmsg_end(skb, hdr);
+ cb_ctx->pos[0] = 1;
+out:
+ return skb->len;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = true;
+ return 0;
+}
+
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = false;
+ return 0;
+}
+
static void smc_set_keepalive(struct sock *sk, int val)
{
struct smc_sock *smc = smc_sk(sk);
@@ -72,6 +112,51 @@ static void smc_set_keepalive(struct sock *sk, int val)
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
+static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
+ sk->sk_max_ack_backlog)
+ goto drop;
+
+ if (sk_acceptq_is_full(&smc->sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ /* passthrough to original syn recv sock fct */
+ return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
+ own_req);
+
+drop:
+ dst_release(dst);
+ tcp_listendrop(sk);
+ return NULL;
+}
+
+static bool smc_hs_congested(const struct sock *sk)
+{
+ const struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (!smc)
+ return true;
+
+ if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
+ return true;
+
+ return false;
+}
+
static struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
@@ -1291,8 +1376,14 @@ static int __smc_connect(struct smc_sock *smc)
/* perform CLC handshake */
rc = smc_connect_clc(smc, aclc2, ini);
- if (rc)
+ if (rc) {
+ /* -EAGAIN on timeout, see tcp_recvmsg() */
+ if (rc == -EAGAIN) {
+ rc = -ETIMEDOUT;
+ smc->sk.sk_err = ETIMEDOUT;
+ }
goto vlan_cleanup;
+ }
/* check if smc modes and versions of CLC proposal and accept match */
rc = smc_connect_check_aclc(ini, aclc);
@@ -1598,6 +1689,9 @@ static void smc_listen_out(struct smc_sock *new_smc)
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_dec(&lsmc->queued_smc_hs);
+
if (lsmc->sk.sk_state == SMC_LISTEN) {
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -2203,6 +2297,9 @@ static void smc_tcp_listen_work(struct work_struct *work)
if (!new_smc)
continue;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_inc(&lsmc->queued_smc_hs);
+
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
new_smc->fallback_rsn = lsmc->fallback_rsn;
@@ -2231,7 +2328,7 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
lsmc->clcsk_data_ready(listen_clcsock);
if (lsmc->sk.sk_state == SMC_LISTEN) {
sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
- if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
sock_put(&lsmc->sk);
}
}
@@ -2269,6 +2366,18 @@ static int smc_listen(struct socket *sock, int backlog)
smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
smc->clcsock->sk->sk_user_data =
(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+
+ /* save original ops */
+ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
+
+ smc->af_ops = *smc->ori_af_ops;
+ smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
+
+ inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
+
+ if (smc->limit_smc_hs)
+ tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+
rc = kernel_listen(smc->clcsock, backlog);
if (rc) {
smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
@@ -2562,6 +2671,67 @@ out:
return rc ? rc : rc1;
}
+static int __smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+ int val, len;
+
+ smc = smc_sk(sock->sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ val = smc->limit_smc_hs;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __smc_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int val, rc;
+
+ smc = smc_sk(sk);
+
+ lock_sock(sk);
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
+
+ smc->limit_smc_hs = !!val;
+ rc = 0;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
static int smc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -2571,6 +2741,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
if (level == SOL_TCP && optname == TCP_ULP)
return -EOPNOTSUPP;
+ else if (level == SOL_SMC)
+ return __smc_setsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sk);
@@ -2630,8 +2802,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_CLOSED) {
if (!val) {
SMC_STAT_INC(smc, cork_cnt);
- mod_delayed_work(smc->conn.lgr->tx_wq,
- &smc->conn.tx_work, 0);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
@@ -2653,6 +2825,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
struct smc_sock *smc;
int rc;
+ if (level == SOL_SMC)
+ return __smc_getsockopt(sock, level, optname, optval, optlen);
+
smc = smc_sk(sock->sk);
mutex_lock(&smc->clcsock_release_lock);
if (!smc->clcsock) {
@@ -2769,8 +2944,10 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
rc = kernel_sendpage(smc->clcsock, page, offset,
size, flags);
} else {
+ lock_sock(sk);
+ rc = smc_tx_sendpage(smc, page, offset, size, flags);
+ release_sock(sk);
SMC_STAT_INC(smc, sendpage_cnt);
- rc = sock_no_sendpage(sock, page, offset, size, flags);
}
out:
@@ -2879,6 +3056,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol,
smc->use_fallback = false; /* assume rdma capability first */
smc->fallback_rsn = 0;
+ /* default behavior from limit_smc_hs in every net namespace */
+ smc->limit_smc_hs = net->smc.limit_smc_hs;
+
rc = 0;
if (!clcsock) {
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
@@ -3026,9 +3206,14 @@ static int __init smc_init(void)
goto out_nl;
rc = -ENOMEM;
+
+ smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
+ if (!smc_tcp_ls_wq)
+ goto out_pnet;
+
smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
if (!smc_hs_wq)
- goto out_pnet;
+ goto out_alloc_tcp_ls_wq;
smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
if (!smc_close_wq)
@@ -3099,6 +3284,8 @@ out_alloc_wqs:
destroy_workqueue(smc_close_wq);
out_alloc_hs_wq:
destroy_workqueue(smc_hs_wq);
+out_alloc_tcp_ls_wq:
+ destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
smc_pnet_exit();
out_nl:
@@ -3117,6 +3304,7 @@ static void __exit smc_exit(void)
smc_core_exit();
smc_ib_unregister_client();
destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_tcp_ls_wq);
destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
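
__smc_setsockopt()/__smc_getsockopt() give an application a per-socket override of the net-namespace default (net->smc.limit_smc_hs, copied into the socket in __smc_create() above). A minimal userspace sketch of toggling it; the numeric fallbacks below are assumptions, the real values come from the uapi headers that accompany this patch:

    #include <stdio.h>
    #include <sys/socket.h>

    #ifndef AF_SMC
    #define AF_SMC		43	/* assumed value */
    #endif
    #ifndef SMCPROTO_SMC
    #define SMCPROTO_SMC	0	/* assumed value */
    #endif
    #ifndef SOL_SMC
    #define SOL_SMC		286	/* assumed value */
    #endif
    #ifndef SMC_LIMIT_HS
    #define SMC_LIMIT_HS	0	/* assumed value */
    #endif

    int main(void)
    {
        int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
        int on = 1, cur = 0;
        socklen_t len = sizeof(cur);

        if (fd < 0) {
            perror("socket");
            return 1;
        }
        /* opt this socket in to SMC handshake limitation */
        if (setsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &on, sizeof(on)))
            perror("setsockopt");
        /* read the setting back */
        if (!getsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &cur, &len))
            printf("SMC_LIMIT_HS = %d\n", cur);
        return 0;
    }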
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 37b2001a0255..a096d8af21a0 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -14,6 +14,7 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
+#include <net/genetlink.h>
#include <net/sock.h>
#include "smc_ib.h"
@@ -249,9 +250,14 @@ struct smc_sock { /* smc sock container */
struct work_struct smc_listen_work;/* prepare new accept socket */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
+ bool limit_smc_hs; /* put constraint on handshake */
bool use_fallback; /* fallback to tcp */
int fallback_rsn; /* reason for fallback */
u32 peer_diagnosis; /* decline reason from peer */
+ atomic_t queued_smc_hs; /* queued smc handshakes */
+ struct inet_connection_sock_af_ops af_ops;
+ const struct inet_connection_sock_af_ops *ori_af_ops;
+ /* original af ops */
int sockopt_defer_accept;
/* sockopt TCP_DEFER_ACCEPT
* value
@@ -276,7 +282,7 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
return (struct smc_sock *)sk;
}
-static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk)
+static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
{
return (struct smc_sock *)
((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
@@ -331,4 +337,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr,
struct smc_gidlist *gidlist,
struct smc_ib_device *known_dev, u8 *known_gid);
+/* smc handshake limitation interface for netlink */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+
#endif /* __SMC_H */
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
index f13ab0661ed5..c5a62f6f52ba 100644
--- a/net/smc/smc_netlink.c
+++ b/net/smc/smc_netlink.c
@@ -111,6 +111,21 @@ static const struct genl_ops smc_gen_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.doit = smc_nl_disable_seid,
},
+ {
+ .cmd = SMC_NETLINK_DUMP_HS_LIMITATION,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_hs_limitation,
+ },
};
static const struct nla_policy smc_gen_nl_policy[2] = {
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 0599246c0376..ff61b7b95875 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -870,6 +870,9 @@ int smc_pnet_net_init(struct net *net)
smc_pnet_create_pnetids_list(net);
+ /* disable handshake limitation by default */
+ net->smc.limit_smc_hs = 0;
+
return 0;
}
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index be241d53020f..5df3940d4543 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -31,7 +31,6 @@
#include "smc_tracepoint.h"
#define SMC_TX_WORK_DELAY 0
-#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
@@ -236,16 +235,15 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
*/
if ((msg->msg_flags & MSG_OOB) && !send_remaining)
conn->urg_tx_pend = true;
- if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
- (atomic_read(&conn->sndbuf_space) >
- (conn->sndbuf_desc->len >> 1)))
- /* for a corked socket defer the RDMA writes if there
- * is still sufficient sndbuf_space available
+ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) ||
+ msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
+ (atomic_read(&conn->sndbuf_space)))
+ /* for a corked socket defer the RDMA writes if
+ * sndbuf_space is still available. The applications
+ * should know how/when to uncork it.
*/
- queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
- SMC_TX_CORK_DELAY);
- else
- smc_tx_sndbuf_nonempty(conn);
+ continue;
+ smc_tx_sndbuf_nonempty(conn);
trace_smc_tx_sendmsg(smc, copylen);
} /* while (msg_data_left(msg)) */
@@ -260,6 +258,22 @@ out_err:
return rc;
}
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct msghdr msg = {.msg_flags = flags};
+ char *kaddr = kmap(page);
+ struct kvec iov;
+ int rc;
+
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
+ rc = smc_tx_sendmsg(smc, &msg, size);
+ kunmap(page);
+ return rc;
+}
+
/***************************** sndbuf consumer *******************************/
/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
@@ -598,26 +612,36 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
}
/* Wakeup sndbuf consumers from process context
- * since there is more data to transmit
+ * since there is more data to transmit. The caller
+ * must hold sock lock.
*/
-void smc_tx_work(struct work_struct *work)
+void smc_tx_pending(struct smc_connection *conn)
{
- struct smc_connection *conn = container_of(to_delayed_work(work),
- struct smc_connection,
- tx_work);
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
int rc;
- lock_sock(&smc->sk);
if (smc->sk.sk_err)
- goto out;
+ return;
rc = smc_tx_sndbuf_nonempty(conn);
if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
!atomic_read(&conn->bytes_to_rcv))
conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit in locked
+ * sock.
+ */
+void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(to_delayed_work(work),
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
-out:
+ lock_sock(&smc->sk);
+ smc_tx_pending(conn);
release_sock(&smc->sk);
}
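
With SMC_TX_CORK_DELAY gone, corked data is no longer flushed by a timer: the RDMA write is deferred for as long as MSG_MORE/TCP_CORK is in effect and send-buffer space remains, and the flush happens when the application sends without MSG_MORE or clears TCP_CORK (the uncork path in the af_smc.c hunk above now calls smc_tx_pending() directly). A small userspace sketch of the intended pattern, assuming fd is an already connected AF_SMC socket:

    #include <string.h>
    #include <sys/socket.h>

    /* Batch a header and a payload into one transmission: data sent with
     * MSG_MORE stays in the send buffer while space remains, and the final
     * send() without MSG_MORE pushes everything out.
     */
    static int send_batched(int fd, const char *hdr, const char *payload)
    {
        if (send(fd, hdr, strlen(hdr), MSG_MORE) < 0)
            return -1;
        if (send(fd, payload, strlen(payload), 0) < 0)
            return -1;
        return 0;
    }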
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 07e6ad76224a..34b578498b1f 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
+void smc_tx_pending(struct smc_connection *conn);
void smc_tx_work(struct work_struct *work);
void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+ size_t size, int flags);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 5f42aa5fc612..8eb7e8544815 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -72,7 +72,8 @@ struct gss_auth {
struct gss_api_mech *mech;
enum rpc_gss_svc service;
struct rpc_clnt *client;
- struct net *net;
+ struct net *net;
+ netns_tracker ns_tracker;
/*
* There are two upcall pipes; dentry[1], named "gssd", is used
* for the new text-based upcall; dentry[0] is named after the
@@ -1013,7 +1014,8 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
goto err_free;
}
gss_auth->client = clnt;
- gss_auth->net = get_net(rpc_net_ns(clnt));
+ gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker,
+ GFP_KERNEL);
err = -EINVAL;
gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
if (!gss_auth->mech)
@@ -1068,7 +1070,7 @@ err_destroy_credcache:
err_put_mech:
gss_mech_put(gss_auth->mech);
err_put_net:
- put_net(gss_auth->net);
+ put_net_track(gss_auth->net, &gss_auth->ns_tracker);
err_free:
kfree(gss_auth->target_name);
kfree(gss_auth);
@@ -1084,7 +1086,7 @@ gss_free(struct gss_auth *gss_auth)
gss_pipe_free(gss_auth->gss_pipe[0]);
gss_pipe_free(gss_auth->gss_pipe[1]);
gss_mech_put(gss_auth->mech);
- put_net(gss_auth->net);
+ put_net_track(gss_auth->net, &gss_auth->ns_tracker);
kfree(gss_auth->target_name);
kfree(gss_auth);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b21ad7994147..db878e833b67 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -162,7 +162,7 @@ static void svc_xprt_free(struct kref *kref)
if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
svcauth_unix_info_release(xprt);
put_cred(xprt->xpt_cred);
- put_net(xprt->xpt_net);
+ put_net_track(xprt->xpt_net, &xprt->ns_tracker);
/* See comment on corresponding get in xs_setup_bc_tcp(): */
if (xprt->xpt_bc_xprt)
xprt_put(xprt->xpt_bc_xprt);
@@ -198,7 +198,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
mutex_init(&xprt->xpt_mutex);
spin_lock_init(&xprt->xpt_lock);
set_bit(XPT_BUSY, &xprt->xpt_flags);
- xprt->xpt_net = get_net(net);
+ xprt->xpt_net = get_net_track(net, &xprt->ns_tracker, GFP_ATOMIC);
strcpy(xprt->xpt_remotebuf, "uninitialized");
}
EXPORT_SYMBOL_GPL(svc_xprt_init);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a02de2bddb28..5af484d6ba5e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1835,7 +1835,7 @@ EXPORT_SYMBOL_GPL(xprt_alloc);
void xprt_free(struct rpc_xprt *xprt)
{
- put_net(xprt->xprt_net);
+ put_net_track(xprt->xprt_net, &xprt->ns_tracker);
xprt_free_all_slots(xprt);
xprt_free_id(xprt);
rpc_sysfs_xprt_destroy(xprt);
@@ -2027,7 +2027,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
xprt_init_xid(xprt);
- xprt->xprt_net = get_net(net);
+ xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL);
}
/**
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index b62565278fac..6a00c390547b 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -85,7 +85,7 @@ static int switchdev_deferred_enqueue(struct net_device *dev,
{
struct switchdev_deferred_item *dfitem;
- dfitem = kmalloc(sizeof(*dfitem) + data_len, GFP_ATOMIC);
+ dfitem = kmalloc(struct_size(dfitem, data, data_len), GFP_ATOMIC);
if (!dfitem)
return -ENOMEM;
dfitem->dev = dev;
@@ -409,6 +409,27 @@ static int switchdev_lower_dev_walk(struct net_device *lower_dev,
}
static struct net_device *
+switchdev_lower_dev_find_rcu(struct net_device *dev,
+ bool (*check_cb)(const struct net_device *dev),
+ bool (*foreign_dev_check_cb)(const struct net_device *dev,
+ const struct net_device *foreign_dev))
+{
+ struct switchdev_nested_priv switchdev_priv = {
+ .check_cb = check_cb,
+ .foreign_dev_check_cb = foreign_dev_check_cb,
+ .dev = dev,
+ .lower_dev = NULL,
+ };
+ struct netdev_nested_priv priv = {
+ .data = &switchdev_priv,
+ };
+
+ netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv);
+
+ return switchdev_priv.lower_dev;
+}
+
+static struct net_device *
switchdev_lower_dev_find(struct net_device *dev,
bool (*check_cb)(const struct net_device *dev),
bool (*foreign_dev_check_cb)(const struct net_device *dev,
@@ -424,7 +445,7 @@ switchdev_lower_dev_find(struct net_device *dev,
.data = &switchdev_priv,
};
- netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv);
+ netdev_walk_all_lower_dev(dev, switchdev_lower_dev_walk, &priv);
return switchdev_priv.lower_dev;
}
@@ -451,7 +472,7 @@ static int __switchdev_handle_fdb_event_to_device(struct net_device *dev,
return mod_cb(dev, orig_dev, event, info->ctx, fdb_info);
if (netif_is_lag_master(dev)) {
- if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb))
+ if (!switchdev_lower_dev_find_rcu(dev, check_cb, foreign_dev_check_cb))
goto maybe_bridged_with_us;
/* This is a LAG interface that we offload */
@@ -465,7 +486,7 @@ static int __switchdev_handle_fdb_event_to_device(struct net_device *dev,
* towards a bridge device.
*/
if (netif_is_bridge_master(dev)) {
- if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb))
+ if (!switchdev_lower_dev_find_rcu(dev, check_cb, foreign_dev_check_cb))
return 0;
/* This is a bridge interface that we offload */
@@ -478,8 +499,8 @@ static int __switchdev_handle_fdb_event_to_device(struct net_device *dev,
* that we offload.
*/
if (!check_cb(lower_dev) &&
- !switchdev_lower_dev_find(lower_dev, check_cb,
- foreign_dev_check_cb))
+ !switchdev_lower_dev_find_rcu(lower_dev, check_cb,
+ foreign_dev_check_cb))
continue;
err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev,
@@ -501,7 +522,7 @@ maybe_bridged_with_us:
if (!br || !netif_is_bridge_master(br))
return 0;
- if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb))
+ if (!switchdev_lower_dev_find_rcu(br, check_cb, foreign_dev_check_cb))
return 0;
return __switchdev_handle_fdb_event_to_device(br, orig_dev, event, fdb_info,
@@ -536,13 +557,15 @@ EXPORT_SYMBOL_GPL(switchdev_handle_fdb_event_to_device);
static int __switchdev_handle_port_obj_add(struct net_device *dev,
struct switchdev_notifier_port_obj_info *port_obj_info,
bool (*check_cb)(const struct net_device *dev),
+ bool (*foreign_dev_check_cb)(const struct net_device *dev,
+ const struct net_device *foreign_dev),
int (*add_cb)(struct net_device *dev, const void *ctx,
const struct switchdev_obj *obj,
struct netlink_ext_ack *extack))
{
struct switchdev_notifier_info *info = &port_obj_info->info;
+ struct net_device *br, *lower_dev;
struct netlink_ext_ack *extack;
- struct net_device *lower_dev;
struct list_head *iter;
int err = -EOPNOTSUPP;
@@ -566,15 +589,42 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev,
if (netif_is_bridge_master(lower_dev))
continue;
+ /* When searching for switchdev interfaces that are neighbors
+ * of foreign ones, and @dev is a bridge, do not recurse on the
+ * foreign interface again, it was already visited.
+ */
+ if (foreign_dev_check_cb && !check_cb(lower_dev) &&
+ !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb))
+ continue;
+
err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info,
- check_cb, add_cb);
+ check_cb, foreign_dev_check_cb,
+ add_cb);
if (err && err != -EOPNOTSUPP)
return err;
}
- return err;
+ /* Event is neither on a bridge nor a LAG. Check whether it is on an
+ * interface that is in a bridge with us.
+ */
+ if (!foreign_dev_check_cb)
+ return err;
+
+ br = netdev_master_upper_dev_get(dev);
+ if (!br || !netif_is_bridge_master(br))
+ return err;
+
+ if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb))
+ return err;
+
+ return __switchdev_handle_port_obj_add(br, port_obj_info, check_cb,
+ foreign_dev_check_cb, add_cb);
}
+/* Pass through a port object addition, if @dev passes @check_cb, or replicate
+ * it towards all lower interfaces of @dev that pass @check_cb, if @dev is a
+ * bridge or a LAG.
+ */
int switchdev_handle_port_obj_add(struct net_device *dev,
struct switchdev_notifier_port_obj_info *port_obj_info,
bool (*check_cb)(const struct net_device *dev),
@@ -585,21 +635,46 @@ int switchdev_handle_port_obj_add(struct net_device *dev,
int err;
err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb,
- add_cb);
+ NULL, add_cb);
if (err == -EOPNOTSUPP)
err = 0;
return err;
}
EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add);
+/* Same as switchdev_handle_port_obj_add(), except if object is notified on a
+ * @dev that passes @foreign_dev_check_cb, it is replicated towards all devices
+ * that pass @check_cb and are in the same bridge as @dev.
+ */
+int switchdev_handle_port_obj_add_foreign(struct net_device *dev,
+ struct switchdev_notifier_port_obj_info *port_obj_info,
+ bool (*check_cb)(const struct net_device *dev),
+ bool (*foreign_dev_check_cb)(const struct net_device *dev,
+ const struct net_device *foreign_dev),
+ int (*add_cb)(struct net_device *dev, const void *ctx,
+ const struct switchdev_obj *obj,
+ struct netlink_ext_ack *extack))
+{
+ int err;
+
+ err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb,
+ foreign_dev_check_cb, add_cb);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add_foreign);
+
static int __switchdev_handle_port_obj_del(struct net_device *dev,
struct switchdev_notifier_port_obj_info *port_obj_info,
bool (*check_cb)(const struct net_device *dev),
+ bool (*foreign_dev_check_cb)(const struct net_device *dev,
+ const struct net_device *foreign_dev),
int (*del_cb)(struct net_device *dev, const void *ctx,
const struct switchdev_obj *obj))
{
struct switchdev_notifier_info *info = &port_obj_info->info;
- struct net_device *lower_dev;
+ struct net_device *br, *lower_dev;
struct list_head *iter;
int err = -EOPNOTSUPP;
@@ -621,15 +696,42 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev,
if (netif_is_bridge_master(lower_dev))
continue;
+ /* When searching for switchdev interfaces that are neighbors
+ * of foreign ones, and @dev is a bridge, do not recurse on the
+ * foreign interface again, it was already visited.
+ */
+ if (foreign_dev_check_cb && !check_cb(lower_dev) &&
+ !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb))
+ continue;
+
err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info,
- check_cb, del_cb);
+ check_cb, foreign_dev_check_cb,
+ del_cb);
if (err && err != -EOPNOTSUPP)
return err;
}
- return err;
+ /* Event is neither on a bridge nor a LAG. Check whether it is on an
+ * interface that is in a bridge with us.
+ */
+ if (!foreign_dev_check_cb)
+ return err;
+
+ br = netdev_master_upper_dev_get(dev);
+ if (!br || !netif_is_bridge_master(br))
+ return err;
+
+ if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb))
+ return err;
+
+ return __switchdev_handle_port_obj_del(br, port_obj_info, check_cb,
+ foreign_dev_check_cb, del_cb);
}
+/* Pass through a port object deletion, if @dev passes @check_cb, or replicate
+ * it towards all lower interfaces of @dev that pass @check_cb, if @dev is a
+ * bridge or a LAG.
+ */
int switchdev_handle_port_obj_del(struct net_device *dev,
struct switchdev_notifier_port_obj_info *port_obj_info,
bool (*check_cb)(const struct net_device *dev),
@@ -639,13 +741,35 @@ int switchdev_handle_port_obj_del(struct net_device *dev,
int err;
err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb,
- del_cb);
+ NULL, del_cb);
if (err == -EOPNOTSUPP)
err = 0;
return err;
}
EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del);
+/* Same as switchdev_handle_port_obj_del(), except if object is notified on a
+ * @dev that passes @foreign_dev_check_cb, it is replicated towards all devices
+ * that pass @check_cb and are in the same bridge as @dev.
+ */
+int switchdev_handle_port_obj_del_foreign(struct net_device *dev,
+ struct switchdev_notifier_port_obj_info *port_obj_info,
+ bool (*check_cb)(const struct net_device *dev),
+ bool (*foreign_dev_check_cb)(const struct net_device *dev,
+ const struct net_device *foreign_dev),
+ int (*del_cb)(struct net_device *dev, const void *ctx,
+ const struct switchdev_obj *obj))
+{
+ int err;
+
+ err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb,
+ foreign_dev_check_cb, del_cb);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ return err;
+}
+EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del_foreign);
+
static int __switchdev_handle_port_attr_set(struct net_device *dev,
struct switchdev_notifier_port_attr_info *port_attr_info,
bool (*check_cb)(const struct net_device *dev),
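
The _foreign() variants let a switchdev driver react to objects (VLANs, MDB entries) notified on a foreign interface that shares a bridge with one of its ports. A driver-side sketch, where everything named foo_* is hypothetical and only switchdev_handle_port_obj_add_foreign() and its callback signatures come from this patch:

    #include <linux/netdevice.h>
    #include <linux/notifier.h>
    #include <net/switchdev.h>

    static const struct net_device_ops foo_netdev_ops;

    static bool foo_port_dev_check(const struct net_device *dev)
    {
        return dev->netdev_ops == &foo_netdev_ops;
    }

    static bool foo_foreign_dev_check(const struct net_device *dev,
                                      const struct net_device *foreign_dev)
    {
        /* treat any interface that is not one of our ports as foreign */
        return !foo_port_dev_check(foreign_dev);
    }

    static int foo_port_obj_add(struct net_device *dev, const void *ctx,
                                const struct switchdev_obj *obj,
                                struct netlink_ext_ack *extack)
    {
        /* program the object into hardware here */
        return -EOPNOTSUPP;
    }

    static int foo_switchdev_blocking_event(struct notifier_block *nb,
                                            unsigned long event, void *ptr)
    {
        struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
        int err;

        switch (event) {
        case SWITCHDEV_PORT_OBJ_ADD:
            err = switchdev_handle_port_obj_add_foreign(dev, ptr,
                                                        foo_port_dev_check,
                                                        foo_foreign_dev_check,
                                                        foo_port_obj_add);
            return notifier_from_errno(err);
        }
        return NOTIFY_DONE;
    }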
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 64ae4c4c44f8..c5eec16213d7 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -226,14 +226,6 @@ static inline void msg_set_bits(struct tipc_msg *m, u32 w,
m->hdr[w] |= htonl(val);
}
-static inline void msg_swap_words(struct tipc_msg *msg, u32 a, u32 b)
-{
- u32 temp = msg->hdr[a];
-
- msg->hdr[a] = msg->hdr[b];
- msg->hdr[b] = temp;
-}
-
/*
* Word 0
*/
@@ -480,11 +472,6 @@ static inline void msg_incr_reroute_cnt(struct tipc_msg *m)
msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1);
}
-static inline void msg_reset_reroute_cnt(struct tipc_msg *m)
-{
- msg_set_bits(m, 1, 21, 0xf, 0);
-}
-
static inline u32 msg_lookup_scope(struct tipc_msg *m)
{
return msg_bits(m, 1, 19, 0x3);
@@ -800,11 +787,6 @@ static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n)
msg_set_word(m, 2, n);
}
-static inline u32 msg_bcgap_after(struct tipc_msg *m)
-{
- return msg_bits(m, 2, 16, 0xffff);
-}
-
static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
{
msg_set_bits(m, 2, 16, 0xffff, n);
@@ -868,11 +850,6 @@ static inline void msg_set_next_sent(struct tipc_msg *m, u16 n)
msg_set_bits(m, 4, 0, 0xffff, n);
}
-static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n)
-{
- msg_set_bits(m, 4, 0, 0xffff, n);
-}
-
static inline u32 msg_bc_netid(struct tipc_msg *m)
{
return msg_word(m, 4);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index efc84845bb6b..0024a692f0f8 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1433,7 +1433,8 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
if (*zc && (out_iov || out_sg)) {
if (out_iov)
- n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
+ n_sgout = 1 +
+ iov_iter_npages_cap(out_iov, INT_MAX, data_len);
else
n_sgout = sg_nents(out_sg);
n_sgin = skb_nsg(skb, rxm->offset + prot->prepend_size,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c19569819866..3e0d6281fd1e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3240,49 +3240,58 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
return sk;
}
-static struct sock *unix_next_socket(struct seq_file *seq,
- struct sock *sk,
- loff_t *pos)
+static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
unsigned long bucket = get_bucket(*pos);
+ struct sock *sk;
- while (sk > (struct sock *)SEQ_START_TOKEN) {
- sk = sk_next(sk);
- if (!sk)
- goto next_bucket;
- if (sock_net(sk) == seq_file_net(seq))
- return sk;
- }
-
- do {
+ while (bucket < ARRAY_SIZE(unix_socket_table)) {
spin_lock(&unix_table_locks[bucket]);
+
sk = unix_from_bucket(seq, pos);
if (sk)
return sk;
-next_bucket:
- spin_unlock(&unix_table_locks[bucket++]);
- *pos = set_bucket_offset(bucket, 1);
- } while (bucket < ARRAY_SIZE(unix_socket_table));
+ spin_unlock(&unix_table_locks[bucket]);
+
+ *pos = set_bucket_offset(++bucket, 1);
+ }
return NULL;
}
+static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
+ loff_t *pos)
+{
+ unsigned long bucket = get_bucket(*pos);
+
+ for (sk = sk_next(sk); sk; sk = sk_next(sk))
+ if (sock_net(sk) == seq_file_net(seq))
+ return sk;
+
+ spin_unlock(&unix_table_locks[bucket]);
+
+ *pos = set_bucket_offset(++bucket, 1);
+
+ return unix_get_first(seq, pos);
+}
+
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
if (!*pos)
return SEQ_START_TOKEN;
- if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
- return NULL;
-
- return unix_next_socket(seq, NULL, pos);
+ return unix_get_first(seq, pos);
}
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
- return unix_next_socket(seq, v, pos);
+
+ if (v == SEQ_START_TOKEN)
+ return unix_get_first(seq, pos);
+
+ return unix_get_next(seq, v, pos);
}
static void unix_seq_stop(struct seq_file *seq, void *v)
@@ -3347,6 +3356,15 @@ static const struct seq_operations unix_seq_ops = {
};
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_unix_iter_state {
+ struct seq_net_private p;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ struct sock **batch;
+ bool st_bucket_done;
+};
+
struct bpf_iter__unix {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct unix_sock *, unix_sk);
@@ -3365,24 +3383,156 @@ static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
return bpf_iter_run_prog(prog, &ctx);
}
+static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
+{
+ struct bpf_unix_iter_state *iter = seq->private;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(start_sk);
+ iter->batch[iter->end_sk++] = start_sk;
+
+ for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
+ if (sock_net(sk) != seq_file_net(seq))
+ continue;
+
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++] = sk;
+ }
+
+ expected++;
+ }
+
+ spin_unlock(&unix_table_locks[start_sk->sk_hash]);
+
+ return expected;
+}
+
+static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
+{
+ while (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++]);
+}
+
+static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
+ unsigned int new_batch_sz)
+{
+ struct sock **new_batch;
+
+ new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
+ GFP_USER | __GFP_NOWARN);
+ if (!new_batch)
+ return -ENOMEM;
+
+ bpf_iter_unix_put_batch(iter);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
+ loff_t *pos)
+{
+ struct bpf_unix_iter_state *iter = seq->private;
+ unsigned int expected;
+ bool resized = false;
+ struct sock *sk;
+
+ if (iter->st_bucket_done)
+ *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
+
+again:
+ /* Get a new batch */
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ sk = unix_get_first(seq, pos);
+ if (!sk)
+ return NULL; /* Done */
+
+ expected = bpf_iter_unix_hold_batch(seq, sk);
+
+ if (iter->end_sk == expected) {
+ iter->st_bucket_done = true;
+ return sk;
+ }
+
+ if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
+ resized = true;
+ goto again;
+ }
+
+ return sk;
+}
+
+static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ /* bpf iter does not support lseek, so it always
+ * continues from where it was stop()-ped.
+ */
+ return bpf_iter_unix_batch(seq, pos);
+}
+
+static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_unix_iter_state *iter = seq->private;
+ struct sock *sk;
+
+ /* Whenever seq_next() is called, the iter->cur_sk is
+ * done with seq_show(), so advance to the next sk in
+ * the batch.
+ */
+ if (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++]);
+
+ ++*pos;
+
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk];
+ else
+ sk = bpf_iter_unix_batch(seq, pos);
+
+ return sk;
+}
+
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
struct sock *sk = v;
uid_t uid;
+ bool slow;
+ int ret;
if (v == SEQ_START_TOKEN)
return 0;
+ slow = lock_sock_fast(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
- return unix_prog_seq_show(prog, &meta, v, uid);
+ ret = unix_prog_seq_show(prog, &meta, v, uid);
+unlock:
+ unlock_sock_fast(sk, slow);
+ return ret;
}
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
+ struct bpf_unix_iter_state *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
@@ -3393,12 +3543,13 @@ static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
(void)unix_prog_seq_show(prog, &meta, v, 0);
}
- unix_seq_stop(seq, v);
+ if (iter->cur_sk < iter->end_sk)
+ bpf_iter_unix_put_batch(iter);
}
static const struct seq_operations bpf_iter_unix_seq_ops = {
- .start = unix_seq_start,
- .next = unix_seq_next,
+ .start = bpf_iter_unix_seq_start,
+ .next = bpf_iter_unix_seq_next,
.stop = bpf_iter_unix_seq_stop,
.show = bpf_iter_unix_seq_show,
};
@@ -3447,13 +3598,55 @@ static struct pernet_operations unix_net_ops = {
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
struct unix_sock *unix_sk, uid_t uid)
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct bpf_unix_iter_state *iter = priv_data;
+ int err;
+
+ err = bpf_iter_init_seq_net(priv_data, aux);
+ if (err)
+ return err;
+
+ err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
+ if (err) {
+ bpf_iter_fini_seq_net(priv_data);
+ return err;
+ }
+
+ return 0;
+}
+
+static void bpf_iter_fini_unix(void *priv_data)
+{
+ struct bpf_unix_iter_state *iter = priv_data;
+
+ bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
+}
+
static const struct bpf_iter_seq_info unix_seq_info = {
.seq_ops = &bpf_iter_unix_seq_ops,
- .init_seq_private = bpf_iter_init_seq_net,
- .fini_seq_private = bpf_iter_fini_seq_net,
- .seq_priv_size = sizeof(struct seq_net_private),
+ .init_seq_private = bpf_iter_init_unix,
+ .fini_seq_private = bpf_iter_fini_unix,
+ .seq_priv_size = sizeof(struct bpf_unix_iter_state),
};
+static const struct bpf_func_proto *
+bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_sk_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ return &bpf_sk_getsockopt_proto;
+ default:
+ return NULL;
+ }
+}
+
static struct bpf_iter_reg unix_reg_info = {
.target = "unix",
.ctx_arg_info_size = 1,
@@ -3461,6 +3654,7 @@ static struct bpf_iter_reg unix_reg_info = {
{ offsetof(struct bpf_iter__unix, unix_sk),
PTR_TO_BTF_ID_OR_NULL },
},
+ .get_func_proto = bpf_iter_unix_get_func_proto,
.seq_info = &unix_seq_info,
};
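
Wiring bpf_iter_unix_get_func_proto() into unix_reg_info makes bpf_setsockopt()/bpf_getsockopt() callable from a unix-socket iterator. A rough sketch of such a program, assuming the usual libbpf/vmlinux.h conventions; only the "iter/unix" context layout (see DEFINE_BPF_ITER_FUNC above) comes from this patch:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    /* plain socket-level constants, spelled out because vmlinux.h carries no
     * uapi defines
     */
    #define SOL_SOCKET	1
    #define SO_SNDBUF	7

    char _license[] SEC("license") = "GPL";

    SEC("iter/unix")
    int bump_unix_sndbuf(struct bpf_iter__unix *ctx)
    {
        struct unix_sock *unix_sk = ctx->unix_sk;
        int sndbuf = 1 << 20;

        if (!unix_sk)
            return 0;

        /* reachable from this iterator thanks to
         * bpf_iter_unix_get_func_proto()
         */
        bpf_setsockopt(unix_sk, SOL_SOCKET, SO_SNDBUF,
                       &sndbuf, sizeof(sndbuf));
        return 0;
    }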
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index 328cf54bda82..2bc647720cda 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -2,8 +2,6 @@
/*
* Copyright (C) 2018 - 2021 Intel Corporation
*/
-#ifndef __PMSR_H
-#define __PMSR_H
#include <net/cfg80211.h>
#include "core.h"
#include "nl80211.h"
@@ -661,5 +659,3 @@ void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid)
}
spin_unlock_bh(&wdev->pmsr_lock);
}
-
-#endif /* __PMSR_H */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index b888522f133b..b2fdac96bab0 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -700,8 +700,12 @@ static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap,
for (i = 0; i < request->n_ssids; i++) {
/* wildcard ssid in the scan request */
- if (!request->ssids[i].ssid_len)
+ if (!request->ssids[i].ssid_len) {
+ if (ap->multi_bss && !ap->transmitted_bssid)
+ continue;
+
return true;
+ }
if (ap->ssid_len &&
ap->ssid_len == request->ssids[i].ssid_len) {
@@ -827,6 +831,9 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
!cfg80211_find_ssid_match(ap, request))
continue;
+ if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid)
+ continue;
+
cfg80211_scan_req_add_chan(request, chan, true);
memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN);
scan_6ghz_params->short_ssid = ap->short_ssid;
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 41ea65deb6e1..e02f1702806e 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,7 +5,7 @@
* Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
*/
#include <linux/export.h>
#include <linux/bitops.h>
@@ -634,12 +634,14 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
if (likely((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) &&
tmp.h_proto != htons(ETH_P_AARP) &&
tmp.h_proto != htons(ETH_P_IPX)) ||
- ether_addr_equal(payload.hdr, bridge_tunnel_header)))
+ ether_addr_equal(payload.hdr, bridge_tunnel_header))) {
/* remove RFC1042 or Bridge-Tunnel encapsulation and
* replace EtherType */
hdrlen += ETH_ALEN + 2;
- else
+ skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2);
+ } else {
tmp.h_proto = htons(skb->len - hdrlen);
+ }
pskb_pull(skb, hdrlen);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 28ef3f4465ae..2abd64e4d589 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -343,9 +343,9 @@ out:
}
EXPORT_SYMBOL(xsk_tx_peek_desc);
-static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
- u32 max_entries)
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
{
+ struct xdp_desc *descs = pool->tx_descs;
u32 nb_pkts = 0;
while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
@@ -355,8 +355,7 @@ static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_d
return nb_pkts;
}
-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
- u32 max_entries)
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
{
struct xdp_sock *xs;
u32 nb_pkts;
@@ -365,7 +364,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *
if (!list_is_singular(&pool->xsk_tx_list)) {
/* Fallback to the non-batched version */
rcu_read_unlock();
- return xsk_tx_peek_release_fallback(pool, descs, max_entries);
+ return xsk_tx_peek_release_fallback(pool, max_entries);
}
xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
@@ -374,7 +373,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *
goto out;
}
- nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
+ nb_pkts = xskq_cons_peek_desc_batch(xs->tx, pool, max_entries);
if (!nb_pkts) {
xs->tx->queue_empty_descs++;
goto out;
@@ -386,7 +385,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *
* packets. This avoids having to implement any buffering in
* the Tx path.
*/
- nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
+ nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
if (!nb_pkts)
goto out;
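
With the descriptor array moved into the pool (pool->tx_descs, allocated in xp_create_and_assign_umem() below), zero-copy drivers only pass a budget to xsk_tx_peek_release_desc_batch() and read the descriptors back out of the pool. A driver-side sketch, where the ring structure and the posting helper are hypothetical:

    #include <linux/types.h>
    #include <net/xdp_sock_drv.h>

    struct foo_tx_ring;	/* hypothetical driver Tx ring */

    static void foo_post_tx_descriptor(struct foo_tx_ring *ring,
                                       dma_addr_t dma, u32 len)
    {
        /* hardware-specific descriptor programming would go here */
    }

    static void foo_xmit_zerocopy(struct foo_tx_ring *ring,
                                  struct xsk_buff_pool *pool, u32 budget)
    {
        u32 i, nb_pkts;

        /* no descriptor array argument any more, only a budget */
        nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);

        for (i = 0; i < nb_pkts; i++) {
            struct xdp_desc *desc = &pool->tx_descs[i];
            dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc->addr);

            foo_post_tx_descriptor(ring, dma, desc->len);
        }
    }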
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index fd39bb660ebc..b34fca6ada86 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -37,6 +37,7 @@ void xp_destroy(struct xsk_buff_pool *pool)
if (!pool)
return;
+ kvfree(pool->tx_descs);
kvfree(pool->heads);
kvfree(pool);
}
@@ -58,6 +59,12 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
if (!pool->heads)
goto out;
+ if (xs->tx) {
+ pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL);
+ if (!pool->tx_descs)
+ goto out;
+ }
+
pool->chunk_mask = ~((u64)umem->chunk_size - 1);
pool->addrs_cnt = umem->size;
pool->heads_cnt = umem->chunks;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index e9aa2c236356..801cda5d1938 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -205,11 +205,11 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
return false;
}
-static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
- struct xdp_desc *descs,
- struct xsk_buff_pool *pool, u32 max)
+static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+ u32 max)
{
u32 cached_cons = q->cached_cons, nb_entries = 0;
+ struct xdp_desc *descs = pool->tx_descs;
while (cached_cons != q->cached_prod && nb_entries < max) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
@@ -282,12 +282,12 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
return xskq_cons_read_desc(q, desc, pool);
}
-static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs,
- struct xsk_buff_pool *pool, u32 max)
+static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+ u32 max)
{
u32 entries = xskq_cons_nb_entries(q, max);
- return xskq_cons_read_desc_batch(q, descs, pool, entries);
+ return xskq_cons_read_desc_batch(q, pool, entries);
}
/* To improve performance in the xskq_cons_release functions, only update local state here.
@@ -304,13 +304,6 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
q->cached_cons += cnt;
}
-static inline bool xskq_cons_is_full(struct xsk_queue *q)
-{
- /* No barriers needed since data is not accessed */
- return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) ==
- q->nentries;
-}
-
static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
{
/* No barriers needed since data is not accessed */