summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/atm/lec.c2
-rw-r--r--net/atm/signaling.c2
-rw-r--r--net/batman-adv/bat_iv_ogm.c1
-rw-r--r--net/batman-adv/bat_v_elp.c1
-rw-r--r--net/batman-adv/bat_v_ogm.c1
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c2
-rw-r--r--net/batman-adv/fragmentation.c2
-rw-r--r--net/batman-adv/hard-interface.c19
-rw-r--r--net/batman-adv/hard-interface.h1
-rw-r--r--net/batman-adv/main.c1
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/multicast.c2
-rw-r--r--net/batman-adv/network-coding.c4
-rw-r--r--net/batman-adv/send.c2
-rw-r--r--net/batman-adv/soft-interface.c4
-rw-r--r--net/batman-adv/types.h4
-rw-r--r--net/bluetooth/hci_conn.c2
-rw-r--r--net/bluetooth/hci_core.c2
-rw-r--r--net/bpfilter/Kconfig1
-rw-r--r--net/bridge/br.c5
-rw-r--r--net/bridge/br_ioctl.c2
-rw-r--r--net/bridge/br_mdb.c258
-rw-r--r--net/bridge/br_multicast.c1290
-rw-r--r--net/bridge/br_netlink.c4
-rw-r--r--net/bridge/br_private.h70
-rw-r--r--net/bridge/br_vlan.c6
-rw-r--r--net/bridge/netfilter/ebt_stp.c1
-rw-r--r--net/caif/cfsrvl.c1
-rw-r--r--net/can/af_can.c4
-rw-r--r--net/can/bcm.c2
-rw-r--r--net/can/gw.c2
-rw-r--r--net/can/proc.c2
-rw-r--r--net/can/raw.c26
-rw-r--r--net/core/bpf_sk_storage.c833
-rw-r--r--net/core/datagram.c33
-rw-r--r--net/core/dev.c104
-rw-r--r--net/core/devlink.c179
-rw-r--r--net/core/filter.c416
-rw-r--r--net/core/net-procfs.c15
-rw-r--r--net/core/netpoll.c2
-rw-r--r--net/core/ptp_classifier.c30
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/core/skmsg.c34
-rw-r--r--net/core/sock.c14
-rw-r--r--net/core/sock_map.c91
-rw-r--r--net/core/sysctl_net_core.c17
-rw-r--r--net/dccp/ackvec.c2
-rw-r--r--net/dccp/ipv4.c8
-rw-r--r--net/dccp/timer.c2
-rw-r--r--net/dsa/dsa.c28
-rw-r--r--net/dsa/dsa2.c19
-rw-r--r--net/dsa/dsa_priv.h2
-rw-r--r--net/dsa/port.c91
-rw-r--r--net/dsa/slave.c191
-rw-r--r--net/dsa/switch.c41
-rw-r--r--net/dsa/tag_8021q.c158
-rw-r--r--net/dsa/tag_sja1105.c21
-rw-r--r--net/ethtool/channels.c2
-rw-r--r--net/ethtool/common.c2
-rw-r--r--net/ethtool/ioctl.c30
-rw-r--r--net/ethtool/linkmodes.c2
-rw-r--r--net/ethtool/pause.c63
-rw-r--r--net/hsr/hsr_debugfs.c21
-rw-r--r--net/ipv4/af_inet.c1
-rw-r--r--net/ipv4/cipso_ipv4.c2
-rw-r--r--net/ipv4/fou.c4
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/inet_diag.c17
-rw-r--r--net/ipv4/inet_hashtables.c6
-rw-r--r--net/ipv4/ip_options.c35
-rw-r--r--net/ipv4/ip_output.c11
-rw-r--r--net/ipv4/ip_sockglue.c5
-rw-r--r--net/ipv4/ipmr.c14
-rw-r--r--net/ipv4/nexthop.c66
-rw-r--r--net/ipv4/ping.c29
-rw-r--r--net/ipv4/raw.c5
-rw-r--r--net/ipv4/route.c19
-rw-r--r--net/ipv4/syncookies.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp.c42
-rw-r--r--net/ipv4/tcp_bpf.c13
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c151
-rw-r--r--net/ipv4/tcp_ipv4.c18
-rw-r--r--net/ipv4/tcp_minisocks.c1
-rw-r--r--net/ipv4/tcp_output.c194
-rw-r--r--net/ipv4/tcp_scalable.c2
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv4/udp_bpf.c9
-rw-r--r--net/ipv6/addrconf_core.c8
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/inet6_hashtables.c6
-rw-r--r--net/ipv6/ip6_fib.c12
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c39
-rw-r--r--net/ipv6/route.c3
-rw-r--r--net/ipv6/tcp_ipv6.c27
-rw-r--r--net/l2tp/Makefile2
-rw-r--r--net/l2tp/l2tp_core.c329
-rw-r--r--net/l2tp/l2tp_core.h33
-rw-r--r--net/l2tp/l2tp_debugfs.c4
-rw-r--r--net/l2tp/l2tp_eth.c13
-rw-r--r--net/l2tp/l2tp_ip.c17
-rw-r--r--net/l2tp/l2tp_ip6.c17
-rw-r--r--net/l2tp/l2tp_netlink.c18
-rw-r--r--net/l2tp/l2tp_ppp.c70
-rw-r--r--net/l2tp/trace.h211
-rw-r--r--net/mac80211/agg-rx.c2
-rw-r--r--net/mac80211/cfg.c110
-rw-r--r--net/mac80211/debugfs.c1
-rw-r--r--net/mac80211/driver-ops.h29
-rw-r--r--net/mac80211/ibss.c4
-rw-r--r--net/mac80211/ieee80211_i.h25
-rw-r--r--net/mac80211/iface.c997
-rw-r--r--net/mac80211/key.c15
-rw-r--r--net/mac80211/main.c2
-rw-r--r--net/mac80211/mesh.c6
-rw-r--r--net/mac80211/mesh_hwmp.c4
-rw-r--r--net/mac80211/mesh_ps.c2
-rw-r--r--net/mac80211/mlme.c56
-rw-r--r--net/mac80211/offchannel.c2
-rw-r--r--net/mac80211/rx.c11
-rw-r--r--net/mac80211/sta_info.h2
-rw-r--r--net/mac80211/status.c213
-rw-r--r--net/mac80211/trace.h33
-rw-r--r--net/mac80211/tx.c243
-rw-r--r--net/mac80211/vht.c4
-rw-r--r--net/mptcp/mib.c5
-rw-r--r--net/mptcp/mib.h5
-rw-r--r--net/mptcp/pm_netlink.c43
-rw-r--r--net/mptcp/protocol.c514
-rw-r--r--net/mptcp/protocol.h21
-rw-r--r--net/mptcp/subflow.c99
-rw-r--r--net/netfilter/ipvs/Kconfig1
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c7
-rw-r--r--net/netfilter/nf_conntrack_core.c25
-rw-r--r--net/netfilter/nf_conntrack_netlink.c5
-rw-r--r--net/netfilter/nf_conntrack_standalone.c4
-rw-r--r--net/netfilter/nf_tables_api.c57
-rw-r--r--net/netfilter/nft_socket.c27
-rw-r--r--net/netfilter/xt_HMARK.c2
-rw-r--r--net/netlabel/netlabel_calipso.c4
-rw-r--r--net/netlabel/netlabel_domainhash.c5
-rw-r--r--net/netlink/af_netlink.c16
-rw-r--r--net/netlink/genetlink.c2
-rw-r--r--net/netlink/policy.c33
-rw-r--r--net/nfc/digital_dep.c3
-rw-r--r--net/openvswitch/actions.c12
-rw-r--r--net/openvswitch/conntrack.c4
-rw-r--r--net/openvswitch/datapath.c46
-rw-r--r--net/openvswitch/flow_table.c70
-rw-r--r--net/openvswitch/flow_table.h1
-rw-r--r--net/openvswitch/vport.c7
-rw-r--r--net/packet/af_packet.c41
-rw-r--r--net/rds/cong.c2
-rw-r--r--net/rds/ib_cm.c2
-rw-r--r--net/rds/rdma.c2
-rw-r--r--net/rxrpc/af_rxrpc.c7
-rw-r--r--net/rxrpc/ar-internal.h70
-rw-r--r--net/rxrpc/call_object.c43
-rw-r--r--net/rxrpc/conn_client.c1088
-rw-r--r--net/rxrpc/conn_event.c14
-rw-r--r--net/rxrpc/conn_object.c12
-rw-r--r--net/rxrpc/conn_service.c7
-rw-r--r--net/rxrpc/local_object.c4
-rw-r--r--net/rxrpc/net_ns.c5
-rw-r--r--net/rxrpc/output.c6
-rw-r--r--net/rxrpc/proc.c2
-rw-r--r--net/rxrpc/rtt.c1
-rw-r--r--net/rxrpc/rxkad.c8
-rw-r--r--net/rxrpc/sysctl.c10
-rw-r--r--net/sched/act_api.c5
-rw-r--r--net/sched/act_ct.c8
-rw-r--r--net/sched/act_ctinfo.c5
-rw-r--r--net/sched/act_gate.c4
-rw-r--r--net/sctp/associola.c4
-rw-r--r--net/sctp/auth.c4
-rw-r--r--net/sctp/bind_addr.c2
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/protocol.c8
-rw-r--r--net/sctp/sm_make_chunk.c6
-rw-r--r--net/sctp/ulpqueue.c2
-rw-r--r--net/smc/af_smc.c218
-rw-r--r--net/smc/smc.h7
-rw-r--r--net/smc/smc_cdc.c4
-rw-r--r--net/smc/smc_clc.c243
-rw-r--r--net/smc/smc_clc.h93
-rw-r--r--net/smc/smc_close.c4
-rw-r--r--net/smc/smc_core.c56
-rw-r--r--net/smc/smc_core.h11
-rw-r--r--net/smc/smc_diag.c30
-rw-r--r--net/smc/smc_llc.c2
-rw-r--r--net/smc/smc_pnet.c5
-rw-r--r--net/smc/smc_tx.c10
-rw-r--r--net/socket.c8
-rw-r--r--net/sunrpc/sysctl.c6
-rw-r--r--net/tipc/core.c6
-rw-r--r--net/tipc/core.h8
-rw-r--r--net/tipc/crypto.c981
-rw-r--r--net/tipc/crypto.h43
-rw-r--r--net/tipc/link.c10
-rw-r--r--net/tipc/msg.c2
-rw-r--r--net/tipc/msg.h8
-rw-r--r--net/tipc/net.c20
-rw-r--r--net/tipc/net.h1
-rw-r--r--net/tipc/netlink.c2
-rw-r--r--net/tipc/node.c94
-rw-r--r--net/tipc/node.h2
-rw-r--r--net/tipc/socket.c3
-rw-r--r--net/tipc/sysctl.c9
-rw-r--r--net/tipc/topsrv.c1
-rw-r--r--net/tipc/udp_media.c1
-rw-r--r--net/tls/tls_main.c25
-rw-r--r--net/unix/af_unix.c1
-rw-r--r--net/wireless/chan.c130
-rw-r--r--net/wireless/core.h4
-rw-r--r--net/wireless/lib80211.c2
-rw-r--r--net/wireless/nl80211.c397
-rw-r--r--net/wireless/reg.c327
-rw-r--r--net/wireless/scan.c4
-rw-r--r--net/wireless/sme.c2
-rw-r--r--net/wireless/util.c32
-rw-r--r--net/wireless/wext-compat.c2
-rw-r--r--net/xdp/xdp_umem.c225
-rw-r--r--net/xdp/xdp_umem.h6
-rw-r--r--net/xdp/xsk.c213
-rw-r--r--net/xdp/xsk.h10
-rw-r--r--net/xdp/xsk_buff_pool.c380
-rw-r--r--net/xdp/xsk_diag.c16
-rw-r--r--net/xdp/xsk_queue.h12
-rw-r--r--net/xdp/xskmap.c8
231 files changed, 8533 insertions, 4971 deletions
diff --git a/net/atm/lec.c b/net/atm/lec.c
index b570ef919c28..dbabb65d8b67 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1070,7 +1070,7 @@ module_exit(lane_module_cleanup);
/*
* LANE2: 3.1.3, LE_RESOLVE.request
* Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs.
- * If sizeoftlvs == NULL the default TLVs associated with with this
+ * If sizeoftlvs == NULL the default TLVs associated with this
* lec will be used.
* If dst_mac == NULL, targetless LE_ARP will be sent
*/
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index fbd0c5e7b299..5de06ab8ed75 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -52,7 +52,7 @@ static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
msg->type = as_okay;
}
/*
- * Should probably just turn around the old skb. But the, the buffer
+ * Should probably just turn around the old skb. But then, the buffer
* space accounting needs to follow the change too. Maybe later.
*/
while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index a4faf5f904d9..206d0b424712 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -27,6 +27,7 @@
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/pkt_sched.h>
+#include <linux/prandom.h>
#include <linux/printk.h>
#include <linux/random.h>
#include <linux/rculist.h>
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index d35aca0e969a..79a7dfc32e76 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -20,6 +20,7 @@
#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/nl80211.h>
+#include <linux/prandom.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 717fe657561d..8c1148fc73d7 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -20,6 +20,7 @@
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
+#include <linux/prandom.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index c350ab63cd54..ba0027d1f2df 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1863,7 +1863,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work);
- /* backbone_gw is unreferenced in the report work function function
+ /* backbone_gw is unreferenced in the report work function
* if queue_work() call was successful
*/
if (!ret)
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 9fdbe3068153..9a47ef8b95c4 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -306,7 +306,7 @@ free:
* set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
* to NULL; 3) Error: Return false and free skb.
*
- * Return: true when the packet is merged or buffered, false when skb is not not
+ * Return: true when the packet is merged or buffered, false when skb is not
* used.
*/
bool batadv_frag_skb_buffer(struct sk_buff **skb,
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index fa06b51c0144..dad99641df2a 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -599,7 +599,7 @@ out:
/* report to the other components the maximum amount of bytes that
* batman-adv can send over the wire (without considering the payload
* overhead). For example, this value is used by TT to compute the
- * maximum local table table size
+ * maximum local table size
*/
atomic_set(&bat_priv->packet_size_max, min_mtu);
@@ -977,23 +977,6 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
}
/**
- * batadv_hardif_remove_interfaces() - Remove all hard interfaces
- */
-void batadv_hardif_remove_interfaces(void)
-{
- struct batadv_hard_iface *hard_iface, *hard_iface_tmp;
-
- rtnl_lock();
- list_for_each_entry_safe(hard_iface, hard_iface_tmp,
- &batadv_hardif_list, list) {
- list_del_rcu(&hard_iface->list);
- batadv_hardif_generation++;
- batadv_hardif_remove_interface(hard_iface);
- }
- rtnl_unlock();
-}
-
-/**
* batadv_hard_if_event_softif() - Handle events for soft interfaces
* @event: NETDEV_* event to handle
* @net_dev: net_device which generated an event
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index bad2e50135e8..b1855d9d0b06 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -100,7 +100,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
struct net *net, const char *iface_name);
void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
enum batadv_hard_if_cleanup autodel);
-void batadv_hardif_remove_interfaces(void);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
void batadv_hardif_release(struct kref *ref);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 519c08c2cfba..70fee9b42e25 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -137,7 +137,6 @@ static void __exit batadv_exit(void)
batadv_netlink_unregister();
rtnl_link_unregister(&batadv_link_ops);
unregister_netdevice_notifier(&batadv_hard_if_notifier);
- batadv_hardif_remove_interfaces();
flush_workqueue(batadv_event_workqueue);
destroy_workqueue(batadv_event_workqueue);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 0393bb9ed3d0..a47dc332d796 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2020.3"
+#define BATADV_SOURCE_VERSION "2020.4"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index ca24a2e522b7..0746fe2c2c04 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -208,7 +208,7 @@ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
/* TODO: ask the bridge if a multicast router is present (the bridge
- * is capable of performing proper RFC4286 multicast multicast router
+ * is capable of performing proper RFC4286 multicast router
* discovery) instead of searching for a ff02::2 listener here
*/
ret = br_multicast_list_adjacent(dev, &bridge_mcast_list);
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 48d707850f3e..61ddd6d709a0 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -26,8 +26,8 @@
#include <linux/lockdep.h>
#include <linux/net.h>
#include <linux/netdevice.h>
+#include <linux/prandom.h>
#include <linux/printk.h>
-#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
@@ -250,7 +250,7 @@ static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
/**
* batadv_nc_packet_free() - frees nc packet
* @nc_packet: the nc packet to free
- * @dropped: whether the packet is freed because is is dropped
+ * @dropped: whether the packet is freed because is dropped
*/
static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
bool dropped)
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index d267b94800d6..87017332b567 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -461,7 +461,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
/**
* batadv_forw_packet_free() - free a forwarding packet
* @forw_packet: The packet to free
- * @dropped: whether the packet is freed because is is dropped
+ * @dropped: whether the packet is freed because is dropped
*
* This frees a forwarding packet and releases any resources it might
* have claimed.
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index cdde943c1b83..82e7ca886605 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -648,7 +648,7 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
/**
* batadv_interface_add_vid() - ndo_add_vid API implementation
* @dev: the netdev of the mesh interface
- * @proto: protocol of the the vlan id
+ * @proto: protocol of the vlan id
* @vid: identifier of the new vlan
*
* Set up all the internal structures for handling the new vlan on top of the
@@ -706,7 +706,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
/**
* batadv_interface_kill_vid() - ndo_kill_vid API implementation
* @dev: the netdev of the mesh interface
- * @proto: protocol of the the vlan id
+ * @proto: protocol of the vlan id
* @vid: identifier of the deleted vlan
*
* Destroy all the internal structures used to handle the vlan identified by vid
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index ed519efa3c36..965336a3b89d 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1492,7 +1492,7 @@ struct batadv_tp_vars {
/** @unacked_lock: protect unacked_list */
spinlock_t unacked_lock;
- /** @last_recv_time: time time (jiffies) a msg was received */
+ /** @last_recv_time: time (jiffies) a msg was received */
unsigned long last_recv_time;
/** @refcount: number of context where the object is used */
@@ -1996,7 +1996,7 @@ struct batadv_tt_change_node {
*/
struct batadv_tt_req_node {
/**
- * @addr: mac address address of the originator this request was sent to
+ * @addr: mac address of the originator this request was sent to
*/
u8 addr[ETH_ALEN];
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 9832f8445d43..d0c1024bf600 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1388,7 +1388,7 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
return 0;
}
-/* Encrypt the the link */
+/* Encrypt the link */
static void hci_conn_encrypt(struct hci_conn *conn)
{
BT_DBG("hcon %p", conn);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 68bfe57b6625..b0209e35284a 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -808,7 +808,7 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
* Delete Stored Link Key command. They are clearly indicating its
* absence in the bit mask of supported commands.
*
- * Check the supported commands and only if the the command is marked
+ * Check the supported commands and only if the command is marked
* as supported send it. If not supported assume that the controller
* does not have actual support for stored link keys which makes this
* command redundant anyway.
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index 73d0b12789f1..8ad0233ce497 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -2,6 +2,7 @@
menuconfig BPFILTER
bool "BPF based packet filtering framework (BPFILTER)"
depends on NET && BPF && INET
+ select USERMODE_DRIVER
help
This builds experimental bpfilter framework that is aiming to
provide netfilter compatible functionality via BPF
diff --git a/net/bridge/br.c b/net/bridge/br.c
index b6fe30e3768f..401eeb9142eb 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -183,6 +183,11 @@ static int br_switchdev_event(struct notifier_block *unused,
br_fdb_offloaded_set(br, p, fdb_info->addr,
fdb_info->vid, fdb_info->offloaded);
break;
+ case SWITCHDEV_FDB_FLUSH_TO_BRIDGE:
+ fdb_info = ptr;
+ /* Don't delete static entries */
+ br_fdb_delete_by_port(br, p, fdb_info->vid, 0);
+ break;
}
out:
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 5e71fc8b826f..2db800fc27ca 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -103,7 +103,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
/*
* Legacy ioctl's through SIOCDEVPRIVATE
- * This interface is deprecated because it was too difficult to
+ * This interface is deprecated because it was too difficult
* to do the translation for 32/64bit ioctl compatibility.
*/
static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index da5ed4cf9233..00f1651a6aba 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -77,10 +77,67 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
#endif
}
+static int __mdb_fill_srcs(struct sk_buff *skb,
+ struct net_bridge_port_group *p)
+{
+ struct net_bridge_group_src *ent;
+ struct nlattr *nest, *nest_ent;
+
+ if (hlist_empty(&p->src_list))
+ return 0;
+
+ nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(ent, &p->src_list, node,
+ lockdep_is_held(&p->port->br->multicast_lock)) {
+ nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY);
+ if (!nest_ent)
+ goto out_cancel_err;
+ switch (ent->addr.proto) {
+ case htons(ETH_P_IP):
+ if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ ent->addr.u.ip4)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ &ent->addr.u.ip6)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#endif
+ default:
+ nla_nest_cancel(skb, nest_ent);
+ continue;
+ }
+ if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER,
+ br_timer_value(&ent->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ nla_nest_end(skb, nest_ent);
+ }
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+out_cancel_err:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static int __mdb_fill_info(struct sk_buff *skb,
struct net_bridge_mdb_entry *mp,
struct net_bridge_port_group *p)
{
+ bool dump_srcs_mode = false;
struct timer_list *mtimer;
struct nlattr *nest_ent;
struct br_mdb_entry e;
@@ -119,6 +176,23 @@ static int __mdb_fill_info(struct sk_buff *skb,
nla_nest_cancel(skb, nest_ent);
return -EMSGSIZE;
}
+ switch (mp->addr.proto) {
+ case htons(ETH_P_IP):
+ dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2);
+ break;
+#endif
+ }
+ if (dump_srcs_mode &&
+ (__mdb_fill_srcs(skb, p) ||
+ nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) {
+ nla_nest_cancel(skb, nest_ent);
+ return -EMSGSIZE;
+ }
+
nla_nest_end(skb, nest_ent);
return 0;
@@ -127,7 +201,7 @@ static int __mdb_fill_info(struct sk_buff *skb,
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
- int idx = 0, s_idx = cb->args[1], err = 0;
+ int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2];
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_mdb_entry *mp;
struct nlattr *nest, *nest2;
@@ -152,7 +226,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
break;
}
- if (mp->host_joined) {
+ if (!s_pidx && mp->host_joined) {
err = __mdb_fill_info(skb, mp, NULL);
if (err) {
nla_nest_cancel(skb, nest2);
@@ -164,13 +238,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
pp = &p->next) {
if (!p->port)
continue;
+ if (pidx < s_pidx)
+ goto skip_pg;
err = __mdb_fill_info(skb, mp, p);
if (err) {
- nla_nest_cancel(skb, nest2);
+ nla_nest_end(skb, nest2);
goto out;
}
+skip_pg:
+ pidx++;
}
+ pidx = 0;
+ s_pidx = 0;
nla_nest_end(skb, nest2);
skip:
idx++;
@@ -178,6 +258,7 @@ skip:
out:
cb->args[1] = idx;
+ cb->args[2] = pidx;
nla_nest_end(skb, nest);
return err;
}
@@ -263,14 +344,15 @@ out:
static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
struct net_device *dev,
- struct br_mdb_entry *entry, u32 pid,
- u32 seq, int type, unsigned int flags)
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
struct nlmsghdr *nlh;
struct br_port_msg *bpm;
struct nlattr *nest, *nest2;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
+ nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0);
if (!nlh)
return -EMSGSIZE;
@@ -285,7 +367,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
if (nest2 == NULL)
goto end;
- if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(*entry), entry))
+ if (__mdb_fill_info(skb, mp, pg))
goto end;
nla_nest_end(skb, nest2);
@@ -300,10 +382,49 @@ cancel:
return -EMSGSIZE;
}
-static inline size_t rtnl_mdb_nlmsg_size(void)
+static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg)
{
- return NLMSG_ALIGN(sizeof(struct br_port_msg))
- + nla_total_size(sizeof(struct br_mdb_entry));
+ size_t nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+ nla_total_size(sizeof(struct br_mdb_entry)) +
+ nla_total_size(sizeof(u32));
+ struct net_bridge_group_src *ent;
+ size_t addr_size = 0;
+
+ if (!pg)
+ goto out;
+
+ switch (pg->addr.proto) {
+ case htons(ETH_P_IP):
+ if (pg->port->br->multicast_igmp_version == 2)
+ goto out;
+ addr_size = sizeof(__be32);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (pg->port->br->multicast_mld_version == 1)
+ goto out;
+ addr_size = sizeof(struct in6_addr);
+ break;
+#endif
+ }
+
+ /* MDBA_MDB_EATTR_GROUP_MODE */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ /* MDBA_MDB_EATTR_SRC_LIST nested attr */
+ if (!hlist_empty(&pg->src_list))
+ nlmsg_size += nla_total_size(0);
+
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ /* MDBA_MDB_SRCLIST_ENTRY nested attr +
+ * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER
+ */
+ nlmsg_size += nla_total_size(0) +
+ nla_total_size(addr_size) +
+ nla_total_size(sizeof(u32));
+ }
+out:
+ return nlmsg_size;
}
struct br_mdb_complete_info {
@@ -341,21 +462,22 @@ err:
static void br_mdb_switchdev_host_port(struct net_device *dev,
struct net_device *lower_dev,
- struct br_mdb_entry *entry, int type)
+ struct net_bridge_mdb_entry *mp,
+ int type)
{
struct switchdev_obj_port_mdb mdb = {
.obj = {
.id = SWITCHDEV_OBJ_ID_HOST_MDB,
.flags = SWITCHDEV_F_DEFER,
},
- .vid = entry->vid,
+ .vid = mp->addr.vid,
};
- if (entry->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.u.ip4, mdb.addr);
#if IS_ENABLED(CONFIG_IPV6)
else
- ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ ipv6_eth_mc_map(&mp->addr.u.ip6, mdb.addr);
#endif
mdb.obj.orig_dev = dev;
@@ -370,17 +492,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
}
static void br_mdb_switchdev_host(struct net_device *dev,
- struct br_mdb_entry *entry, int type)
+ struct net_bridge_mdb_entry *mp, int type)
{
struct net_device *lower_dev;
struct list_head *iter;
netdev_for_each_lower_dev(dev, lower_dev, iter)
- br_mdb_switchdev_host_port(dev, lower_dev, entry, type);
+ br_mdb_switchdev_host_port(dev, lower_dev, mp, type);
}
-static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
- struct br_mdb_entry *entry, int type)
+void br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
struct br_mdb_complete_info *complete_info;
struct switchdev_obj_port_mdb mdb = {
@@ -388,44 +512,45 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
.id = SWITCHDEV_OBJ_ID_PORT_MDB,
.flags = SWITCHDEV_F_DEFER,
},
- .vid = entry->vid,
+ .vid = mp->addr.vid,
};
- struct net_device *port_dev;
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- port_dev = __dev_get_by_index(net, entry->ifindex);
- if (entry->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ if (pg) {
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.u.ip4, mdb.addr);
#if IS_ENABLED(CONFIG_IPV6)
- else
- ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ else
+ ipv6_eth_mc_map(&mp->addr.u.ip6, mdb.addr);
#endif
-
- mdb.obj.orig_dev = port_dev;
- if (p && port_dev && type == RTM_NEWMDB) {
- complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
- if (complete_info) {
- complete_info->port = p;
- __mdb_entry_to_br_ip(entry, &complete_info->ip);
+ mdb.obj.orig_dev = pg->port->dev;
+ switch (type) {
+ case RTM_NEWMDB:
+ complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
+ if (!complete_info)
+ break;
+ complete_info->port = pg->port;
+ complete_info->ip = mp->addr;
mdb.obj.complete_priv = complete_info;
mdb.obj.complete = br_mdb_complete;
- if (switchdev_port_obj_add(port_dev, &mdb.obj, NULL))
+ if (switchdev_port_obj_add(pg->port->dev, &mdb.obj, NULL))
kfree(complete_info);
+ break;
+ case RTM_DELMDB:
+ switchdev_port_obj_del(pg->port->dev, &mdb.obj);
+ break;
}
- } else if (p && port_dev && type == RTM_DELMDB) {
- switchdev_port_obj_del(port_dev, &mdb.obj);
+ } else {
+ br_mdb_switchdev_host(dev, mp, type);
}
- if (!p)
- br_mdb_switchdev_host(dev, entry, type);
-
- skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
+ skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC);
if (!skb)
goto errout;
- err = nlmsg_populate_mdb_fill(skb, dev, entry, 0, 0, type, NTF_SELF);
+ err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -437,26 +562,6 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 flags)
-{
- struct br_mdb_entry entry;
-
- memset(&entry, 0, sizeof(entry));
- if (port)
- entry.ifindex = port->dev->ifindex;
- else
- entry.ifindex = dev->ifindex;
- entry.addr.proto = group->proto;
- entry.addr.u.ip4 = group->u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- entry.addr.u.ip6 = group->u.ip6;
-#endif
- entry.vid = group->vid;
- __mdb_entry_fill_flags(&entry, flags);
- __br_mdb_notify(dev, port, &entry, type);
-}
-
static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
struct net_device *dev,
int ifindex, u32 pid,
@@ -600,7 +705,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
}
static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group, unsigned char state)
+ struct br_ip *group, struct br_mdb_entry *entry)
{
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
@@ -619,12 +724,13 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
/* host join */
if (!port) {
/* don't allow any flags for host-joined groups */
- if (state)
+ if (entry->state)
return -EINVAL;
if (mp->host_joined)
return -EEXIST;
br_multicast_host_join(mp, false);
+ br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB);
return 0;
}
@@ -638,12 +744,14 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
break;
}
- p = br_multicast_new_port_group(port, group, *pp, state, NULL);
+ p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL,
+ MCAST_EXCLUDE);
if (unlikely(!p))
return -ENOMEM;
rcu_assign_pointer(*pp, p);
- if (state == MDB_TEMPORARY)
+ if (entry->state == MDB_TEMPORARY)
mod_timer(&p->timer, now + br->multicast_membership_interval);
+ br_mdb_notify(br->dev, mp, p, RTM_NEWMDB);
return 0;
}
@@ -672,7 +780,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
__mdb_entry_to_br_ip(entry, &ip);
spin_lock_bh(&br->multicast_lock);
- ret = br_mdb_add_group(br, p, &ip, entry->state);
+ ret = br_mdb_add_group(br, p, &ip, entry);
spin_unlock_bh(&br->multicast_lock);
return ret;
}
@@ -717,12 +825,9 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
err = __br_mdb_add(net, br, entry);
if (err)
break;
- __br_mdb_notify(dev, p, entry, RTM_NEWMDB);
}
} else {
err = __br_mdb_add(net, br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_NEWMDB);
}
return err;
@@ -750,6 +855,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) {
br_multicast_host_leave(mp, false);
err = 0;
+ br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB);
if (!mp->ports && netif_running(br->dev))
mod_timer(&mp->timer, jiffies);
goto unlock;
@@ -764,16 +870,8 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (p->port->state == BR_STATE_DISABLED)
goto unlock;
- __mdb_entry_fill_flags(entry, p->flags);
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- kfree_rcu(p, rcu);
+ br_multicast_del_pg(mp, p, pp);
err = 0;
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
break;
}
@@ -820,13 +918,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_DELMDB);
}
} else {
err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_DELMDB);
}
return err;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4c4a93abde68..e77f1e27caf7 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -50,6 +50,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
__be32 group,
__u16 vid,
const unsigned char *src);
+static void br_multicast_port_group_rexmit(struct timer_list *t);
static void __del_port_router(struct net_bridge_port *p);
#if IS_ENABLED(CONFIG_IPV6)
@@ -139,6 +140,29 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
return br_mdb_ip_get_rcu(br, &ip);
}
+static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_mdb_entry *mp;
+
+ mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc);
+ WARN_ON(!hlist_unhashed(&mp->mdb_node));
+ WARN_ON(mp->ports);
+
+ del_timer_sync(&mp->timer);
+ kfree_rcu(mp, rcu);
+}
+
+static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp)
+{
+ struct net_bridge *br = mp->br;
+
+ rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ hlist_del_init_rcu(&mp->mdb_node);
+ hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
static void br_multicast_group_expired(struct timer_list *t)
{
struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer);
@@ -152,23 +176,71 @@ static void br_multicast_group_expired(struct timer_list *t)
if (mp->ports)
goto out;
+ br_multicast_del_mdb_entry(mp);
+out:
+ spin_unlock(&br->multicast_lock);
+}
- rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
- br_mdb_rht_params);
- hlist_del_rcu(&mp->mdb_node);
+static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_group_src *src;
- kfree_rcu(mp, rcu);
+ src = container_of(gc, struct net_bridge_group_src, mcast_gc);
+ WARN_ON(!hlist_unhashed(&src->node));
-out:
- spin_unlock(&br->multicast_lock);
+ del_timer_sync(&src->timer);
+ kfree_rcu(src, rcu);
}
-static void br_multicast_del_pg(struct net_bridge *br,
- struct net_bridge_port_group *pg)
+static void br_multicast_del_group_src(struct net_bridge_group_src *src)
{
+ struct net_bridge *br = src->pg->port->br;
+
+ hlist_del_init_rcu(&src->node);
+ src->pg->src_ents--;
+ hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
+static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_port_group *pg;
+
+ pg = container_of(gc, struct net_bridge_port_group, mcast_gc);
+ WARN_ON(!hlist_unhashed(&pg->mglist));
+ WARN_ON(!hlist_empty(&pg->src_list));
+
+ del_timer_sync(&pg->rexmit_timer);
+ del_timer_sync(&pg->timer);
+ kfree_rcu(pg, rcu);
+}
+
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp)
+{
+ struct net_bridge *br = pg->port->br;
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+
+ rcu_assign_pointer(*pp, pg->next);
+ hlist_del_init(&pg->mglist);
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ br_multicast_del_group_src(ent);
+ br_mdb_notify(br->dev, mp, pg, RTM_DELMDB);
+ hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+
+ if (!mp->ports && !mp->host_joined && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+}
+
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port_group __rcu **pp;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
- struct net_bridge_port_group __rcu **pp;
mp = br_mdb_ip_get(br, &pg->addr);
if (WARN_ON(!mp))
@@ -180,17 +252,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
if (p != pg)
continue;
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
- p->flags);
- kfree_rcu(p, rcu);
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
-
+ br_multicast_del_pg(mp, pg, pp);
return;
}
@@ -200,35 +262,95 @@ static void br_multicast_del_pg(struct net_bridge *br,
static void br_multicast_port_group_expired(struct timer_list *t)
{
struct net_bridge_port_group *pg = from_timer(pg, t, timer);
+ struct net_bridge_group_src *src_ent;
struct net_bridge *br = pg->port->br;
+ struct hlist_node *tmp;
+ bool changed;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT)
goto out;
- br_multicast_del_pg(br, pg);
+ changed = !!(pg->filter_mode == MCAST_EXCLUDE);
+ pg->filter_mode = MCAST_INCLUDE;
+ hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) {
+ if (!timer_pending(&src_ent->timer)) {
+ br_multicast_del_group_src(src_ent);
+ changed = true;
+ }
+ }
+
+ if (hlist_empty(&pg->src_list)) {
+ br_multicast_find_del_pg(br, pg);
+ } else if (changed) {
+ struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->addr);
+ if (WARN_ON(!mp))
+ goto out;
+ br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB);
+ }
out:
spin_unlock(&br->multicast_lock);
}
-static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
- __be32 group,
- u8 *igmp_type)
+static void br_multicast_gc(struct hlist_head *head)
{
+ struct net_bridge_mcast_gc *gcent;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(gcent, tmp, head, gc_node) {
+ hlist_del_init(&gcent->gc_node);
+ gcent->destroy(gcent);
+ }
+}
+
+static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
+ struct net_bridge_port_group *pg,
+ __be32 ip_dst, __be32 group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ struct net_bridge_port *p = pg ? pg->port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, igmp_hdr_size;
+ unsigned long now = jiffies;
struct igmpv3_query *ihv3;
- size_t igmp_hdr_size;
+ void *csum_start = NULL;
+ __sum16 *csum = NULL;
struct sk_buff *skb;
struct igmphdr *ih;
struct ethhdr *eth;
+ unsigned long lmqt;
struct iphdr *iph;
+ u16 lmqt_srcs = 0;
igmp_hdr_size = sizeof(*ih);
- if (br->multicast_igmp_version == 3)
+ if (br->multicast_igmp_version == 3) {
igmp_hdr_size = sizeof(*ihv3);
- skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
- igmp_hdr_size + 4);
+ if (pg && with_srcs) {
+ lmqt = now + (br->multicast_last_member_interval *
+ br->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ lmqt_srcs++;
+ }
+
+ if (!lmqt_srcs)
+ return NULL;
+ igmp_hdr_size += lmqt_srcs * sizeof(__be32);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(br->dev, pkt_size);
if (!skb)
goto out;
@@ -238,29 +360,24 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
eth = eth_hdr(skb);
ether_addr_copy(eth->h_source, br->dev->dev_addr);
- eth->h_dest[0] = 1;
- eth->h_dest[1] = 0;
- eth->h_dest[2] = 0x5e;
- eth->h_dest[3] = 0;
- eth->h_dest[4] = 0;
- eth->h_dest[5] = 1;
+ ip_eth_mc_map(ip_dst, eth->h_dest);
eth->h_proto = htons(ETH_P_IP);
skb_put(skb, sizeof(*eth));
skb_set_network_header(skb, skb->len);
iph = ip_hdr(skb);
+ iph->tot_len = htons(pkt_size - sizeof(*eth));
iph->version = 4;
iph->ihl = 6;
iph->tos = 0xc0;
- iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->protocol = IPPROTO_IGMP;
iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
- iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
+ iph->daddr = ip_dst;
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
@@ -280,7 +397,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
(HZ / IGMP_TIMER_SCALE);
ih->group = group;
ih->csum = 0;
- ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
+ csum = &ih->csum;
+ csum_start = (void *)ih;
break;
case 3:
ihv3 = igmpv3_query_hdr(skb);
@@ -290,15 +408,40 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
(HZ / IGMP_TIMER_SCALE);
ihv3->group = group;
ihv3->qqic = br->multicast_query_interval / HZ;
- ihv3->nsrcs = 0;
+ ihv3->nsrcs = htons(lmqt_srcs);
ihv3->resv = 0;
- ihv3->suppress = 0;
+ ihv3->suppress = sflag;
ihv3->qrv = 2;
ihv3->csum = 0;
- ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
+ csum = &ihv3->csum;
+ csum_start = (void *)ihv3;
+ if (!pg || !with_srcs)
+ break;
+
+ lmqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ ihv3->srcs[lmqt_srcs++] = ent->addr.u.ip4;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
break;
}
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = ip_compute_csum(csum_start, igmp_hdr_size);
skb_put(skb, igmp_hdr_size);
__skb_pull(skb, sizeof(*eth));
@@ -308,23 +451,54 @@ out:
#if IS_ENABLED(CONFIG_IPV6)
static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
- const struct in6_addr *grp,
- u8 *igmp_type)
-{
+ struct net_bridge_port_group *pg,
+ const struct in6_addr *ip6_dst,
+ const struct in6_addr *group,
+ bool with_srcs, bool over_llqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
+{
+ struct net_bridge_port *p = pg ? pg->port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, mld_hdr_size;
+ unsigned long now = jiffies;
struct mld2_query *mld2q;
+ void *csum_start = NULL;
unsigned long interval;
+ __sum16 *csum = NULL;
struct ipv6hdr *ip6h;
struct mld_msg *mldq;
- size_t mld_hdr_size;
struct sk_buff *skb;
+ unsigned long llqt;
struct ethhdr *eth;
+ u16 llqt_srcs = 0;
u8 *hopopt;
mld_hdr_size = sizeof(*mldq);
- if (br->multicast_mld_version == 2)
+ if (br->multicast_mld_version == 2) {
mld_hdr_size = sizeof(*mld2q);
- skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
- 8 + mld_hdr_size);
+ if (pg && with_srcs) {
+ llqt = now + (br->multicast_last_member_interval *
+ br->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ llqt_srcs++;
+ }
+
+ if (!llqt_srcs)
+ return NULL;
+ mld_hdr_size += llqt_srcs * sizeof(struct in6_addr);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(br->dev, pkt_size);
if (!skb)
goto out;
@@ -346,7 +520,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h->payload_len = htons(8 + mld_hdr_size);
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
- ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
+ ip6h->daddr = *ip6_dst;
if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
&ip6h->saddr)) {
kfree_skb(skb);
@@ -371,7 +545,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
/* ICMPv6 */
skb_set_transport_header(skb, skb->len);
- interval = ipv6_addr_any(grp) ?
+ interval = ipv6_addr_any(group) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
*igmp_type = ICMPV6_MGM_QUERY;
@@ -383,12 +557,9 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
mldq->mld_cksum = 0;
mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
mldq->mld_reserved = 0;
- mldq->mld_mca = *grp;
- mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mldq), IPPROTO_ICMPV6,
- csum_partial(mldq,
- sizeof(*mldq),
- 0));
+ mldq->mld_mca = *group;
+ csum = &mldq->mld_cksum;
+ csum_start = (void *)mldq;
break;
case 2:
mld2q = (struct mld2_query *)icmp6_hdr(skb);
@@ -398,21 +569,43 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
mld2q->mld2q_cksum = 0;
mld2q->mld2q_resv1 = 0;
mld2q->mld2q_resv2 = 0;
- mld2q->mld2q_suppress = 0;
+ mld2q->mld2q_suppress = sflag;
mld2q->mld2q_qrv = 2;
- mld2q->mld2q_nsrcs = 0;
+ mld2q->mld2q_nsrcs = htons(llqt_srcs);
mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
- mld2q->mld2q_mca = *grp;
- mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mld2q),
- IPPROTO_ICMPV6,
- csum_partial(mld2q,
- sizeof(*mld2q),
- 0));
+ mld2q->mld2q_mca = *group;
+ csum = &mld2q->mld2q_cksum;
+ csum_start = (void *)mld2q;
+ if (!pg || !with_srcs)
+ break;
+
+ llqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.u.ip6;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
break;
}
- skb_put(skb, mld_hdr_size);
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size,
+ IPPROTO_ICMPV6,
+ csum_partial(csum_start, mld_hdr_size, 0));
+ skb_put(skb, mld_hdr_size);
__skb_pull(skb, sizeof(*eth));
out:
@@ -421,16 +614,39 @@ out:
#endif
static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
- struct br_ip *addr,
- u8 *igmp_type)
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
{
- switch (addr->proto) {
+ __be32 ip4_dst;
+
+ switch (group->proto) {
case htons(ETH_P_IP):
- return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type);
+ ip4_dst = ip_dst ? ip_dst->u.ip4 : htonl(INADDR_ALLHOSTS_GROUP);
+ return br_ip4_multicast_alloc_query(br, pg,
+ ip4_dst, group->u.ip4,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return br_ip6_multicast_alloc_query(br, &addr->u.ip6,
- igmp_type);
+ case htons(ETH_P_IPV6): {
+ struct in6_addr ip6_dst;
+
+ if (ip_dst)
+ ip6_dst = ip_dst->u.ip6;
+ else
+ ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0,
+ htonl(1));
+
+ return br_ip6_multicast_alloc_query(br, pg,
+ &ip6_dst, &group->u.ip6,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
+ }
#endif
}
return NULL;
@@ -457,6 +673,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
mp->br = br;
mp->addr = *group;
+ mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry;
timer_setup(&mp->timer, br_multicast_group_expired, 0);
err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode,
br_mdb_rht_params);
@@ -470,12 +687,97 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
return mp;
}
+static void br_multicast_group_src_expired(struct timer_list *t)
+{
+ struct net_bridge_group_src *src = from_timer(src, t, timer);
+ struct net_bridge_port_group *pg;
+ struct net_bridge *br = src->br;
+
+ spin_lock(&br->multicast_lock);
+ if (hlist_unhashed(&src->node) || !netif_running(br->dev) ||
+ timer_pending(&src->timer))
+ goto out;
+
+ pg = src->pg;
+ if (pg->filter_mode == MCAST_INCLUDE) {
+ br_multicast_del_group_src(src);
+ if (!hlist_empty(&pg->src_list))
+ goto out;
+ br_multicast_find_del_pg(br, pg);
+ }
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static struct net_bridge_group_src *
+br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip)
+{
+ struct net_bridge_group_src *ent;
+
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (ip->u.ip4 == ent->addr.u.ip4)
+ return ent;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (!ipv6_addr_cmp(&ent->addr.u.ip6, &ip->u.ip6))
+ return ent;
+ break;
+#endif
+ }
+
+ return NULL;
+}
+
+static struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip)
+{
+ struct net_bridge_group_src *grp_src;
+
+ if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT))
+ return NULL;
+
+ switch (src_ip->proto) {
+ case htons(ETH_P_IP):
+ if (ipv4_is_zeronet(src_ip->u.ip4) ||
+ ipv4_is_multicast(src_ip->u.ip4))
+ return NULL;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (ipv6_addr_any(&src_ip->u.ip6) ||
+ ipv6_addr_is_multicast(&src_ip->u.ip6))
+ return NULL;
+ break;
+#endif
+ }
+
+ grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC);
+ if (unlikely(!grp_src))
+ return NULL;
+
+ grp_src->pg = pg;
+ grp_src->br = pg->port->br;
+ grp_src->addr = *src_ip;
+ grp_src->mcast_gc.destroy = br_multicast_destroy_group_src;
+ timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0);
+
+ hlist_add_head_rcu(&grp_src->node, &pg->src_list);
+ pg->src_ents++;
+
+ return grp_src;
+}
+
struct net_bridge_port_group *br_multicast_new_port_group(
struct net_bridge_port *port,
struct br_ip *group,
struct net_bridge_port_group __rcu *next,
unsigned char flags,
- const unsigned char *src)
+ const unsigned char *src,
+ u8 filter_mode)
{
struct net_bridge_port_group *p;
@@ -486,9 +788,13 @@ struct net_bridge_port_group *br_multicast_new_port_group(
p->addr = *group;
p->port = port;
p->flags = flags;
+ p->filter_mode = filter_mode;
+ p->mcast_gc.destroy = br_multicast_destroy_port_group;
+ INIT_HLIST_HEAD(&p->src_list);
rcu_assign_pointer(p->next, next);
- hlist_add_head(&p->mglist, &port->mglist);
timer_setup(&p->timer, br_multicast_port_group_expired, 0);
+ timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0);
+ hlist_add_head(&p->mglist, &port->mglist);
if (src)
memcpy(p->eth_addr, src, ETH_ALEN);
@@ -516,8 +822,7 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify)
if (!mp->host_joined) {
mp->host_joined = true;
if (notify)
- br_mdb_notify(mp->br->dev, NULL, &mp->addr,
- RTM_NEWMDB, 0);
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB);
}
mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval);
}
@@ -529,13 +834,15 @@ void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify)
mp->host_joined = false;
if (notify)
- br_mdb_notify(mp->br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB);
}
static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group,
- const unsigned char *src)
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1)
{
struct net_bridge_port_group __rcu **pp;
struct net_bridge_port_group *p;
@@ -567,14 +874,16 @@ static int br_multicast_add_group(struct net_bridge *br,
break;
}
- p = br_multicast_new_port_group(port, group, *pp, 0, src);
+ p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode);
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0);
+ br_mdb_notify(br->dev, mp, p, RTM_NEWMDB);
found:
- mod_timer(&p->timer, now + br->multicast_membership_interval);
+ if (igmpv2_mldv1)
+ mod_timer(&p->timer, now + br->multicast_membership_interval);
+
out:
err = 0;
@@ -587,9 +896,11 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
__u16 vid,
- const unsigned char *src)
+ const unsigned char *src,
+ bool igmpv2)
{
struct br_ip br_group;
+ u8 filter_mode;
if (ipv4_is_local_multicast(group))
return 0;
@@ -598,8 +909,10 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
br_group.u.ip4 = group;
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
+ filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE;
- return br_multicast_add_group(br, port, &br_group, src);
+ return br_multicast_add_group(br, port, &br_group, src, filter_mode,
+ igmpv2);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -607,9 +920,11 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
__u16 vid,
- const unsigned char *src)
+ const unsigned char *src,
+ bool mldv1)
{
struct br_ip br_group;
+ u8 filter_mode;
if (ipv6_addr_is_ll_all_nodes(group))
return 0;
@@ -618,8 +933,10 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
br_group.u.ip6 = *group;
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
+ filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE;
- return br_multicast_add_group(br, port, &br_group, src);
+ return br_multicast_add_group(br, port, &br_group, src, filter_mode,
+ mldv1);
}
#endif
@@ -711,12 +1028,21 @@ static void br_multicast_select_own_querier(struct net_bridge *br,
static void __br_multicast_send_query(struct net_bridge *br,
struct net_bridge_port *port,
- struct br_ip *ip)
-{
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs,
+ u8 sflag,
+ bool *need_rexmit)
+{
+ bool over_lmqt = !!sflag;
struct sk_buff *skb;
u8 igmp_type;
- skb = br_multicast_alloc_query(br, ip, &igmp_type);
+again_under_lmqt:
+ skb = br_multicast_alloc_query(br, pg, ip_dst, group, with_srcs,
+ over_lmqt, sflag, &igmp_type,
+ need_rexmit);
if (!skb)
return;
@@ -727,8 +1053,13 @@ static void __br_multicast_send_query(struct net_bridge *br,
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
+
+ if (over_lmqt && with_srcs && sflag) {
+ over_lmqt = false;
+ goto again_under_lmqt;
+ }
} else {
- br_multicast_select_own_querier(br, ip, skb);
+ br_multicast_select_own_querier(br, group, skb);
br_multicast_count(br, port, skb, igmp_type,
BR_MCAST_DIR_RX);
netif_rx(skb);
@@ -764,7 +1095,8 @@ static void br_multicast_send_query(struct net_bridge *br,
if (!other_query || timer_pending(&other_query->timer))
return;
- __br_multicast_send_query(br, port, &br_group);
+ __br_multicast_send_query(br, port, NULL, NULL, &br_group, false, 0,
+ NULL);
time = jiffies;
time += own_query->startup_sent < br->multicast_startup_query_count ?
@@ -809,6 +1141,44 @@ static void br_ip6_multicast_port_query_expired(struct timer_list *t)
}
#endif
+static void br_multicast_port_group_rexmit(struct timer_list *t)
+{
+ struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer);
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->port->br;
+ bool need_rexmit = false;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ !br_opt_get(br, BROPT_MULTICAST_QUERIER))
+ goto out;
+
+ if (pg->addr.proto == htons(ETH_P_IP))
+ other_query = &br->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &br->ip6_other_query;
+#endif
+
+ if (!other_query || timer_pending(&other_query->timer))
+ goto out;
+
+ if (pg->grp_query_rexmit_cnt) {
+ pg->grp_query_rexmit_cnt--;
+ __br_multicast_send_query(br, pg->port, pg, &pg->addr,
+ &pg->addr, false, 1, NULL);
+ }
+ __br_multicast_send_query(br, pg->port, pg, &pg->addr,
+ &pg->addr, true, 0, &need_rexmit);
+
+ if (pg->grp_query_rexmit_cnt || need_rexmit)
+ mod_timer(&pg->rexmit_timer, jiffies +
+ br->multicast_last_member_interval);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
static void br_mc_disabled_update(struct net_device *dev, bool value)
{
struct switchdev_attr attr = {
@@ -847,13 +1217,16 @@ void br_multicast_del_port(struct net_bridge_port *port)
{
struct net_bridge *br = port->br;
struct net_bridge_port_group *pg;
+ HLIST_HEAD(deleted_head);
struct hlist_node *n;
/* Take care of the remaining groups, only perm ones should be left */
spin_lock_bh(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- br_multicast_del_pg(br, pg);
+ br_multicast_find_del_pg(br, pg);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
spin_unlock_bh(&br->multicast_lock);
+ br_multicast_gc(&deleted_head);
del_timer_sync(&port->multicast_router_timer);
free_percpu(port->mcast_stats);
}
@@ -901,7 +1274,7 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_lock(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
- br_multicast_del_pg(br, pg);
+ br_multicast_find_del_pg(br, pg);
__del_port_router(port);
@@ -913,20 +1286,561 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_unlock(&br->multicast_lock);
}
+static int __grp_src_delete_marked(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+ int deleted = 0;
+
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ if (ent->flags & BR_SGRP_F_DELETE) {
+ br_multicast_del_group_src(ent);
+ deleted++;
+ }
+
+ return deleted;
+}
+
+static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->port->br;
+ u32 lmqc = br->multicast_last_member_count;
+ unsigned long lmqt, lmi, now = jiffies;
+ struct net_bridge_group_src *ent;
+
+ if (!netif_running(br->dev) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->addr.proto == htons(ETH_P_IP))
+ other_query = &br->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &br->ip6_other_query;
+#endif
+
+ lmqt = now + br_multicast_lmqt(br);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (ent->flags & BR_SGRP_F_SEND) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ if (ent->timer.expires > lmqt) {
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER) &&
+ other_query &&
+ !timer_pending(&other_query->timer))
+ ent->src_query_rexmit_cnt = lmqc;
+ mod_timer(&ent->timer, lmqt);
+ }
+ }
+ }
+
+ if (!br_opt_get(br, BROPT_MULTICAST_QUERIER) ||
+ !other_query || timer_pending(&other_query->timer))
+ return;
+
+ __br_multicast_send_query(br, pg->port, pg, &pg->addr,
+ &pg->addr, true, 1, NULL);
+
+ lmi = now + br->multicast_last_member_interval;
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+}
+
+static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->port->br;
+ unsigned long now = jiffies, lmi;
+
+ if (!netif_running(br->dev) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->addr.proto == htons(ETH_P_IP))
+ other_query = &br->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &br->ip6_other_query;
+#endif
+
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER) &&
+ other_query && !timer_pending(&other_query->timer)) {
+ lmi = now + br->multicast_last_member_interval;
+ pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1;
+ __br_multicast_send_query(br, pg->port, pg, &pg->addr,
+ &pg->addr, false, 0, NULL);
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+ }
+
+ if (pg->filter_mode == MCAST_EXCLUDE &&
+ (!timer_pending(&pg->timer) ||
+ time_after(pg->timer.expires, now + br_multicast_lmqt(br))))
+ mod_timer(&pg->timer, now + br_multicast_lmqt(br));
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI
+ * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI
+ * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ */
+static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->port->br;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+
+ if (ent)
+ mod_timer(&ent->timer, now + br_multicast_gmi(br));
+ srcs += src_size;
+ }
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Group Timer=GMI
+ */
+static void __grp_src_isexc_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent)
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ else
+ br_multicast_new_group_src(pg, &src_ip);
+ srcs += src_size;
+ }
+
+ __grp_src_delete_marked(pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Group Timer=GMI
+ */
+static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->port->br;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ mod_timer(&ent->timer,
+ now + br_multicast_gmi(br));
+ changed = true;
+ }
+ }
+ srcs += src_size;
+ }
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+
+ return changed;
+}
+
+static bool br_multicast_isexc(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->port->br;
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_isexc_incl(pg, srcs, nsrcs, src_size);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_isexc_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(br));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI
+ * Send Q(G,A-B)
+ */
+static bool __grp_src_toin_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->port->br;
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ mod_timer(&ent->timer, now + br_multicast_gmi(br));
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ * Send Q(G,X-A)
+ * Send Q(G)
+ */
+static bool __grp_src_toin_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->port->br;
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (timer_pending(&ent->timer))
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ if (timer_pending(&ent->timer)) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ }
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ mod_timer(&ent->timer, now + br_multicast_gmi(br));
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ __grp_send_query_and_rexmit(pg);
+
+ return changed;
+}
+
+static bool br_multicast_toin(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ changed = __grp_src_toin_incl(pg, srcs, nsrcs, src_size);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toin_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Send Q(G,A*B)
+ * Group Timer=GMI
+ */
+static void __grp_src_toex_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) |
+ BR_SGRP_F_SEND;
+ to_send++;
+ } else {
+ br_multicast_new_group_src(pg, &src_ip);
+ }
+ srcs += src_size;
+ }
+
+ __grp_src_delete_marked(pg);
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Send Q(G,A-Y)
+ * Group Timer=GMI
+ */
+static bool __grp_src_toex_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ mod_timer(&ent->timer, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ srcs += src_size;
+ }
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ return changed;
+}
+
+static bool br_multicast_toex(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge *br = pg->port->br;
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_toex_incl(pg, srcs, nsrcs, src_size);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toex_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(br));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B)
+ */
+static void __grp_src_block_incl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ if (pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list))
+ br_multicast_find_del_pg(pg->port->br, pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer
+ * Send Q(G,A-Y)
+ */
+static bool __grp_src_block_excl(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.u, srcs, src_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ mod_timer(&ent->timer, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ srcs += src_size;
+ }
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(pg);
+
+ return changed;
+}
+
+static bool br_multicast_block(struct net_bridge_port_group *pg,
+ void *srcs, u32 nsrcs, size_t src_size)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_block_incl(pg, srcs, nsrcs, src_size);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_block_excl(pg, srcs, nsrcs, src_size);
+ break;
+ }
+
+ return changed;
+}
+
+static struct net_bridge_port_group *
+br_multicast_find_port(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port *p,
+ const unsigned char *src)
+{
+ struct net_bridge *br __maybe_unused = mp->br;
+ struct net_bridge_port_group *pg;
+
+ for (pg = mlock_dereference(mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br))
+ if (br_port_group_equal(pg, p, src))
+ return pg;
+
+ return NULL;
+}
+
static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
u16 vid)
{
+ bool igmpv2 = br->multicast_igmp_version == 2;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
const unsigned char *src;
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
- int i;
- int len;
- int num;
- int type;
- int err = 0;
+ int i, len, num, type;
+ bool changed = false;
__be32 group;
+ int err = 0;
u16 nsrcs;
ih = igmpv3_report_hdr(skb);
@@ -947,7 +1861,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
if (!ip_mc_may_pull(skb, len))
return -EINVAL;
- /* We treat this as an IGMPv2 report for now. */
switch (type) {
case IGMPV3_MODE_IS_INCLUDE:
case IGMPV3_MODE_IS_EXCLUDE:
@@ -962,16 +1875,62 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
}
src = eth_hdr(skb)->h_source;
- if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
- type == IGMPV3_MODE_IS_INCLUDE) &&
- nsrcs == 0) {
- br_ip4_multicast_leave_group(br, port, group, vid, src);
+ if (nsrcs == 0 &&
+ (type == IGMPV3_CHANGE_TO_INCLUDE ||
+ type == IGMPV3_MODE_IS_INCLUDE)) {
+ if (!port || igmpv2) {
+ br_ip4_multicast_leave_group(br, port, group, vid, src);
+ continue;
+ }
} else {
err = br_ip4_multicast_add_group(br, port, group, vid,
- src);
+ src, igmpv2);
if (err)
break;
}
+
+ if (!port || igmpv2)
+ continue;
+
+ spin_lock_bh(&br->multicast_lock);
+ mdst = br_mdb_ip4_get(br, group, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ /* reload grec */
+ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4));
+ switch (type) {
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src,
+ nsrcs, sizeof(__be32));
+ break;
+ case IGMPV3_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(pg, grec->grec_src, nsrcs,
+ sizeof(__be32));
+ break;
+ }
+ if (changed)
+ br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock_bh(&br->multicast_lock);
}
return err;
@@ -983,14 +1942,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ bool mldv1 = br->multicast_mld_version == 1;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
unsigned int nsrcs_offset;
const unsigned char *src;
struct icmp6hdr *icmp6h;
struct mld2_grec *grec;
unsigned int grec_len;
- int i;
- int len;
- int num;
+ bool changed = false;
+ int i, len, num;
int err = 0;
if (!ipv6_mc_may_pull(skb, sizeof(*icmp6h)))
@@ -1024,7 +1985,6 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
grec = (struct mld2_grec *)(skb->data + len);
len += grec_len;
- /* We treat these as MLDv1 reports for now. */
switch (grec->grec_type) {
case MLD2_MODE_IS_INCLUDE:
case MLD2_MODE_IS_EXCLUDE:
@@ -1042,15 +2002,61 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
nsrcs == 0) {
- br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
- vid, src);
+ if (!port || mldv1) {
+ br_ip6_multicast_leave_group(br, port,
+ &grec->grec_mca,
+ vid, src);
+ continue;
+ }
} else {
err = br_ip6_multicast_add_group(br, port,
&grec->grec_mca, vid,
- src);
+ src, mldv1);
if (err)
break;
}
+
+ if (!port || mldv1)
+ continue;
+
+ spin_lock_bh(&br->multicast_lock);
+ mdst = br_mdb_ip6_get(br, &grec->grec_mca, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ switch (grec->grec_type) {
+ case MLD2_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src,
+ nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(pg, grec->grec_src, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ }
+ if (changed)
+ br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock_bh(&br->multicast_lock);
}
return err;
@@ -1245,7 +2251,8 @@ static void br_ip4_multicast_query(struct net_bridge *br,
}
} else if (transport_len >= sizeof(*ih3)) {
ih3 = igmpv3_query_hdr(skb);
- if (ih3->nsrcs)
+ if (ih3->nsrcs ||
+ (br->multicast_igmp_version == 3 && group && ih3->suppress))
goto out;
max_delay = ih3->code ?
@@ -1280,7 +2287,9 @@ static void br_ip4_multicast_query(struct net_bridge *br,
pp = &p->next) {
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&p->timer) >= 0)
+ try_to_del_timer_sync(&p->timer) >= 0 &&
+ (br->multicast_igmp_version == 2 ||
+ p->filter_mode == MCAST_EXCLUDE))
mod_timer(&p->timer, now + max_delay);
}
@@ -1330,6 +2339,10 @@ static int br_ip6_multicast_query(struct net_bridge *br,
mld2q = (struct mld2_query *)icmp6_hdr(skb);
if (!mld2q->mld2q_nsrcs)
group = &mld2q->mld2q_mca;
+ if (br->multicast_mld_version == 2 &&
+ !ipv6_addr_any(&mld2q->mld2q_mca) &&
+ mld2q->mld2q_suppress)
+ goto out;
max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL);
}
@@ -1363,7 +2376,9 @@ static int br_ip6_multicast_query(struct net_bridge *br,
pp = &p->next) {
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&p->timer) >= 0)
+ try_to_del_timer_sync(&p->timer) >= 0 &&
+ (br->multicast_mld_version == 1 ||
+ p->filter_mode == MCAST_EXCLUDE))
mod_timer(&p->timer, now + max_delay);
}
@@ -1407,16 +2422,8 @@ br_multicast_leave_group(struct net_bridge *br,
if (p->flags & MDB_PG_FLAGS_PERMANENT)
break;
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- kfree_rcu(p, rcu);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB,
- p->flags | MDB_PG_FLAGS_FAST_LEAVE);
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
+ p->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_del_pg(mp, p, pp);
}
goto out;
}
@@ -1425,7 +2432,8 @@ br_multicast_leave_group(struct net_bridge *br,
goto out;
if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
- __br_multicast_send_query(br, port, &mp->addr);
+ __br_multicast_send_query(br, port, NULL, NULL, &mp->addr,
+ false, 0, NULL);
time = jiffies + br->multicast_last_member_count *
br->multicast_last_member_interval;
@@ -1627,7 +2635,8 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
+ err = br_ip4_multicast_add_group(br, port, ih->group, vid, src,
+ true);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
err = br_ip4_multicast_igmp3_report(br, port, skb, vid);
@@ -1706,7 +2715,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
src = eth_hdr(skb)->h_source;
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
- src);
+ src, true);
break;
case ICMPV6_MLD2_REPORT:
err = br_ip6_multicast_mld2_report(br, port, skb, vid);
@@ -1781,6 +2790,19 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
}
#endif
+static void br_multicast_gc_work(struct work_struct *work)
+{
+ struct net_bridge *br = container_of(work, struct net_bridge,
+ mcast_gc_work);
+ HLIST_HEAD(deleted_head);
+
+ spin_lock_bh(&br->multicast_lock);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
+ spin_unlock_bh(&br->multicast_lock);
+
+ br_multicast_gc(&deleted_head);
+}
+
void br_multicast_init(struct net_bridge *br)
{
br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
@@ -1821,6 +2843,8 @@ void br_multicast_init(struct net_bridge *br)
br_ip6_multicast_query_expired, 0);
#endif
INIT_HLIST_HEAD(&br->mdb_list);
+ INIT_HLIST_HEAD(&br->mcast_gc_list);
+ INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work);
}
static void br_ip4_multicast_join_snoopers(struct net_bridge *br)
@@ -1924,18 +2948,18 @@ void br_multicast_stop(struct net_bridge *br)
void br_multicast_dev_del(struct net_bridge *br)
{
struct net_bridge_mdb_entry *mp;
+ HLIST_HEAD(deleted_head);
struct hlist_node *tmp;
spin_lock_bh(&br->multicast_lock);
- hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) {
- del_timer(&mp->timer);
- rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
- br_mdb_rht_params);
- hlist_del_rcu(&mp->mdb_node);
- kfree_rcu(mp, rcu);
- }
+ hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node)
+ br_multicast_del_mdb_entry(mp);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
spin_unlock_bh(&br->multicast_lock);
+ br_multicast_gc(&deleted_head);
+ cancel_work_sync(&br->mcast_gc_work);
+
rcu_barrier();
}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 147d52596e17..8a71c60fa357 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1095,8 +1095,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
- [IFLA_BR_MULTI_BOOLOPT] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct br_boolopt_multi) },
+ [IFLA_BR_MULTI_BOOLOPT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index baa1500f384f..a23d2bae56e1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -214,26 +214,61 @@ struct net_bridge_fdb_entry {
#define MDB_PG_FLAGS_OFFLOAD BIT(1)
#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
+#define PG_SRC_ENT_LIMIT 32
+
+#define BR_SGRP_F_DELETE BIT(0)
+#define BR_SGRP_F_SEND BIT(1)
+
+struct net_bridge_mcast_gc {
+ struct hlist_node gc_node;
+ void (*destroy)(struct net_bridge_mcast_gc *gc);
+};
+
+struct net_bridge_group_src {
+ struct hlist_node node;
+
+ struct br_ip addr;
+ struct net_bridge_port_group *pg;
+ u8 flags;
+ u8 src_query_rexmit_cnt;
+ struct timer_list timer;
+
+ struct net_bridge *br;
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
+};
+
struct net_bridge_port_group {
struct net_bridge_port *port;
struct net_bridge_port_group __rcu *next;
- struct hlist_node mglist;
- struct rcu_head rcu;
- struct timer_list timer;
struct br_ip addr;
unsigned char eth_addr[ETH_ALEN] __aligned(2);
unsigned char flags;
+ unsigned char filter_mode;
+ unsigned char grp_query_rexmit_cnt;
+
+ struct hlist_head src_list;
+ unsigned int src_ents;
+ struct timer_list timer;
+ struct timer_list rexmit_timer;
+ struct hlist_node mglist;
+
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
};
struct net_bridge_mdb_entry {
struct rhash_head rhnode;
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
- struct rcu_head rcu;
- struct timer_list timer;
struct br_ip addr;
bool host_joined;
+
+ struct timer_list timer;
struct hlist_node mdb_node;
+
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
};
struct net_bridge_port {
@@ -406,6 +441,7 @@ struct net_bridge {
struct rhashtable mdb_hash_tbl;
+ struct hlist_head mcast_gc_list;
struct hlist_head mdb_list;
struct hlist_head router_list;
@@ -419,6 +455,7 @@ struct net_bridge {
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
#endif /* IS_ENABLED(CONFIG_IPV6) */
+ struct work_struct mcast_gc_work;
#endif
struct timer_list hello_timer;
@@ -766,13 +803,17 @@ br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char flags, const unsigned char *src);
+ unsigned char flags, const unsigned char *src,
+ u8 filter_mode);
int br_mdb_hash_init(struct net_bridge *br);
void br_mdb_hash_fini(struct net_bridge *br);
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 flags);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg, int type);
void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
int type);
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp);
void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir);
int br_multicast_init_stats(struct net_bridge *br);
@@ -836,6 +877,19 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
return BR_INPUT_SKB_CB(skb)->igmp;
}
+
+static inline unsigned long br_multicast_lmqt(const struct net_bridge *br)
+{
+ return br->multicast_last_member_interval *
+ br->multicast_last_member_count;
+}
+
+static inline unsigned long br_multicast_gmi(const struct net_bridge *br)
+{
+ /* use the RFC default of 2 for QRV */
+ return 2 * br->multicast_query_interval +
+ br->multicast_query_response_interval;
+}
#else
static inline int br_multicast_rcv(struct net_bridge *br,
struct net_bridge_port *port,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 61c94cefa843..002bbc93209d 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -140,7 +140,7 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
return err == -EOPNOTSUPP ? 0 : err;
}
-/* Returns a master vlan, if it didn't exist it gets created. In all cases a
+/* Returns a master vlan, if it didn't exist it gets created. In all cases
* a reference is taken to the master vlan before returning.
*/
static struct net_bridge_vlan *
@@ -1891,8 +1891,8 @@ out_err:
}
static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = {
- [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct bridge_vlan_info) },
+ [BRIDGE_VLANDB_ENTRY_INFO] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)),
[BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 },
[BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 0d6d20c9105e..8f68afda5f81 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -15,7 +15,6 @@
#include <linux/netfilter_bridge/ebt_stp.h>
#define BPDU_TYPE_CONFIG 0
-#define BPDU_TYPE_TCN 0x80
struct stp_header {
u8 dsap;
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index d0a4d0ac7045..9cef9496a707 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -21,7 +21,6 @@
#define SRVL_FLOW_OFF 0x81
#define SRVL_FLOW_ON 0x80
#define SRVL_SET_PIN 0x82
-#define SRVL_CTRL_PKT_SIZE 1
#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 5c06404bdf3e..ea29a6d97ef5 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -358,7 +358,7 @@ static unsigned int effhash(canid_t can_id)
*
* Return:
* Pointer to optimal filterlist for the given can_id/mask pair.
- * Constistency checked mask.
+ * Consistency checked mask.
* Reduced can_id to have a preprocessed filter compare value.
*/
static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
@@ -411,7 +411,7 @@ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
/**
* can_rx_register - subscribe CAN frames from a specific interface
* @net: the applicable net namespace
- * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list)
+ * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
* @can_id: CAN identifier (see description)
* @mask: CAN mask (see description)
* @func: callback function on filter match
diff --git a/net/can/bcm.c b/net/can/bcm.c
index d14ea12affb1..4253915800e6 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
*
diff --git a/net/can/gw.c b/net/can/gw.c
index 65d60c93af29..49b4e3d91ad6 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface
*
* Copyright (c) 2019 Volkswagen Group Electronic Research
diff --git a/net/can/proc.c b/net/can/proc.c
index e6881bfc3ed1..a4eb06c9eb70 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* proc.c - procfs support for Protocol family CAN core module
*
diff --git a/net/can/raw.c b/net/can/raw.c
index 94a9405658dc..24db4b4afdc7 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* raw.c - Raw sockets for protocol family CAN
*
* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
@@ -154,16 +154,16 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
if (!skb)
return;
- /* Put the datagram to the queue so that raw_recvmsg() can
- * get it from there. We need to pass the interface index to
- * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
- * containing the interface index.
+ /* Put the datagram to the queue so that raw_recvmsg() can get
+ * it from there. We need to pass the interface index to
+ * raw_recvmsg(). We pass a whole struct sockaddr_can in
+ * skb->cb containing the interface index.
*/
sock_skb_cb_check_size(sizeof(struct sockaddr_can));
addr = (struct sockaddr_can *)skb->cb;
memset(addr, 0, sizeof(*addr));
- addr->can_family = AF_CAN;
+ addr->can_family = AF_CAN;
addr->can_ifindex = skb->dev->ifindex;
/* add CAN specific message flags for raw_recvmsg() */
@@ -290,8 +290,8 @@ static int raw_notifier(struct notifier_block *nb,
kfree(ro->filter);
ro->ifindex = 0;
- ro->bound = 0;
- ro->count = 0;
+ ro->bound = 0;
+ ro->count = 0;
release_sock(sk);
sk->sk_err = ENODEV;
@@ -374,8 +374,8 @@ static int raw_release(struct socket *sock)
kfree(ro->filter);
ro->ifindex = 0;
- ro->bound = 0;
- ro->count = 0;
+ ro->bound = 0;
+ ro->count = 0;
free_percpu(ro->uniq);
sock_orphan(sk);
@@ -773,7 +773,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
skb_setup_tx_timestamp(skb, sk->sk_tsflags);
skb->dev = dev;
- skb->sk = sk;
+ skb->sk = sk;
skb->priority = sk->sk_priority;
err = can_send(skb, ro->loopback);
@@ -801,8 +801,8 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int err = 0;
int noblock;
- noblock = flags & MSG_DONTWAIT;
- flags &= ~MSG_DONTWAIT;
+ noblock = flags & MSG_DONTWAIT;
+ flags &= ~MSG_DONTWAIT;
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index b988f48153a4..a0d1a3265b71 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -7,97 +7,14 @@
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
-#define SK_STORAGE_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_CLONE)
-
-struct bucket {
- struct hlist_head list;
- raw_spinlock_t lock;
-};
-
-/* Thp map is not the primary owner of a bpf_sk_storage_elem.
- * Instead, the sk->sk_bpf_storage is.
- *
- * The map (bpf_sk_storage_map) is for two purposes
- * 1. Define the size of the "sk local storage". It is
- * the map's value_size.
- *
- * 2. Maintain a list to keep track of all elems such
- * that they can be cleaned up during the map destruction.
- *
- * When a bpf local storage is being looked up for a
- * particular sk, the "bpf_map" pointer is actually used
- * as the "key" to search in the list of elem in
- * sk->sk_bpf_storage.
- *
- * Hence, consider sk->sk_bpf_storage is the mini-map
- * with the "bpf_map" pointer as the searching key.
- */
-struct bpf_sk_storage_map {
- struct bpf_map map;
- /* Lookup elem does not require accessing the map.
- *
- * Updating/Deleting requires a bucket lock to
- * link/unlink the elem from the map. Having
- * multiple buckets to improve contention.
- */
- struct bucket *buckets;
- u32 bucket_log;
- u16 elem_size;
- u16 cache_idx;
-};
-
-struct bpf_sk_storage_data {
- /* smap is used as the searching key when looking up
- * from sk->sk_bpf_storage.
- *
- * Put it in the same cacheline as the data to minimize
- * the number of cachelines access during the cache hit case.
- */
- struct bpf_sk_storage_map __rcu *smap;
- u8 data[] __aligned(8);
-};
-
-/* Linked to bpf_sk_storage and bpf_sk_storage_map */
-struct bpf_sk_storage_elem {
- struct hlist_node map_node; /* Linked to bpf_sk_storage_map */
- struct hlist_node snode; /* Linked to bpf_sk_storage */
- struct bpf_sk_storage __rcu *sk_storage;
- struct rcu_head rcu;
- /* 8 bytes hole */
- /* The data is stored in aother cacheline to minimize
- * the number of cachelines access during a cache hit.
- */
- struct bpf_sk_storage_data sdata ____cacheline_aligned;
-};
-
-#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
-#define SDATA(_SELEM) (&(_SELEM)->sdata)
-#define BPF_SK_STORAGE_CACHE_SIZE 16
-
-static DEFINE_SPINLOCK(cache_idx_lock);
-static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE];
-
-struct bpf_sk_storage {
- struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
- struct hlist_head list; /* List of bpf_sk_storage_elem */
- struct sock *sk; /* The sk that owns the the above "list" of
- * bpf_sk_storage_elem.
- */
- struct rcu_head rcu;
- raw_spinlock_t lock; /* Protect adding/removing from the "list" */
-};
-
-static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
-{
- return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
-}
+DEFINE_BPF_STORAGE_CACHE(sk_cache);
static int omem_charge(struct sock *sk, unsigned int size)
{
@@ -111,445 +28,38 @@ static int omem_charge(struct sock *sk, unsigned int size)
return -ENOMEM;
}
-static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
-{
- return !hlist_unhashed(&selem->snode);
-}
-
-static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
-{
- return !hlist_unhashed(&selem->map_node);
-}
-
-static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
- struct sock *sk, void *value,
- bool charge_omem)
-{
- struct bpf_sk_storage_elem *selem;
-
- if (charge_omem && omem_charge(sk, smap->elem_size))
- return NULL;
-
- selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (selem) {
- if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
- return selem;
- }
-
- if (charge_omem)
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
-
- return NULL;
-}
-
-/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
- * The caller must ensure selem->smap is still valid to be
- * dereferenced for its smap->elem_size and smap->cache_idx.
- */
-static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_elem *selem,
- bool uncharge_omem)
-{
- struct bpf_sk_storage_map *smap;
- bool free_sk_storage;
- struct sock *sk;
-
- smap = rcu_dereference(SDATA(selem)->smap);
- sk = sk_storage->sk;
-
- /* All uncharging on sk->sk_omem_alloc must be done first.
- * sk may be freed once the last selem is unlinked from sk_storage.
- */
- if (uncharge_omem)
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
-
- free_sk_storage = hlist_is_singular_node(&selem->snode,
- &sk_storage->list);
- if (free_sk_storage) {
- atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
- sk_storage->sk = NULL;
- /* After this RCU_INIT, sk may be freed and cannot be used */
- RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
-
- /* sk_storage is not freed now. sk_storage->lock is
- * still held and raw_spin_unlock_bh(&sk_storage->lock)
- * will be done by the caller.
- *
- * Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(sk_storage, rcu) is done
- * after the raw_spin_unlock_bh(&sk_storage->lock).
- *
- * Hence, a "bool free_sk_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
- */
- }
- hlist_del_init_rcu(&selem->snode);
- if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
- SDATA(selem))
- RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
-
- kfree_rcu(selem, rcu);
-
- return free_sk_storage;
-}
-
-static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
-{
- struct bpf_sk_storage *sk_storage;
- bool free_sk_storage = false;
-
- if (unlikely(!selem_linked_to_sk(selem)))
- /* selem has already been unlinked from sk */
- return;
-
- sk_storage = rcu_dereference(selem->sk_storage);
- raw_spin_lock_bh(&sk_storage->lock);
- if (likely(selem_linked_to_sk(selem)))
- free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
- raw_spin_unlock_bh(&sk_storage->lock);
-
- if (free_sk_storage)
- kfree_rcu(sk_storage, rcu);
-}
-
-static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_elem *selem)
-{
- RCU_INIT_POINTER(selem->sk_storage, sk_storage);
- hlist_add_head(&selem->snode, &sk_storage->list);
-}
-
-static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
-{
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
-
- if (unlikely(!selem_linked_to_map(selem)))
- /* selem has already be unlinked from smap */
- return;
-
- smap = rcu_dereference(SDATA(selem)->smap);
- b = select_bucket(smap, selem);
- raw_spin_lock_bh(&b->lock);
- if (likely(selem_linked_to_map(selem)))
- hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_bh(&b->lock);
-}
-
-static void selem_link_map(struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
-{
- struct bucket *b = select_bucket(smap, selem);
-
- raw_spin_lock_bh(&b->lock);
- RCU_INIT_POINTER(SDATA(selem)->smap, smap);
- hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_bh(&b->lock);
-}
-
-static void selem_unlink(struct bpf_sk_storage_elem *selem)
-{
- /* Always unlink from map before unlinking from sk_storage
- * because selem will be freed after successfully unlinked from
- * the sk_storage.
- */
- selem_unlink_map(selem);
- selem_unlink_sk(selem);
-}
-
-static struct bpf_sk_storage_data *
-__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_map *smap,
- bool cacheit_lockit)
-{
- struct bpf_sk_storage_data *sdata;
- struct bpf_sk_storage_elem *selem;
-
- /* Fast path (cache hit) */
- sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
- if (sdata && rcu_access_pointer(sdata->smap) == smap)
- return sdata;
-
- /* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
- if (rcu_access_pointer(SDATA(selem)->smap) == smap)
- break;
-
- if (!selem)
- return NULL;
-
- sdata = SDATA(selem);
- if (cacheit_lockit) {
- /* spinlock is needed to avoid racing with the
- * parallel delete. Otherwise, publishing an already
- * deleted sdata to the cache will become a use-after-free
- * problem in the next __sk_storage_lookup().
- */
- raw_spin_lock_bh(&sk_storage->lock);
- if (selem_linked_to_sk(selem))
- rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
- sdata);
- raw_spin_unlock_bh(&sk_storage->lock);
- }
-
- return sdata;
-}
-
-static struct bpf_sk_storage_data *
+static struct bpf_local_storage_data *
sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
{
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_map *smap;
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage)
return NULL;
- smap = (struct bpf_sk_storage_map *)map;
- return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
-}
-
-static int check_flags(const struct bpf_sk_storage_data *old_sdata,
- u64 map_flags)
-{
- if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
- /* elem already exists */
- return -EEXIST;
-
- if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
- /* elem doesn't exist, cannot update it */
- return -ENOENT;
-
- return 0;
-}
-
-static int sk_storage_alloc(struct sock *sk,
- struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *first_selem)
-{
- struct bpf_sk_storage *prev_sk_storage, *sk_storage;
- int err;
-
- err = omem_charge(sk, sizeof(*sk_storage));
- if (err)
- return err;
-
- sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
- if (!sk_storage) {
- err = -ENOMEM;
- goto uncharge;
- }
- INIT_HLIST_HEAD(&sk_storage->list);
- raw_spin_lock_init(&sk_storage->lock);
- sk_storage->sk = sk;
-
- __selem_link_sk(sk_storage, first_selem);
- selem_link_map(smap, first_selem);
- /* Publish sk_storage to sk. sk->sk_lock cannot be acquired.
- * Hence, atomic ops is used to set sk->sk_bpf_storage
- * from NULL to the newly allocated sk_storage ptr.
- *
- * From now on, the sk->sk_bpf_storage pointer is protected
- * by the sk_storage->lock. Hence, when freeing
- * the sk->sk_bpf_storage, the sk_storage->lock must
- * be held before setting sk->sk_bpf_storage to NULL.
- */
- prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
- NULL, sk_storage);
- if (unlikely(prev_sk_storage)) {
- selem_unlink_map(first_selem);
- err = -EAGAIN;
- goto uncharge;
-
- /* Note that even first_selem was linked to smap's
- * bucket->list, first_selem can be freed immediately
- * (instead of kfree_rcu) because
- * bpf_sk_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
- * Hence, no one is accessing selem from the
- * bucket->list under rcu_read_lock().
- */
- }
-
- return 0;
-
-uncharge:
- kfree(sk_storage);
- atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
- return err;
-}
-
-/* sk cannot be going away because it is linking new elem
- * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
- * Otherwise, it will become a leak (and other memory issues
- * during map destruction).
- */
-static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
- struct bpf_map *map,
- void *value,
- u64 map_flags)
-{
- struct bpf_sk_storage_data *old_sdata = NULL;
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_map *smap;
- int err;
-
- /* BPF_EXIST and BPF_NOEXIST cannot be both set */
- if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
- /* BPF_F_LOCK can only be used in a value with spin_lock */
- unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
- return ERR_PTR(-EINVAL);
-
- smap = (struct bpf_sk_storage_map *)map;
- sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage || hlist_empty(&sk_storage->list)) {
- /* Very first elem for this sk */
- err = check_flags(NULL, map_flags);
- if (err)
- return ERR_PTR(err);
-
- selem = selem_alloc(smap, sk, value, true);
- if (!selem)
- return ERR_PTR(-ENOMEM);
-
- err = sk_storage_alloc(sk, smap, selem);
- if (err) {
- kfree(selem);
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
- return ERR_PTR(err);
- }
-
- return SDATA(selem);
- }
-
- if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
- /* Hoping to find an old_sdata to do inline update
- * such that it can avoid taking the sk_storage->lock
- * and changing the lists.
- */
- old_sdata = __sk_storage_lookup(sk_storage, smap, false);
- err = check_flags(old_sdata, map_flags);
- if (err)
- return ERR_PTR(err);
- if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
- copy_map_value_locked(map, old_sdata->data,
- value, false);
- return old_sdata;
- }
- }
-
- raw_spin_lock_bh(&sk_storage->lock);
-
- /* Recheck sk_storage->list under sk_storage->lock */
- if (unlikely(hlist_empty(&sk_storage->list))) {
- /* A parallel del is happening and sk_storage is going
- * away. It has just been checked before, so very
- * unlikely. Return instead of retry to keep things
- * simple.
- */
- err = -EAGAIN;
- goto unlock_err;
- }
-
- old_sdata = __sk_storage_lookup(sk_storage, smap, false);
- err = check_flags(old_sdata, map_flags);
- if (err)
- goto unlock_err;
-
- if (old_sdata && (map_flags & BPF_F_LOCK)) {
- copy_map_value_locked(map, old_sdata->data, value, false);
- selem = SELEM(old_sdata);
- goto unlock;
- }
-
- /* sk_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during __selem_unlink_sk().
- */
- selem = selem_alloc(smap, sk, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
- }
-
- /* First, link the new selem to the map */
- selem_link_map(smap, selem);
-
- /* Second, link (and publish) the new selem to sk_storage */
- __selem_link_sk(sk_storage, selem);
-
- /* Third, remove old selem, SELEM(old_sdata) */
- if (old_sdata) {
- selem_unlink_map(SELEM(old_sdata));
- __selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
- }
-
-unlock:
- raw_spin_unlock_bh(&sk_storage->lock);
- return SDATA(selem);
-
-unlock_err:
- raw_spin_unlock_bh(&sk_storage->lock);
- return ERR_PTR(err);
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
}
static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
sdata = sk_storage_lookup(sk, map, false);
if (!sdata)
return -ENOENT;
- selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata));
return 0;
}
-static u16 cache_idx_get(void)
-{
- u64 min_usage = U64_MAX;
- u16 i, res = 0;
-
- spin_lock(&cache_idx_lock);
-
- for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) {
- if (cache_idx_usage_counts[i] < min_usage) {
- min_usage = cache_idx_usage_counts[i];
- res = i;
-
- /* Found a free cache_idx */
- if (!min_usage)
- break;
- }
- }
- cache_idx_usage_counts[res]++;
-
- spin_unlock(&cache_idx_lock);
-
- return res;
-}
-
-static void cache_idx_free(u16 idx)
-{
- spin_lock(&cache_idx_lock);
- cache_idx_usage_counts[idx]--;
- spin_unlock(&cache_idx_lock);
-}
-
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *sk_storage;
bool free_sk_storage = false;
struct hlist_node *n;
@@ -565,7 +75,7 @@ void bpf_sk_storage_free(struct sock *sk)
* Thus, no elem can be added-to or deleted-from the
* sk_storage->list by the bpf_prog or by the bpf-map's syscall.
*
- * It is racing with bpf_sk_storage_map_free() alone
+ * It is racing with bpf_local_storage_map_free() alone
* when unlinking elem from the sk_storage->list and
* the map's bucket->list.
*/
@@ -574,8 +84,9 @@ void bpf_sk_storage_free(struct sock *sk)
/* Always unlink from map before unlinking from
* sk_storage.
*/
- selem_unlink_map(selem);
- free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
+ bpf_selem_unlink_map(selem);
+ free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage,
+ selem, true);
}
raw_spin_unlock_bh(&sk_storage->lock);
rcu_read_unlock();
@@ -584,132 +95,24 @@ void bpf_sk_storage_free(struct sock *sk)
kfree_rcu(sk_storage, rcu);
}
-static void bpf_sk_storage_map_free(struct bpf_map *map)
-{
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
- unsigned int i;
-
- smap = (struct bpf_sk_storage_map *)map;
-
- cache_idx_free(smap->cache_idx);
-
- /* Note that this map might be concurrently cloned from
- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
- * RCU read section to finish before proceeding. New RCU
- * read sections should be prevented via bpf_map_inc_not_zero.
- */
- synchronize_rcu();
-
- /* bpf prog and the userspace can no longer access this map
- * now. No new selem (of this map) can be added
- * to the sk->sk_bpf_storage or to the map bucket's list.
- *
- * The elem of this map can be cleaned up here
- * or
- * by bpf_sk_storage_free() during __sk_destruct().
- */
- for (i = 0; i < (1U << smap->bucket_log); i++) {
- b = &smap->buckets[i];
-
- rcu_read_lock();
- /* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_sk_storage_elem,
- map_node))) {
- selem_unlink(selem);
- cond_resched_rcu();
- }
- rcu_read_unlock();
- }
-
- /* bpf_sk_storage_free() may still need to access the map.
- * e.g. bpf_sk_storage_free() has unlinked selem from the map
- * which then made the above while((selem = ...)) loop
- * exited immediately.
- *
- * However, the bpf_sk_storage_free() still needs to access
- * the smap->elem_size to do the uncharging in
- * __selem_unlink_sk().
- *
- * Hence, wait another rcu grace period for the
- * bpf_sk_storage_free() to finish.
- */
- synchronize_rcu();
-
- kvfree(smap->buckets);
- kfree(map);
-}
-
-/* U16_MAX is much more than enough for sk local storage
- * considering a tcp_sock is ~2k.
- */
-#define MAX_VALUE_SIZE \
- min_t(u32, \
- (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
- (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
-
-static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
+static void sk_storage_map_free(struct bpf_map *map)
{
- if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
- !(attr->map_flags & BPF_F_NO_PREALLOC) ||
- attr->max_entries ||
- attr->key_size != sizeof(int) || !attr->value_size ||
- /* Enforce BTF for userspace sk dumping */
- !attr->btf_key_type_id || !attr->btf_value_type_id)
- return -EINVAL;
-
- if (!bpf_capable())
- return -EPERM;
+ struct bpf_local_storage_map *smap;
- if (attr->value_size > MAX_VALUE_SIZE)
- return -E2BIG;
-
- return 0;
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
+ bpf_local_storage_map_free(smap);
}
-static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
+static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_sk_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
- u64 cost;
- int ret;
-
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
- cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
-
- ret = bpf_map_charge_init(&smap->map.memory, cost);
- if (ret < 0) {
- kfree(smap);
- return ERR_PTR(ret);
- }
-
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN);
- if (!smap->buckets) {
- bpf_map_charge_finish(&smap->map.memory);
- kfree(smap);
- return ERR_PTR(-ENOMEM);
- }
+ struct bpf_local_storage_map *smap;
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
- }
-
- smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
- smap->cache_idx = cache_idx_get();
+ smap = bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+ smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
return &smap->map;
}
@@ -719,26 +122,9 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
- const struct btf *btf,
- const struct btf_type *key_type,
- const struct btf_type *value_type)
-{
- u32 int_data;
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
- return -EINVAL;
-
- return 0;
-}
-
static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
struct socket *sock;
int fd, err;
@@ -756,14 +142,16 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- sdata = sk_storage_update(sock->sk, map, value, map_flags);
+ sdata = bpf_local_storage_update(
+ sock->sk, (struct bpf_local_storage_map *)map, value,
+ map_flags);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -787,14 +175,14 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
bpf_sk_storage_clone_elem(struct sock *newsk,
- struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
{
- struct bpf_sk_storage_elem *copy_selem;
+ struct bpf_local_storage_elem *copy_selem;
- copy_selem = selem_alloc(smap, newsk, NULL, true);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
if (!copy_selem)
return NULL;
@@ -810,9 +198,9 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
{
- struct bpf_sk_storage *new_sk_storage = NULL;
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage *new_sk_storage = NULL;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
int ret = 0;
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
@@ -824,8 +212,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
goto out;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
- struct bpf_sk_storage_elem *copy_selem;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_elem *copy_selem;
+ struct bpf_local_storage_map *smap;
struct bpf_map *map;
smap = rcu_dereference(SDATA(selem)->smap);
@@ -833,7 +221,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
continue;
/* Note that for lockless listeners adding new element
- * here can race with cleanup in bpf_sk_storage_map_free.
+ * here can race with cleanup in bpf_local_storage_map_free.
* Try to grab map refcnt to make sure that it's still
* alive and prevent concurrent removal.
*/
@@ -849,10 +237,10 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
}
if (new_sk_storage) {
- selem_link_map(smap, copy_selem);
- __selem_link_sk(new_sk_storage, copy_selem);
+ bpf_selem_link_map(smap, copy_selem);
+ bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
} else {
- ret = sk_storage_alloc(newsk, smap, copy_selem);
+ ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
if (ret) {
kfree(copy_selem);
atomic_sub(smap->elem_size,
@@ -861,7 +249,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
goto out;
}
- new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+ new_sk_storage =
+ rcu_dereference(copy_selem->local_storage);
}
bpf_map_put(map);
}
@@ -879,7 +268,7 @@ out:
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
if (flags > BPF_SK_STORAGE_GET_F_CREATE)
return (unsigned long)NULL;
@@ -895,7 +284,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
* destruction).
*/
refcount_inc_not_zero(&sk->sk_refcnt)) {
- sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
+ sdata = bpf_local_storage_update(
+ sk, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -920,18 +311,44 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
return -ENOENT;
}
+static int sk_storage_charge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ return omem_charge(owner, size);
+}
+
+static void sk_storage_uncharge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = owner;
+
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+static struct bpf_local_storage __rcu **
+sk_storage_ptr(void *owner)
+{
+ struct sock *sk = owner;
+
+ return &sk->sk_bpf_storage;
+}
+
static int sk_storage_map_btf_id;
const struct bpf_map_ops sk_storage_map_ops = {
- .map_alloc_check = bpf_sk_storage_map_alloc_check,
- .map_alloc = bpf_sk_storage_map_alloc,
- .map_free = bpf_sk_storage_map_free,
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = sk_storage_map_alloc,
+ .map_free = sk_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
.map_update_elem = bpf_fd_sk_storage_update_elem,
.map_delete_elem = bpf_fd_sk_storage_delete_elem,
- .map_check_btf = bpf_sk_storage_map_check_btf,
- .map_btf_name = "bpf_sk_storage_map",
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_name = "bpf_local_storage_map",
.map_btf_id = &sk_storage_map_btf_id,
+ .map_local_storage_charge = sk_storage_charge,
+ .map_local_storage_uncharge = sk_storage_uncharge,
+ .map_owner_storage_ptr = sk_storage_ptr,
};
const struct bpf_func_proto bpf_sk_storage_get_proto = {
@@ -962,6 +379,30 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.arg2_type = ARG_PTR_TO_SOCKET,
};
+BTF_ID_LIST(sk_storage_btf_ids)
+BTF_ID_UNUSED
+BTF_ID(struct, sock)
+
+const struct bpf_func_proto sk_storage_get_btf_proto = {
+ .func = bpf_sk_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+ .btf_id = sk_storage_btf_ids,
+};
+
+const struct bpf_func_proto sk_storage_delete_btf_proto = {
+ .func = bpf_sk_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .btf_id = sk_storage_btf_ids,
+};
+
struct bpf_sk_storage_diag {
u32 nr_maps;
struct bpf_map *maps[];
@@ -1022,7 +463,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
u32 nr_maps = 0;
int rem, err;
- /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
+ /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as
* the map_alloc_check() side also does.
*/
if (!bpf_capable())
@@ -1072,13 +513,13 @@ err_free:
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
-static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
+static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
{
struct nlattr *nla_stg, *nla_value;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_map *smap;
/* It cannot exceed max nlattr's payload */
- BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
+ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
if (!nla_stg)
@@ -1114,9 +555,9 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
{
/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
unsigned int diag_size = nla_total_size(0);
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
struct nlattr *nla_stgs;
unsigned int saved_len;
int err = 0;
@@ -1169,8 +610,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
{
/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
unsigned int diag_size = nla_total_size(0);
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_data *sdata;
struct nlattr *nla_stgs;
unsigned int saved_len;
int err = 0;
@@ -1197,8 +638,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
saved_len = skb->len;
for (i = 0; i < diag->nr_maps; i++) {
- sdata = __sk_storage_lookup(sk_storage,
- (struct bpf_sk_storage_map *)diag->maps[i],
+ sdata = bpf_local_storage_lookup(sk_storage,
+ (struct bpf_local_storage_map *)diag->maps[i],
false);
if (!sdata)
@@ -1235,19 +676,19 @@ struct bpf_iter_seq_sk_storage_map_info {
unsigned skip_elems;
};
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
- struct bpf_sk_storage_elem *prev_selem)
+ struct bpf_local_storage_elem *prev_selem)
{
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
u32 skip_elems = info->skip_elems;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_map *smap;
u32 bucket_id = info->bucket_id;
u32 i, count, n_buckets;
- struct bucket *b;
+ struct bpf_local_storage_map_bucket *b;
- smap = (struct bpf_sk_storage_map *)info->map;
+ smap = (struct bpf_local_storage_map *)info->map;
n_buckets = 1U << smap->bucket_log;
if (bucket_id >= n_buckets)
return NULL;
@@ -1257,7 +698,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
count = 0;
while (selem) {
selem = hlist_entry_safe(selem->map_node.next,
- struct bpf_sk_storage_elem, map_node);
+ struct bpf_local_storage_elem, map_node);
if (!selem) {
/* not found, unlock and go to the next bucket */
b = &smap->buckets[bucket_id++];
@@ -1265,7 +706,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
skip_elems = 0;
break;
}
- sk_storage = rcu_dereference_raw(selem->sk_storage);
+ sk_storage = rcu_dereference_raw(selem->local_storage);
if (sk_storage) {
info->skip_elems = skip_elems + count;
return selem;
@@ -1278,7 +719,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
raw_spin_lock_bh(&b->lock);
count = 0;
hlist_for_each_entry(selem, &b->list, map_node) {
- sk_storage = rcu_dereference_raw(selem->sk_storage);
+ sk_storage = rcu_dereference_raw(selem->local_storage);
if (sk_storage && count >= skip_elems) {
info->bucket_id = i;
info->skip_elems = count;
@@ -1297,7 +738,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage_elem *selem;
selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
if (!selem)
@@ -1330,11 +771,11 @@ DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
void *value)
static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
- struct bpf_sk_storage_elem *selem)
+ struct bpf_local_storage_elem *selem)
{
struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
struct bpf_iter__bpf_sk_storage_map ctx = {};
- struct bpf_sk_storage *sk_storage;
+ struct bpf_local_storage *sk_storage;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int ret = 0;
@@ -1345,8 +786,8 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
ctx.meta = &meta;
ctx.map = info->map;
if (selem) {
- sk_storage = rcu_dereference_raw(selem->sk_storage);
- ctx.sk = sk_storage->sk;
+ sk_storage = rcu_dereference_raw(selem->local_storage);
+ ctx.sk = sk_storage->owner;
ctx.value = SDATA(selem)->data;
}
ret = bpf_iter_run_prog(prog, &ctx);
@@ -1363,13 +804,13 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
if (!v) {
(void)__bpf_sk_storage_map_seq_show(seq, v);
} else {
- smap = (struct bpf_sk_storage_map *)info->map;
+ smap = (struct bpf_local_storage_map *)info->map;
b = &smap->buckets[info->bucket_id];
raw_spin_unlock_bh(&b->lock);
}
@@ -1437,6 +878,8 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
.target = "bpf_sk_storage_map",
.attach_target = bpf_iter_attach_map,
.detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 639745d4f3b9..9fcaa544f11a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -623,10 +623,11 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
+ struct page *last_head = NULL;
size_t start;
ssize_t copied;
unsigned long truesize;
- int n = 0;
+ int refs, n = 0;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
@@ -649,13 +650,37 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
} else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
}
- while (copied) {
+ for (refs = 0; copied != 0; start = 0) {
int size = min_t(int, copied, PAGE_SIZE - start);
- skb_fill_page_desc(skb, frag++, pages[n], start, size);
- start = 0;
+ struct page *head = compound_head(pages[n]);
+
+ start += (pages[n] - head) << PAGE_SHIFT;
copied -= size;
n++;
+ if (frag) {
+ skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
+
+ if (head == skb_frag_page(last) &&
+ start == skb_frag_off(last) + skb_frag_size(last)) {
+ skb_frag_size_add(last, size);
+ /* We combined this page, we need to release
+ * a reference. Since compound pages refcount
+ * is shared among many pages, batch the refcount
+ * adjustments to limit false sharing.
+ */
+ last_head = head;
+ refs++;
+ continue;
+ }
+ }
+ if (refs) {
+ page_ref_sub(last_head, refs);
+ refs = 0;
+ }
+ skb_fill_page_desc(skb, frag++, head, start, size);
}
+ if (refs)
+ page_ref_sub(last_head, refs);
}
return 0;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 266073e300b5..38a172a63318 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -98,6 +98,7 @@
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
+#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
@@ -1130,7 +1131,7 @@ EXPORT_SYMBOL(__dev_get_by_flags);
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
@@ -5192,7 +5193,7 @@ skip_classify:
}
}
- if (unlikely(skb_vlan_tag_present(skb))) {
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
@@ -5621,17 +5622,60 @@ static void flush_backlog(struct work_struct *work)
local_bh_enable();
}
+static bool flush_required(int cpu)
+{
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
+
+ local_irq_disable();
+ rps_lock(sd);
+
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
+ */
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ rps_unlock(sd);
+ local_irq_enable();
+
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end-up
+ * containing a lot of packets.
+ */
+ return true;
+}
+
static void flush_all_backlogs(void)
{
+ static cpumask_t flush_cpus;
unsigned int cpu;
+ /* since we are under rtnl lock protection we can use static data
+ * for the cpumask and avoid allocating on stack the possibly
+ * large mask
+ */
+ ASSERT_RTNL();
+
get_online_cpus();
- for_each_online_cpu(cpu)
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
+ cpumask_clear(&flush_cpus);
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+ cpumask_set_cpu(cpu, &flush_cpus);
+ }
+ }
- for_each_online_cpu(cpu)
+ /* we can have in flight packet[s] on the cpus we are not flushing,
+ * synchronize_net() in rollback_registered_many() will take care of
+ * them
+ */
+ for_each_cpu(cpu, &flush_cpus)
flush_work(per_cpu_ptr(&flush_works, cpu));
put_online_cpus();
@@ -6293,7 +6337,7 @@ EXPORT_SYMBOL(__napi_schedule);
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable
+ * it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
@@ -6533,8 +6577,7 @@ EXPORT_SYMBOL(napi_busy_loop);
static void napi_hash_add(struct napi_struct *napi)
{
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
spin_lock(&napi_hash_lock);
@@ -6555,20 +6598,14 @@ static void napi_hash_add(struct napi_struct *napi)
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-bool napi_hash_del(struct napi_struct *napi)
+static void napi_hash_del(struct napi_struct *napi)
{
- bool rcu_sync_needed = false;
-
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
- rcu_sync_needed = true;
- hlist_del_rcu(&napi->napi_hash_node);
- }
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
spin_unlock(&napi_hash_lock);
- return rcu_sync_needed;
}
-EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
@@ -6600,7 +6637,11 @@ static void init_gro_hash(struct napi_struct *napi)
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
INIT_LIST_HEAD(&napi->poll_list);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
@@ -6653,18 +6694,19 @@ static void flush_gro_hash(struct napi_struct *napi)
}
/* Must be called in process context */
-void netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del(struct napi_struct *napi)
{
- might_sleep();
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ napi_hash_del(napi);
+ list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
flush_gro_hash(napi);
napi->gro_bitmask = 0;
}
-EXPORT_SYMBOL(netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
@@ -9470,7 +9512,7 @@ int __netdev_update_features(struct net_device *dev)
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
- /* some features can't be enabled if they're off an an upper device */
+ /* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
@@ -9986,10 +10028,12 @@ EXPORT_SYMBOL(netdev_refcnt_read);
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
- int refcnt;
+ int wait = 0, refcnt;
linkwatch_forget_dev(dev);
@@ -10023,7 +10067,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
rebroadcast_time = jiffies;
}
- msleep(250);
+ if (!wait) {
+ rcu_barrier();
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
refcnt = netdev_refcnt_read(dev);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 80ec1cd81c64..045468390480 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -523,15 +523,20 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return -EMSGSIZE;
switch (devlink_port->attrs.flavour) {
case DEVLINK_PORT_FLAVOUR_PCI_PF:
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
- attrs->pci_pf.pf))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_pf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
- attrs->pci_vf.pf) ||
- nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
- attrs->pci_vf.vf))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_vf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
@@ -3017,9 +3022,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
static int devlink_nl_flash_update_fill(struct sk_buff *msg,
struct devlink *devlink,
enum devlink_command cmd,
- const char *status_msg,
- const char *component,
- unsigned long done, unsigned long total)
+ struct devlink_flash_notify *params)
{
void *hdr;
@@ -3033,19 +3036,22 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg,
if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
goto out;
- if (status_msg &&
+ if (params->status_msg &&
nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
- status_msg))
+ params->status_msg))
goto nla_put_failure;
- if (component &&
+ if (params->component &&
nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
- component))
+ params->component))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
- done, DEVLINK_ATTR_PAD))
+ params->done, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
- total, DEVLINK_ATTR_PAD))
+ params->total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
+ params->timeout, DEVLINK_ATTR_PAD))
goto nla_put_failure;
out:
@@ -3059,10 +3065,7 @@ nla_put_failure:
static void __devlink_flash_update_notify(struct devlink *devlink,
enum devlink_command cmd,
- const char *status_msg,
- const char *component,
- unsigned long done,
- unsigned long total)
+ struct devlink_flash_notify *params)
{
struct sk_buff *msg;
int err;
@@ -3075,8 +3078,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink,
if (!msg)
return;
- err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
- component, done, total);
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, params);
if (err)
goto out_free_msg;
@@ -3090,17 +3092,21 @@ out_free_msg:
void devlink_flash_update_begin_notify(struct devlink *devlink)
{
+ struct devlink_flash_notify params = { 0 };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE,
- NULL, NULL, 0, 0);
+ &params);
}
EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
void devlink_flash_update_end_notify(struct devlink *devlink)
{
+ struct devlink_flash_notify params = { 0 };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_END,
- NULL, NULL, 0, 0);
+ &params);
}
EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
@@ -3110,12 +3116,36 @@ void devlink_flash_update_status_notify(struct devlink *devlink,
unsigned long done,
unsigned long total)
{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .done = done,
+ .total = total,
+ };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_STATUS,
- status_msg, component, done, total);
+ &params);
}
EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+void devlink_flash_update_timeout_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long timeout)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .timeout = timeout,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
@@ -4317,7 +4347,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
}
}
- err = region->ops->snapshot(devlink, info->extack, &data);
+ err = region->ops->snapshot(devlink, region->ops, info->extack, &data);
if (err)
goto err_snapshot_capture;
@@ -5895,6 +5925,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
list_for_each_entry(devlink, &devlink_list, list) {
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
continue;
+ mutex_lock(&devlink->lock);
list_for_each_entry(port, &devlink->port_list, list) {
mutex_lock(&port->reporters_lock);
list_for_each_entry(reporter, &port->reporter_list, list) {
@@ -5909,12 +5940,14 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
NLM_F_MULTI);
if (err) {
mutex_unlock(&port->reporters_lock);
+ mutex_unlock(&devlink->lock);
goto out;
}
idx++;
}
mutex_unlock(&port->reporters_lock);
}
+ mutex_unlock(&devlink->lock);
}
out:
mutex_unlock(&devlink_mutex);
@@ -6088,6 +6121,28 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
return 0;
}
+static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->test) {
+ devlink_health_reporter_put(reporter);
+ return -EOPNOTSUPP;
+ }
+
+ err = reporter->ops->test(reporter, info->extack);
+
+ devlink_health_reporter_put(reporter);
+ return err;
+}
+
struct devlink_stats {
u64 rx_bytes;
u64 rx_packets;
@@ -7309,6 +7364,14 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_NO_LOCK,
},
{
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_health_reporter_test_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT |
+ DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
.cmd = DEVLINK_CMD_FLASH_UPDATE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_flash_update,
@@ -7555,11 +7618,11 @@ int devlink_port_register(struct devlink *devlink,
devlink_port->index = port_index;
devlink_port->registered = true;
spin_lock_init(&devlink_port->type_lock);
+ INIT_LIST_HEAD(&devlink_port->reporter_list);
+ mutex_init(&devlink_port->reporters_lock);
list_add_tail(&devlink_port->list, &devlink->port_list);
INIT_LIST_HEAD(&devlink_port->param_list);
mutex_unlock(&devlink->lock);
- INIT_LIST_HEAD(&devlink_port->reporter_list);
- mutex_init(&devlink_port->reporters_lock);
INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
@@ -7576,13 +7639,13 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
- WARN_ON(!list_empty(&devlink_port->reporter_list));
- mutex_destroy(&devlink_port->reporters_lock);
devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
mutex_lock(&devlink->lock);
list_del(&devlink_port->list);
mutex_unlock(&devlink->lock);
+ WARN_ON(!list_empty(&devlink_port->reporter_list));
+ mutex_destroy(&devlink_port->reporters_lock);
}
EXPORT_SYMBOL_GPL(devlink_port_unregister);
@@ -7600,14 +7663,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
-/**
- * devlink_port_type_eth_set - Set port type to Ethernet
- *
- * @devlink_port: devlink port
- * @netdev: related netdevice
- */
-void devlink_port_type_eth_set(struct devlink_port *devlink_port,
- struct net_device *netdev)
+static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
+ struct net_device *netdev)
{
const struct net_device_ops *ops = netdev->netdev_ops;
@@ -7641,6 +7698,24 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port,
err = ops->ndo_get_port_parent_id(netdev, &ppid);
WARN_ON(err != -EOPNOTSUPP);
}
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ if (netdev)
+ devlink_port_type_netdev_checks(devlink_port, netdev);
+ else
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
+
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
@@ -7712,9 +7787,12 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
* devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
*
* @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
*/
-void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf)
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
@@ -7723,8 +7801,9 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf)
DEVLINK_PORT_FLAVOUR_PCI_PF);
if (ret)
return;
-
+ attrs->pci_pf.controller = controller;
attrs->pci_pf.pf = pf;
+ attrs->pci_pf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
@@ -7732,11 +7811,13 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
* devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
*
* @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @vf: associated VF of a PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
*/
-void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
- u16 pf, u16 vf)
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u16 vf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
@@ -7745,8 +7826,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_VF);
if (ret)
return;
+ attrs->pci_vf.controller = controller;
attrs->pci_vf.pf = pf;
attrs->pci_vf.vf = vf;
+ attrs->pci_vf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
@@ -7777,9 +7860,23 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
WARN_ON(1);
return -EINVAL;
case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (attrs->pci_pf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (attrs->pci_vf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%uvf%u",
attrs->pci_vf.pf, attrs->pci_vf.vf);
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index 21eaf3b182f2..08f577114acc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4459,6 +4459,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
} else {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
if (optlen != sizeof(int))
return -EINVAL;
@@ -4480,6 +4481,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
tp->snd_ssthresh = val;
}
break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
case TCP_SAVE_SYN:
if (val < 0 || val > 1)
ret = -EINVAL;
@@ -4550,9 +4565,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
tp = tcp_sk(sk);
if (optlen <= 0 || !tp->saved_syn ||
- optlen > tp->saved_syn[0])
+ optlen > tcp_saved_syn_len(tp->saved_syn))
goto err_clear;
- memcpy(optval, tp->saved_syn + 1, optlen);
+ memcpy(optval, tp->saved_syn->data, optlen);
break;
default:
goto err_clear;
@@ -4654,9 +4669,99 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
+ int optname, const u8 **start)
+{
+ struct sk_buff *syn_skb = bpf_sock->syn_skb;
+ const u8 *hdr_start;
+ int ret;
+
+ if (syn_skb) {
+ /* sk is a request_sock here */
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = syn_skb->data;
+ ret = tcp_hdrlen(syn_skb);
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = skb_network_header(syn_skb);
+ ret = skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+ hdr_start = skb_mac_header(syn_skb);
+ ret = skb_mac_header_len(syn_skb) +
+ skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ }
+ } else {
+ struct sock *sk = bpf_sock->sk;
+ struct saved_syn *saved_syn;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ /* synack retransmit. bpf_sock->syn_skb will
+ * not be available. It has to resort to
+ * saved_syn (if it is saved).
+ */
+ saved_syn = inet_reqsk(sk)->saved_syn;
+ else
+ saved_syn = tcp_sk(sk)->saved_syn;
+
+ if (!saved_syn)
+ return -ENOENT;
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen;
+ ret = saved_syn->tcp_hdrlen;
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen;
+ ret = saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+
+ /* TCP_SAVE_SYN may not have saved the mac hdr */
+ if (!saved_syn->mac_hdrlen)
+ return -ENOENT;
+
+ hdr_start = saved_syn->data;
+ ret = saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ }
+ }
+
+ *start = hdr_start;
+ return ret;
+}
+
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
+ optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
+ int ret, copy_len = 0;
+ const u8 *start;
+
+ ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
+ if (ret > 0) {
+ copy_len = ret;
+ if (optlen < copy_len) {
+ copy_len = optlen;
+ ret = -ENOSPC;
+ }
+
+ memcpy(optval, start, copy_len);
+ }
+
+ /* Zero out unused buffer at the end */
+ memset(optval + copy_len, 0, optlen - copy_len);
+
+ return ret;
+ }
+
return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -6151,6 +6256,232 @@ static const struct bpf_func_proto bpf_sk_assign_proto = {
.arg3_type = ARG_ANYTHING,
};
+static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
+ u8 search_kind, const u8 *magic,
+ u8 magic_len, bool *eol)
+{
+ u8 kind, kind_len;
+
+ *eol = false;
+
+ while (op < opend) {
+ kind = op[0];
+
+ if (kind == TCPOPT_EOL) {
+ *eol = true;
+ return ERR_PTR(-ENOMSG);
+ } else if (kind == TCPOPT_NOP) {
+ op++;
+ continue;
+ }
+
+ if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
+ /* Something is wrong in the received header.
+ * Follow the TCP stack's tcp_parse_options()
+ * and just bail here.
+ */
+ return ERR_PTR(-EFAULT);
+
+ kind_len = op[1];
+ if (search_kind == kind) {
+ if (!magic_len)
+ return op;
+
+ if (magic_len > kind_len - 2)
+ return ERR_PTR(-ENOMSG);
+
+ if (!memcmp(&op[2], magic, magic_len))
+ return op;
+ }
+
+ op += kind_len;
+ }
+
+ return ERR_PTR(-ENOMSG);
+}
+
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ void *, search_res, u32, len, u64, flags)
+{
+ bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
+ const u8 *op, *opend, *magic, *search = search_res;
+ u8 search_kind, search_len, copy_len, magic_len;
+ int ret;
+
+ /* 2 byte is the minimal option len except TCPOPT_NOP and
+ * TCPOPT_EOL which are useless for the bpf prog to learn
+ * and this helper disallow loading them also.
+ */
+ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
+ return -EINVAL;
+
+ search_kind = search[0];
+ search_len = search[1];
+
+ if (search_len > len || search_kind == TCPOPT_NOP ||
+ search_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (search_kind == TCPOPT_EXP || search_kind == 253) {
+ /* 16 or 32 bit magic. +2 for kind and kind length */
+ if (search_len != 4 && search_len != 6)
+ return -EINVAL;
+ magic = &search[2];
+ magic_len = search_len - 2;
+ } else {
+ if (search_len)
+ return -EINVAL;
+ magic = NULL;
+ magic_len = 0;
+ }
+
+ if (load_syn) {
+ ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
+ if (ret < 0)
+ return ret;
+
+ opend = op + ret;
+ op += sizeof(struct tcphdr);
+ } else {
+ if (!bpf_sock->skb ||
+ bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ /* This bpf_sock->op cannot call this helper */
+ return -EPERM;
+
+ opend = bpf_sock->skb_data_end;
+ op = bpf_sock->skb->data + sizeof(struct tcphdr);
+ }
+
+ op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
+ &eol);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+
+ copy_len = op[1];
+ ret = copy_len;
+ if (copy_len > len) {
+ ret = -ENOSPC;
+ copy_len = len;
+ }
+
+ memcpy(search_res, op, copy_len);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
+ .func = bpf_sock_ops_load_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ const void *, from, u32, len, u64, flags)
+{
+ u8 new_kind, new_kind_len, magic_len = 0, *opend;
+ const u8 *op, *new_op, *magic = NULL;
+ struct sk_buff *skb;
+ bool eol;
+
+ if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+ return -EPERM;
+
+ if (len < 2 || flags)
+ return -EINVAL;
+
+ new_op = from;
+ new_kind = new_op[0];
+ new_kind_len = new_op[1];
+
+ if (new_kind_len > len || new_kind == TCPOPT_NOP ||
+ new_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (new_kind_len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ /* 253 is another experimental kind */
+ if (new_kind == TCPOPT_EXP || new_kind == 253) {
+ if (new_kind_len < 4)
+ return -EINVAL;
+ /* Match for the 2 byte magic also.
+ * RFC 6994: the magic could be 2 or 4 bytes.
+ * Hence, matching by 2 byte only is on the
+ * conservative side but it is the right
+ * thing to do for the 'search-for-duplication'
+ * purpose.
+ */
+ magic = &new_op[2];
+ magic_len = 2;
+ }
+
+ /* Check for duplication */
+ skb = bpf_sock->skb;
+ op = skb->data + sizeof(struct tcphdr);
+ opend = bpf_sock->skb_data_end;
+
+ op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
+ &eol);
+ if (!IS_ERR(op))
+ return -EEXIST;
+
+ if (PTR_ERR(op) != -ENOMSG)
+ return PTR_ERR(op);
+
+ if (eol)
+ /* The option has been ended. Treat it as no more
+ * header option can be written.
+ */
+ return -ENOSPC;
+
+ /* No duplication found. Store the header option. */
+ memcpy(opend, from, new_kind_len);
+
+ bpf_sock->remaining_opt_len -= new_kind_len;
+ bpf_sock->skb_data_end += new_kind_len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+ .func = bpf_sock_ops_store_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ u32, len, u64, flags)
+{
+ if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ return -EPERM;
+
+ if (flags || len < 2)
+ return -EINVAL;
+
+ if (len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ bpf_sock->remaining_opt_len -= len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
+ .func = bpf_sock_ops_reserve_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -6180,6 +6511,9 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
+#ifdef CONFIG_INET
+ func == bpf_sock_ops_store_hdr_opt ||
+#endif
func == bpf_lwt_in_push_encap ||
func == bpf_lwt_xmit_push_encap)
return true;
@@ -6551,6 +6885,12 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET
+ case BPF_FUNC_load_hdr_opt:
+ return &bpf_sock_ops_load_hdr_opt_proto;
+ case BPF_FUNC_store_hdr_opt:
+ return &bpf_sock_ops_store_hdr_opt_proto;
+ case BPF_FUNC_reserve_hdr_opt:
+ return &bpf_sock_ops_reserve_hdr_opt_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
@@ -7350,6 +7690,20 @@ static bool sock_ops_is_valid_access(int off, int size,
return false;
info->reg_type = PTR_TO_SOCKET_OR_NULL;
break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size,
+ size_default);
default:
if (size != size_default)
return false;
@@ -8451,17 +8805,22 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
switch (si->off) {
- case offsetof(struct bpf_sock_ops, op) ...
+ case offsetof(struct bpf_sock_ops, op):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ op),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, op));
+ break;
+
+ case offsetof(struct bpf_sock_ops, replylong[0]) ...
offsetof(struct bpf_sock_ops, replylong[3]):
- BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) !=
- sizeof_field(struct bpf_sock_ops_kern, op));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
sizeof_field(struct bpf_sock_ops_kern, reply));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
sizeof_field(struct bpf_sock_ops_kern, replylong));
off = si->off;
- off -= offsetof(struct bpf_sock_ops, op);
- off += offsetof(struct bpf_sock_ops_kern, op);
+ off -= offsetof(struct bpf_sock_ops, replylong[0]);
+ off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
off);
@@ -8682,6 +9041,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, sk):
SOCK_OPS_GET_SK();
break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb_data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb_data_end));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, data));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_len):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, len));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ off = offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, tcp_flags);
+ *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
+ tcp_flags),
+ si->dst_reg, si->dst_reg, off);
+ break;
}
return insn - insn_buf;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 6bbd06f7dc7d..c714e6a9dad4 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -116,6 +116,12 @@ static int dev_seq_show(struct seq_file *seq, void *v)
return 0;
}
+static u32 softnet_backlog_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->input_pkt_queue) +
+ skb_queue_len_lockless(&sd->process_queue);
+}
+
static struct softnet_data *softnet_get_online(loff_t *pos)
{
struct softnet_data *sd = NULL;
@@ -159,12 +165,17 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
+ /* the index is the CPU id owing this sd. Since offline CPUs are not
+ * displayed, it would be othrwise not trivial for the user-space
+ * mapping the data a specific CPU
+ */
seq_printf(seq,
- "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count);
+ sd->received_rps, flow_limit_count,
+ softnet_backlog_len(sd), (int)seq->index);
return 0;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 2338753e936b..c310c7c1cef7 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -297,7 +297,7 @@ static int netpoll_owner_active(struct net_device *dev)
{
struct napi_struct *napi;
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner == smp_processor_id())
return 1;
}
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index d964a5147f22..e33fde06d528 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -107,6 +107,36 @@ unsigned int ptp_classify_raw(const struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
+struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type)
+{
+ u8 *ptr = skb_mac_header(skb);
+
+ if (type & PTP_CLASS_VLAN)
+ ptr += VLAN_HLEN;
+
+ switch (type & PTP_CLASS_PMASK) {
+ case PTP_CLASS_IPV4:
+ ptr += IPV4_HLEN(ptr) + UDP_HLEN;
+ break;
+ case PTP_CLASS_IPV6:
+ ptr += IP6_HLEN + UDP_HLEN;
+ break;
+ case PTP_CLASS_L2:
+ break;
+ default:
+ return NULL;
+ }
+
+ ptr += ETH_HLEN;
+
+ /* Ensure that the entire header is present in this packet. */
+ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len)
+ return NULL;
+
+ return (struct ptp_header *)ptr;
+}
+EXPORT_SYMBOL_GPL(ptp_parse_header);
+
void __init ptp_classifier_init(void)
{
static struct sock_filter ptp_filter[] __initdata = {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6faf73d6a0f7..e0774471f56d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -895,9 +895,6 @@ void __kfree_skb_defer(struct sk_buff *skb)
void napi_consume_skb(struct sk_buff *skb, int budget)
{
- if (unlikely(!skb))
- return;
-
/* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget)) {
dev_consume_skb_any(skb);
@@ -5955,8 +5952,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
size = SKB_WITH_OVERHEAD(ksize(data));
memcpy((struct skb_shared_info *)(data + size),
- skb_shinfo(skb), offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
+ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
kfree(data);
return -ENOMEM;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 649583158983..4b5f7c8fecd1 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -494,14 +494,34 @@ end:
struct sk_psock *sk_psock_init(struct sock *sk, int node)
{
- struct sk_psock *psock = kzalloc_node(sizeof(*psock),
- GFP_ATOMIC | __GFP_NOWARN,
- node);
- if (!psock)
- return NULL;
+ struct sk_psock *psock;
+ struct proto *prot;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ if (inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ if (sk->sk_user_data) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+ psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
+ if (!psock) {
+ psock = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ prot = READ_ONCE(sk->sk_prot);
psock->sk = sk;
- psock->eval = __SK_NONE;
+ psock->eval = __SK_NONE;
+ psock->sk_proto = prot;
+ psock->saved_unhash = prot->unhash;
+ psock->saved_close = prot->close;
+ psock->saved_write_space = sk->sk_write_space;
INIT_LIST_HEAD(&psock->link);
spin_lock_init(&psock->link_lock);
@@ -516,6 +536,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
rcu_assign_sk_user_data_nocopy(sk, psock);
sock_hold(sk);
+out:
+ write_unlock_bh(&sk->sk_callback_lock);
return psock;
}
EXPORT_SYMBOL_GPL(sk_psock_init);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c5c6b18eff4..ba9e7d91e2ef 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -413,18 +413,6 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
return 0;
}
-static void sock_warn_obsolete_bsdism(const char *name)
-{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
- warncomm, name);
- warned++;
- }
-}
-
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -984,7 +972,6 @@ set_sndbuf:
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
break;
case SO_PASSCRED:
@@ -1387,7 +1374,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
break;
case SO_TIMESTAMP_OLD:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 119f52a99dc1..078386d7d9a2 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -184,8 +184,6 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
struct proto *prot;
- sock_owned_by_me(sk);
-
switch (sk->sk_type) {
case SOCK_STREAM:
prot = tcp_bpf_get_proto(sk, psock);
@@ -272,8 +270,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
}
} else {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock) {
- ret = -ENOMEM;
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
goto out_progs;
}
}
@@ -322,8 +320,8 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
if (!psock) {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock)
- return -ENOMEM;
+ if (IS_ERR(psock))
+ return PTR_ERR(psock);
}
ret = sock_map_init_proto(sk, psock);
@@ -478,8 +476,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -563,10 +559,12 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return false;
}
-static int sock_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags);
+
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
- u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
@@ -595,14 +593,38 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
sock_map_sk_acquire(sk);
if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
else
- ret = sock_map_update_common(map, idx, sk, flags);
+ ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
return ret;
}
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct sock *sk = (struct sock *)value;
+ int ret;
+
+ if (!sock_map_sk_is_suitable(sk))
+ return -EOPNOTSUPP;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ return ret;
+}
+
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
struct bpf_map *, map, void *, key, u64, flags)
{
@@ -683,6 +705,7 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
static int sock_map_btf_id;
const struct bpf_map_ops sock_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
.map_get_next_key = sock_map_get_next_key,
@@ -855,8 +878,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -915,45 +936,6 @@ out_free:
return ret;
}
-static int sock_hash_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
-{
- struct socket *sock;
- struct sock *sk;
- int ret;
- u64 ufd;
-
- if (map->value_size == sizeof(u64))
- ufd = *(u64 *)value;
- else
- ufd = *(u32 *)value;
- if (ufd > S32_MAX)
- return -EINVAL;
-
- sock = sockfd_lookup(ufd, &ret);
- if (!sock)
- return ret;
- sk = sock->sk;
- if (!sk) {
- ret = -EINVAL;
- goto out;
- }
- if (!sock_map_sk_is_suitable(sk)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- sock_map_sk_acquire(sk);
- if (!sock_map_sk_state_allowed(sk))
- ret = -EOPNOTSUPP;
- else
- ret = sock_hash_update_common(map, key, sk, flags);
- sock_map_sk_release(sk);
-out:
- fput(sock->file);
- return ret;
-}
-
static int sock_hash_get_next_key(struct bpf_map *map, void *key,
void *key_next)
{
@@ -1219,10 +1201,11 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
static int sock_hash_map_btf_id;
const struct bpf_map_ops sock_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_hash_alloc,
.map_free = sock_hash_free,
.map_get_next_key = sock_hash_get_next_key,
- .map_update_elem = sock_hash_update_elem,
+ .map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_hash_delete_elem,
.map_lookup_elem = sock_hash_lookup,
.map_lookup_elem_sys_only = sock_hash_lookup_sys,
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6ada114bbcca..d86d8d11cfe4 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -22,7 +22,7 @@
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
-static int two __maybe_unused = 2;
+static int two = 2;
static int three = 3;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -546,7 +546,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = &two,
},
{
.procname = "devconf_inherit_init_net",
@@ -587,6 +587,19 @@ static struct ctl_table netns_core_table[] = {
{ }
};
+static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
+{
+ /* fallback tunnels for initns only */
+ if (!strncmp(str, "initns", 6))
+ sysctl_fb_tunnels_only_for_init_net = 1;
+ /* no fallback tunnels anywhere */
+ else if (!strncmp(str, "none", 4))
+ sysctl_fb_tunnels_only_for_init_net = 2;
+
+ return 1;
+}
+__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
+
static __net_init int sysctl_core_net_init(struct net *net)
{
struct ctl_table *tbl;
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 0a72510d5de1..8f3dd3b1d2d0 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -274,7 +274,7 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
/**
* dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
* This routine is called when the peer acknowledges the receipt of Ack Vectors
- * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * up to and including @ackno. While based on section A.3 of RFC 4340, here
* are additional precautions to prevent corrupted buffer state. In particular,
* we use tail_ackno to identify outdated records; it always marks the earliest
* packet of group (2) in 11.4.2.
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9c28c8251125..bb3d70664dde 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ inet_sk(sk)->tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
local_bh_disable();
bh_lock_sock(ctl_sk);
err = ip_build_and_send_pkt(skb, ctl_sk,
- rxiph->daddr, rxiph->saddr, NULL);
+ rxiph->daddr, rxiph->saddr, NULL,
+ inet_sk(ctl_sk)->tos);
bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
@@ -731,7 +733,7 @@ int dccp_invalid_packet(struct sk_buff *skb)
return 1;
}
/*
- * If P.Data Offset is too too large for packet, drop packet and return
+ * If P.Data Offset is too large for packet, drop packet and return
*/
if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 0e06dfc32273..927c796d7682 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -85,7 +85,7 @@ static void dccp_retransmit_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
/*
- * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
+ * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
* sent, no need to retransmit, this sock is dead.
*/
if (dccp_write_timeout(sk))
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 1ce9ba8cf545..5c18c0214aac 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -330,11 +330,7 @@ EXPORT_SYMBOL_GPL(call_dsa_notifiers);
int dsa_devlink_param_get(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
{
- struct dsa_devlink_priv *dl_priv;
- struct dsa_switch *ds;
-
- dl_priv = devlink_priv(dl);
- ds = dl_priv->ds;
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
if (!ds->ops->devlink_param_get)
return -EOPNOTSUPP;
@@ -346,11 +342,7 @@ EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
int dsa_devlink_param_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
{
- struct dsa_devlink_priv *dl_priv;
- struct dsa_switch *ds;
-
- dl_priv = devlink_priv(dl);
- ds = dl_priv->ds;
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
if (!ds->ops->devlink_param_set)
return -EOPNOTSUPP;
@@ -412,6 +404,22 @@ void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
}
EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+struct devlink_region *
+dsa_devlink_region_create(struct dsa_switch *ds,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ return devlink_region_create(ds->devlink, ops, region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_create);
+
+void dsa_devlink_region_destroy(struct devlink_region *region)
+{
+ devlink_region_destroy(region);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy);
+
struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
{
if (!netdev || !dsa_slave_dev_check(netdev))
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index c0ffc7a2b65f..3cf67f5fe54a 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -21,9 +21,6 @@
static DEFINE_MUTEX(dsa2_mutex);
LIST_HEAD(dsa_tree_list);
-static const struct devlink_ops dsa_devlink_ops = {
-};
-
struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
{
struct dsa_switch_tree *dst;
@@ -382,6 +379,22 @@ static void dsa_port_teardown(struct dsa_port *dp)
dp->setup = false;
}
+static int dsa_devlink_info_get(struct devlink *dl,
+ struct devlink_info_req *req,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (ds->ops->devlink_info_get)
+ return ds->ops->devlink_info_get(ds, req, extack);
+
+ return -EOPNOTSUPP;
+}
+
+static const struct devlink_ops dsa_devlink_ops = {
+ .info_get = dsa_devlink_info_get,
+};
+
static int dsa_switch_setup(struct dsa_switch *ds)
{
struct dsa_devlink_priv *dl_priv;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 1653e3377cb3..2da656d984ef 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -164,8 +164,6 @@ int dsa_port_vlan_add(struct dsa_port *dp,
struct switchdev_trans *trans);
int dsa_port_vlan_del(struct dsa_port *dp,
const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags);
-int dsa_port_vid_del(struct dsa_port *dp, u16 vid);
int dsa_port_link_register_of(struct dsa_port *dp);
void dsa_port_link_unregister_of(struct dsa_port *dp);
extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index e23ece229c7e..9a4fb80d2731 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -193,11 +193,44 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
}
+/* Must be called under rcu_read_lock() */
static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
bool vlan_filtering)
{
struct dsa_switch *ds = dp->ds;
- int i;
+ int err, i;
+
+ /* VLAN awareness was off, so the question is "can we turn it on".
+ * We may have had 8021q uppers, those need to go. Make sure we don't
+ * enter an inconsistent state: deny changing the VLAN awareness state
+ * as long as we have 8021q uppers.
+ */
+ if (vlan_filtering && dsa_is_user_port(ds, dp->index)) {
+ struct net_device *upper_dev, *slave = dp->slave;
+ struct net_device *br = dp->bridge_dev;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
+ struct bridge_vlan_info br_info;
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ dev_err(ds->dev, "Must remove upper %s first\n",
+ upper_dev->name);
+ return false;
+ }
+ }
+ }
if (!ds->vlan_filtering_is_global)
return true;
@@ -232,15 +265,24 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct dsa_switch *ds = dp->ds;
int err;
- /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
- if (switchdev_trans_ph_prepare(trans))
- return 0;
+ if (switchdev_trans_ph_prepare(trans)) {
+ bool apply;
- if (!ds->ops->port_vlan_filtering)
- return 0;
+ if (!ds->ops->port_vlan_filtering)
+ return -EOPNOTSUPP;
- if (!dsa_port_can_apply_vlan_filtering(dp, vlan_filtering))
- return -EINVAL;
+ /* We are called from dsa_slave_switchdev_blocking_event(),
+ * which is not under rcu_read_lock(), unlike
+ * dsa_slave_switchdev_event().
+ */
+ rcu_read_lock();
+ apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering);
+ rcu_read_unlock();
+ if (!apply)
+ return -EINVAL;
+
+ return 0;
+ }
if (dsa_port_is_vlan_filtering(dp) == vlan_filtering)
return 0;
@@ -433,39 +475,6 @@ int dsa_port_vlan_del(struct dsa_port *dp,
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
}
-int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags)
-{
- struct switchdev_obj_port_vlan vlan = {
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
- };
- struct switchdev_trans trans;
- int err;
-
- trans.ph_prepare = true;
- err = dsa_port_vlan_add(dp, &vlan, &trans);
- if (err)
- return err;
-
- trans.ph_prepare = false;
- return dsa_port_vlan_add(dp, &vlan, &trans);
-}
-EXPORT_SYMBOL(dsa_port_vid_add);
-
-int dsa_port_vid_del(struct dsa_port *dp, u16 vid)
-{
- struct switchdev_obj_port_vlan vlan = {
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .vid_begin = vid,
- .vid_end = vid,
- };
-
- return dsa_port_vlan_del(dp, &vlan);
-}
-EXPORT_SYMBOL(dsa_port_vid_del);
-
static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
{
struct device_node *phy_dn;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 16e5f98d4882..e7c1d62fde99 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -303,13 +303,36 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
return ret;
}
+/* Must be called under rcu_read_lock() */
+static int
+dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct net_device *upper_dev;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+ if (vid >= vlan->vid_begin && vid <= vlan->vid_end)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
static int dsa_slave_vlan_add(struct net_device *dev,
const struct switchdev_obj *obj,
struct switchdev_trans *trans)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
struct switchdev_obj_port_vlan vlan;
- int err;
+ int vid, err;
if (obj->orig_dev != dev)
return -EOPNOTSUPP;
@@ -319,6 +342,17 @@ static int dsa_slave_vlan_add(struct net_device *dev,
vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+ /* Deny adding a bridge VLAN when there is already an 802.1Q upper with
+ * the same VID.
+ */
+ if (trans->ph_prepare && br_vlan_enabled(dp->bridge_dev)) {
+ rcu_read_lock();
+ err = dsa_slave_vlan_check_for_8021q_uppers(dev, &vlan);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
err = dsa_port_vlan_add(dp, &vlan, trans);
if (err)
return err;
@@ -333,6 +367,12 @@ static int dsa_slave_vlan_add(struct net_device *dev,
if (err)
return err;
+ for (vid = vlan.vid_begin; vid <= vlan.vid_end; vid++) {
+ err = vlan_vid_add(master, htons(ETH_P_8021Q), vid);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -376,7 +416,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
static int dsa_slave_vlan_del(struct net_device *dev,
const struct switchdev_obj *obj)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct switchdev_obj_port_vlan *vlan;
+ int vid, err;
if (obj->orig_dev != dev)
return -EOPNOTSUPP;
@@ -384,10 +427,19 @@ static int dsa_slave_vlan_del(struct net_device *dev,
if (dsa_port_skip_vlan_configuration(dp))
return 0;
+ vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
/* Do not deprogram the CPU port as it may be shared with other user
* ports which can be members of this VLAN as well.
*/
- return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+ err = dsa_port_vlan_del(dp, vlan);
+ if (err)
+ return err;
+
+ for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++)
+ vlan_vid_del(master, htons(ETH_P_8021Q), vid);
+
+ return 0;
}
static int dsa_slave_port_obj_del(struct net_device *dev,
@@ -1232,64 +1284,66 @@ static int dsa_slave_get_ts_info(struct net_device *dev,
static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
u16 vid)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct bridge_vlan_info info;
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid_begin = vid,
+ .vid_end = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ struct switchdev_trans trans;
int ret;
- /* Check for a possible bridge VLAN entry now since there is no
- * need to emulate the switchdev prepare + commit phase.
- */
- if (dp->bridge_dev) {
- if (dsa_port_skip_vlan_configuration(dp))
- return 0;
+ /* User port... */
+ trans.ph_prepare = true;
+ ret = dsa_port_vlan_add(dp, &vlan, &trans);
+ if (ret)
+ return ret;
- /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
- * device, respectively the VID is not found, returning
- * 0 means success, which is a failure for us here.
- */
- ret = br_vlan_get_info(dp->bridge_dev, vid, &info);
- if (ret == 0)
- return -EBUSY;
- }
+ trans.ph_prepare = false;
+ ret = dsa_port_vlan_add(dp, &vlan, &trans);
+ if (ret)
+ return ret;
- ret = dsa_port_vid_add(dp, vid, 0);
+ /* And CPU port... */
+ trans.ph_prepare = true;
+ ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans);
if (ret)
return ret;
- ret = dsa_port_vid_add(dp->cpu_dp, vid, 0);
+ trans.ph_prepare = false;
+ ret = dsa_port_vlan_add(dp->cpu_dp, &vlan, &trans);
if (ret)
return ret;
- return 0;
+ return vlan_vid_add(master, proto, vid);
}
static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
u16 vid)
{
+ struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct bridge_vlan_info info;
- int ret;
-
- /* Check for a possible bridge VLAN entry now since there is no
- * need to emulate the switchdev prepare + commit phase.
- */
- if (dp->bridge_dev) {
- if (dsa_port_skip_vlan_configuration(dp))
- return 0;
-
- /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
- * device, respectively the VID is not found, returning
- * 0 means success, which is a failure for us here.
- */
- ret = br_vlan_get_info(dp->bridge_dev, vid, &info);
- if (ret == 0)
- return -EBUSY;
- }
+ struct switchdev_obj_port_vlan vlan = {
+ .vid_begin = vid,
+ .vid_end = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ int err;
/* Do not deprogram the CPU port as it may be shared with other user
* ports which can be members of this VLAN as well.
*/
- return dsa_port_vid_del(dp, vid);
+ err = dsa_port_vlan_del(dp, &vlan);
+ if (err)
+ return err;
+
+ vlan_vid_del(master, proto, vid);
+
+ return 0;
}
struct dsa_hw_port {
@@ -1784,7 +1838,7 @@ int dsa_slave_create(struct dsa_port *port)
rtnl_lock();
ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
rtnl_unlock();
- if (ret)
+ if (ret && ret != -EOPNOTSUPP)
dev_warn(ds->dev, "nonfatal error %d setting MTU on port %d\n",
ret, port->index);
@@ -1792,8 +1846,9 @@ int dsa_slave_create(struct dsa_port *port)
ret = dsa_slave_phy_setup(slave_dev);
if (ret) {
- netdev_err(master, "error %d setting up slave PHY for %s\n",
- ret, slave_dev->name);
+ netdev_err(slave_dev,
+ "error %d setting up PHY for tree %d, switch %d, port %d\n",
+ ret, ds->dst->index, ds->index, port->index);
goto out_gcells;
}
@@ -1880,9 +1935,9 @@ static int dsa_slave_changeupper(struct net_device *dev,
return err;
}
-static int dsa_slave_upper_vlan_check(struct net_device *dev,
- struct netdev_notifier_changeupper_info *
- info)
+static int
+dsa_prevent_bridging_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
{
struct netlink_ext_ack *ext_ack;
struct net_device *slave;
@@ -1912,14 +1967,56 @@ static int dsa_slave_upper_vlan_check(struct net_device *dev,
return NOTIFY_DONE;
}
+static int
+dsa_slave_check_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct net_device *br = dp->bridge_dev;
+ struct bridge_vlan_info br_info;
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+ u16 vid;
+
+ if (!br || !br_vlan_enabled(br))
+ return NOTIFY_DONE;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+ vid = vlan_dev_vlan_id(info->upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "This VLAN is already configured by the bridge");
+ return notifier_from_errno(-EBUSY);
+ }
+
+ return NOTIFY_DONE;
+}
+
static int dsa_slave_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- if (event == NETDEV_CHANGEUPPER) {
+ switch (event) {
+ case NETDEV_PRECHANGEUPPER: {
+ struct netdev_notifier_changeupper_info *info = ptr;
+
+ if (!dsa_slave_dev_check(dev))
+ return dsa_prevent_bridging_8021q_upper(dev, ptr);
+
+ if (is_vlan_dev(info->upper_dev))
+ return dsa_slave_check_8021q_upper(dev, ptr);
+ break;
+ }
+ case NETDEV_CHANGEUPPER:
if (!dsa_slave_dev_check(dev))
- return dsa_slave_upper_vlan_check(dev, ptr);
+ return NOTIFY_DONE;
return dsa_slave_changeupper(dev, ptr);
}
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 86c8dc5c32a0..9afef6f0f9df 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -232,43 +232,6 @@ static int dsa_switch_mdb_del(struct dsa_switch *ds,
return 0;
}
-static int dsa_port_vlan_device_check(struct net_device *vlan_dev,
- int vlan_dev_vid,
- void *arg)
-{
- struct switchdev_obj_port_vlan *vlan = arg;
- u16 vid;
-
- for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) {
- if (vid == vlan_dev_vid)
- return -EBUSY;
- }
-
- return 0;
-}
-
-static int dsa_port_vlan_check(struct dsa_switch *ds, int port,
- const struct switchdev_obj_port_vlan *vlan)
-{
- const struct dsa_port *dp = dsa_to_port(ds, port);
- int err = 0;
-
- /* Device is not bridged, let it proceed with the VLAN device
- * creation.
- */
- if (!dp->bridge_dev)
- return err;
-
- /* dsa_slave_vlan_rx_{add,kill}_vid() cannot use the prepare phase and
- * already checks whether there is an overlapping bridge VLAN entry
- * with the same VID, so here we only need to check that if we are
- * adding a bridge VLAN entry there is not an overlapping VLAN device
- * claiming that VID.
- */
- return vlan_for_each(dp->slave, dsa_port_vlan_device_check,
- (void *)vlan);
-}
-
static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port,
struct dsa_notifier_vlan_info *info)
{
@@ -291,10 +254,6 @@ static int dsa_switch_vlan_prepare(struct dsa_switch *ds,
for (port = 0; port < ds->num_ports; port++) {
if (dsa_switch_vlan_match(ds, port, info)) {
- err = dsa_port_vlan_check(ds, port, info->vlan);
- if (err)
- return err;
-
err = ds->ops->port_vlan_prepare(ds, port, info->vlan);
if (err)
return err;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 780b2a15ac9b..8e3e8a5b8559 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -146,15 +146,15 @@ EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
* user explicitly configured this @vid through the bridge core, then the @vid
* is installed again, but this time with the flags from the bridge layer.
*/
-static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
+static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid,
u16 flags, bool enabled)
{
- struct dsa_port *dp = dsa_to_port(ds, port);
+ struct dsa_port *dp = dsa_to_port(ctx->ds, port);
if (enabled)
- return dsa_port_vid_add(dp, vid, flags);
+ return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags);
- return dsa_port_vid_del(dp, vid);
+ return ctx->ops->vlan_del(ctx->ds, dp->index, vid);
}
/* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single
@@ -209,25 +209,29 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
* +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+
* swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3
*/
-int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
+static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port,
+ bool enabled)
{
- int upstream = dsa_upstream_port(ds, port);
- u16 rx_vid = dsa_8021q_rx_vid(ds, port);
- u16 tx_vid = dsa_8021q_tx_vid(ds, port);
- int i, err;
+ int upstream = dsa_upstream_port(ctx->ds, port);
+ u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
+ u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port);
+ struct net_device *master;
+ int i, err, subvlan;
/* The CPU port is implicitly configured by
* configuring the front-panel ports
*/
- if (!dsa_is_user_port(ds, port))
+ if (!dsa_is_user_port(ctx->ds, port))
return 0;
+ master = dsa_to_port(ctx->ds, port)->cpu_dp->master;
+
/* Add this user port's RX VID to the membership list of all others
* (including itself). This is so that bridging will not be hindered.
* L2 forwarding rules still take precedence when there are no VLAN
* restrictions, so there are no concerns about leaking traffic.
*/
- for (i = 0; i < ds->num_ports; i++) {
+ for (i = 0; i < ctx->ds->num_ports; i++) {
u16 flags;
if (i == upstream)
@@ -240,9 +244,10 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
/* The RX VID is a regular VLAN on all others */
flags = BRIDGE_VLAN_INFO_UNTAGGED;
- err = dsa_8021q_vid_apply(ds, i, rx_vid, flags, enabled);
+ err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply RX VID %d to port %d: %d\n",
rx_vid, port, err);
return err;
}
@@ -251,80 +256,115 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
/* CPU port needs to see this port's RX VID
* as tagged egress.
*/
- err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled);
+ err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply RX VID %d to port %d: %d\n",
rx_vid, port, err);
return err;
}
+ /* Add to the master's RX filter not only @rx_vid, but in fact
+ * the entire subvlan range, just in case this DSA switch might
+ * want to use sub-VLANs.
+ */
+ for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) {
+ u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan);
+
+ if (enabled)
+ vlan_vid_add(master, ctx->proto, vid);
+ else
+ vlan_vid_del(master, ctx->proto, vid);
+ }
+
/* Finally apply the TX VID on this port and on the CPU port */
- err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
+ err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply TX VID %d on port %d: %d\n",
tx_vid, port, err);
return err;
}
- err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled);
+ err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled);
if (err) {
- dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n",
+ dev_err(ctx->ds->dev,
+ "Failed to apply TX VID %d on port %d: %d\n",
tx_vid, upstream, err);
return err;
}
return err;
}
-EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging);
-static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
+int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled)
+{
+ int rc, port;
+
+ ASSERT_RTNL();
+
+ for (port = 0; port < ctx->ds->num_ports; port++) {
+ rc = dsa_8021q_setup_port(ctx, port, enabled);
+ if (rc < 0) {
+ dev_err(ctx->ds->dev,
+ "Failed to setup VLAN tagging for port %d: %d\n",
+ port, rc);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_setup);
+
+static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx,
+ int port,
+ struct dsa_8021q_context *other_ctx,
int other_port, bool enabled)
{
- u16 rx_vid = dsa_8021q_rx_vid(ds, port);
+ u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port);
/* @rx_vid of local @ds port @port goes to @other_port of
* @other_ds
*/
- return dsa_8021q_vid_apply(other_ds, other_port, rx_vid,
+ return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid,
BRIDGE_VLAN_INFO_UNTAGGED, enabled);
}
-static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
- int other_port,
- struct list_head *crosschip_links)
+static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port,
+ struct dsa_8021q_context *other_ctx,
+ int other_port)
{
struct dsa_8021q_crosschip_link *c;
- list_for_each_entry(c, crosschip_links, list) {
- if (c->port == port && c->other_ds == other_ds &&
+ list_for_each_entry(c, &ctx->crosschip_links, list) {
+ if (c->port == port && c->other_ctx == other_ctx &&
c->other_port == other_port) {
refcount_inc(&c->refcount);
return 0;
}
}
- dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n",
- port, dev_name(other_ds->dev), other_port);
+ dev_dbg(ctx->ds->dev,
+ "adding crosschip link from port %d to %s port %d\n",
+ port, dev_name(other_ctx->ds->dev), other_port);
c = kzalloc(sizeof(*c), GFP_KERNEL);
if (!c)
return -ENOMEM;
c->port = port;
- c->other_ds = other_ds;
+ c->other_ctx = other_ctx;
c->other_port = other_port;
refcount_set(&c->refcount, 1);
- list_add(&c->list, crosschip_links);
+ list_add(&c->list, &ctx->crosschip_links);
return 0;
}
-static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
+static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx,
struct dsa_8021q_crosschip_link *c,
- struct list_head *crosschip_links,
bool *keep)
{
*keep = !refcount_dec_and_test(&c->refcount);
@@ -332,9 +372,9 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
if (*keep)
return;
- dev_dbg(ds->dev,
+ dev_dbg(ctx->ds->dev,
"deleting crosschip link from port %d to %s port %d\n",
- c->port, dev_name(c->other_ds->dev), c->other_port);
+ c->port, dev_name(c->other_ctx->ds->dev), c->other_port);
list_del(&c->list);
kfree(c);
@@ -347,64 +387,58 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds,
* or untagged: it doesn't matter, since it should never egress a frame having
* our @rx_vid.
*/
-int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
- int other_port,
- struct list_head *crosschip_links)
+int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
+ struct dsa_8021q_context *other_ctx,
+ int other_port)
{
/* @other_upstream is how @other_ds reaches us. If we are part
* of disjoint trees, then we are probably connected through
* our CPU ports. If we're part of the same tree though, we should
* probably use dsa_towards_port.
*/
- int other_upstream = dsa_upstream_port(other_ds, other_port);
+ int other_upstream = dsa_upstream_port(other_ctx->ds, other_port);
int rc;
- rc = dsa_8021q_crosschip_link_add(ds, port, other_ds,
- other_port, crosschip_links);
+ rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port);
if (rc)
return rc;
- rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds,
+ rc = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx,
other_port, true);
if (rc)
return rc;
- rc = dsa_8021q_crosschip_link_add(ds, port, other_ds,
- other_upstream,
- crosschip_links);
+ rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream);
if (rc)
return rc;
- return dsa_8021q_crosschip_link_apply(ds, port, other_ds,
+ return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx,
other_upstream, true);
}
EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join);
-int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port,
- struct dsa_switch *other_ds,
- int other_port,
- struct list_head *crosschip_links)
+int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
+ struct dsa_8021q_context *other_ctx,
+ int other_port)
{
- int other_upstream = dsa_upstream_port(other_ds, other_port);
+ int other_upstream = dsa_upstream_port(other_ctx->ds, other_port);
struct dsa_8021q_crosschip_link *c, *n;
- list_for_each_entry_safe(c, n, crosschip_links, list) {
- if (c->port == port && c->other_ds == other_ds &&
+ list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) {
+ if (c->port == port && c->other_ctx == other_ctx &&
(c->other_port == other_port ||
c->other_port == other_upstream)) {
- struct dsa_switch *other_ds = c->other_ds;
+ struct dsa_8021q_context *other_ctx = c->other_ctx;
int other_port = c->other_port;
bool keep;
int rc;
- dsa_8021q_crosschip_link_del(ds, c, crosschip_links,
- &keep);
+ dsa_8021q_crosschip_link_del(ctx, c, &keep);
if (keep)
continue;
- rc = dsa_8021q_crosschip_link_apply(ds, port,
- other_ds,
+ rc = dsa_8021q_crosschip_link_apply(ctx, port,
+ other_ctx,
other_port,
false);
if (rc)
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 9b4a4d719291..3710f9daa46d 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -72,14 +72,21 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb)
static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb)
{
struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
+ u16 vlan_tci;
if (hdr->h_vlan_proto == htons(ETH_P_SJA1105))
return true;
- if (hdr->h_vlan_proto != htons(ETH_P_8021Q))
+ if (hdr->h_vlan_proto != htons(ETH_P_8021Q) &&
+ !skb_vlan_tag_present(skb))
return false;
- return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK);
+ if (skb_vlan_tag_present(skb))
+ vlan_tci = skb_vlan_tag_get(skb);
+ else
+ vlan_tci = ntohs(hdr->h_vlan_TCI);
+
+ return vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK);
}
/* This is the first time the tagger sees the frame on RX.
@@ -283,7 +290,8 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
hdr = eth_hdr(skb);
tpid = ntohs(hdr->h_proto);
- is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q);
+ is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q ||
+ skb_vlan_tag_present(skb));
is_link_local = sja1105_is_link_local(skb);
is_meta = sja1105_is_meta_frame(skb);
@@ -292,7 +300,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
if (is_tagged) {
/* Normal traffic path. */
skb_push_rcsum(skb, ETH_HLEN);
- __skb_vlan_pop(skb, &tci);
+ if (skb_vlan_tag_present(skb)) {
+ tci = skb_vlan_tag_get(skb);
+ __vlan_hwaccel_clear_tag(skb);
+ } else {
+ __skb_vlan_pop(skb, &tci);
+ }
skb_pull_rcsum(skb, ETH_HLEN);
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 9ef54cdcf662..9ecda09ecb11 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -223,7 +223,7 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info)
from_channel = channels.combined_count +
min(channels.rx_count, channels.tx_count);
for (i = from_channel; i < old_total; i++)
- if (xdp_get_umem_from_qid(dev, i)) {
+ if (xsk_get_pool_from_qid(dev, i)) {
GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
return -EINVAL;
}
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index ed19573fccd7..24036e3055a1 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -192,6 +192,8 @@ const char link_mode_names[][ETH_GSTRING_LEN] = {
__DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full),
__DEFINE_LINK_MODE_NAME(400000, DR4, Full),
__DEFINE_LINK_MODE_NAME(400000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, FX, Half),
+ __DEFINE_LINK_MODE_NAME(100, FX, Full),
};
static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 441794e0034f..328d15cd4006 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1706,7 +1706,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
min(channels.rx_count, channels.tx_count);
to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
for (i = from_channel; i < to_channel; i++)
- if (xdp_get_umem_from_qid(dev, i))
+ if (xsk_get_pool_from_qid(dev, i))
return -EINVAL;
ret = dev->ethtool_ops->set_channels(dev, &channels);
@@ -1861,23 +1861,18 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
} else {
/* Driver expects to be called at twice the frequency in rc */
- int n = rc * 2, i, interval = HZ / n;
+ int n = rc * 2, interval = HZ / n;
+ u64 count = n * id.data, i = 0;
- /* Count down seconds */
do {
- /* Count down iterations per second */
- i = n;
- do {
- rtnl_lock();
- rc = ops->set_phys_id(dev,
- (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
- rtnl_unlock();
- if (rc)
- break;
- schedule_timeout_interruptible(interval);
- } while (!signal_pending(current) && --i != 0);
- } while (!signal_pending(current) &&
- (id.data == 0 || --id.data != 0));
+ rtnl_lock();
+ rc = ops->set_phys_id(dev,
+ (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && (!id.data || i < count));
}
rtnl_lock();
@@ -3025,13 +3020,14 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
case TCP_V4_FLOW:
case TCP_V6_FLOW:
match->key.basic.ip_proto = IPPROTO_TCP;
+ match->mask.basic.ip_proto = 0xff;
break;
case UDP_V4_FLOW:
case UDP_V6_FLOW:
match->key.basic.ip_proto = IPPROTO_UDP;
+ match->mask.basic.ip_proto = 0xff;
break;
}
- match->mask.basic.ip_proto = 0xff;
match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC);
match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] =
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index 7044a2853886..29dcd675b65a 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -272,6 +272,8 @@ static const struct link_mode_info link_mode_params[] = {
__DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
__DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
__DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Full),
};
static const struct nla_policy
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
index 7aea35d1e8a5..1980aa7eb2b6 100644
--- a/net/ethtool/pause.c
+++ b/net/ethtool/pause.c
@@ -10,6 +10,7 @@ struct pause_req_info {
struct pause_reply_data {
struct ethnl_reply_data base;
struct ethtool_pauseparam pauseparam;
+ struct ethtool_pause_stats pausestat;
};
#define PAUSE_REPDATA(__reply_base) \
@@ -22,8 +23,15 @@ pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
[ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT },
[ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT },
[ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT },
};
+static void ethtool_stats_init(u64 *stats, unsigned int n)
+{
+ while (n--)
+ stats[n] = ETHTOOL_STAT_NOT_SET;
+}
+
static int pause_prepare_data(const struct ethnl_req_info *req_base,
struct ethnl_reply_data *reply_base,
struct genl_info *info)
@@ -34,10 +42,17 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base,
if (!dev->ethtool_ops->get_pauseparam)
return -EOPNOTSUPP;
+
ret = ethnl_ops_begin(dev);
if (ret < 0)
return ret;
dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam);
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ dev->ethtool_ops->get_pause_stats) {
+ ethtool_stats_init((u64 *)&data->pausestat,
+ sizeof(data->pausestat) / 8);
+ dev->ethtool_ops->get_pause_stats(dev, &data->pausestat);
+ }
ethnl_ops_complete(dev);
return 0;
@@ -46,9 +61,50 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base,
static int pause_reply_size(const struct ethnl_req_info *req_base,
const struct ethnl_reply_data *reply_base)
{
- return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */
+ int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */
nla_total_size(sizeof(u8)) + /* _PAUSE_RX */
nla_total_size(sizeof(u8)); /* _PAUSE_TX */
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS)
+ n += nla_total_size(0) + /* _PAUSE_STATS */
+ nla_total_size_64bit(sizeof(u64)) *
+ (ETHTOOL_A_PAUSE_STAT_MAX - 2);
+ return n;
+}
+
+static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype,
+ u16 padtype)
+{
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+ if (nla_put_u64_64bit(skb, attrtype, val, padtype))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int pause_put_stats(struct sk_buff *skb,
+ const struct ethtool_pause_stats *pause_stats)
+{
+ const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (ethtool_put_stat(skb, pause_stats->tx_pause_frames,
+ ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) ||
+ ethtool_put_stat(skb, pause_stats->rx_pause_frames,
+ ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
}
static int pause_fill_reply(struct sk_buff *skb,
@@ -63,6 +119,10 @@ static int pause_fill_reply(struct sk_buff *skb,
nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause))
return -EMSGSIZE;
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ pause_put_stats(skb, &data->pausestat))
+ return -EMSGSIZE;
+
return 0;
}
@@ -89,6 +149,7 @@ pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
[ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 },
[ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 },
[ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT },
};
int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info)
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index 7e11a6c35bc3..4cfd9e829c7b 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -60,17 +60,7 @@ hsr_node_table_show(struct seq_file *sfp, void *data)
return 0;
}
-/* hsr_node_table_open - Open the node_table file
- *
- * Description:
- * This routine opens a debugfs file node_table of specific hsr
- * or prp device
- */
-static int
-hsr_node_table_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, hsr_node_table_show, inode->i_private);
-}
+DEFINE_SHOW_ATTRIBUTE(hsr_node_table);
void hsr_debugfs_rename(struct net_device *dev)
{
@@ -85,13 +75,6 @@ void hsr_debugfs_rename(struct net_device *dev)
priv->node_tbl_root = d;
}
-static const struct file_operations hsr_fops = {
- .open = hsr_node_table_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* hsr_debugfs_init - create hsr node_table file for dumping
* the node table
*
@@ -113,7 +96,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
de = debugfs_create_file("node_table", S_IFREG | 0444,
priv->node_tbl_root, priv,
- &hsr_fops);
+ &hsr_node_table_fops);
if (IS_ERR(de)) {
pr_err("Cannot create hsr node_table file\n");
debugfs_remove(priv->node_tbl_root);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4307503a6f0b..b7260c8cef2e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1017,6 +1017,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
+ .flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 2eb71579f4d2..471d33a0d095 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -498,7 +498,7 @@ static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
/**
* cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CIPSO engine. The NetLabel routines will
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index abd083415f89..5308cfa3de62 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -237,7 +237,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
- * header to the outer L3 tunnel header, or we are are simply
+ * header to the outer L3 tunnel header, or we are simply
* treating the GRE tunnel header as though it is a UDP protocol
* specific header such as VXLAN or GENEVE.
*/
@@ -429,7 +429,7 @@ next_proto:
/* We can clear the encap_mark for GUE as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
- * header to the outer L3 tunnel header, or we are are simply
+ * header to the outer L3 tunnel header, or we are simply
* treating the GRE tunnel header as though it is a UDP protocol
* specific header such as VXLAN or GENEVE.
*/
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cf36f955bfe6..8f2e974a1e4d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -690,9 +690,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_unlock();
}
- tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) |
IPTOS_PREC_INTERNETCONTROL) :
- iph->tos;
+ iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
@@ -784,7 +784,7 @@ EXPORT_SYMBOL(icmp_ndo_send);
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
const struct net_protocol *ipprot;
int protocol = iph->protocol;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index f1bd95f243b3..366a4507b5a3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_sockopt inet_sockopt;
if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
goto errout;
@@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
+ memset(&inet_sockopt, 0, sizeof(inet_sockopt));
+ inet_sockopt.recverr = inet->recverr;
+ inet_sockopt.is_icsk = inet->is_icsk;
+ inet_sockopt.freebind = inet->freebind;
+ inet_sockopt.hdrincl = inet->hdrincl;
+ inet_sockopt.mc_loop = inet->mc_loop;
+ inet_sockopt.transparent = inet->transparent;
+ inet_sockopt.mc_all = inet->mc_all;
+ inet_sockopt.nodefrag = inet->nodefrag;
+ inet_sockopt.bind_address_no_port = inet->bind_address_no_port;
+ inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884;
+ inet_sockopt.defer_connect = inet->defer_connect;
+ if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt),
+ &inet_sockopt))
+ goto errout;
+
return 0;
errout:
return 1;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 239e54474b65..8cbe74313f38 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -228,7 +228,7 @@ static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
- const int dif, const int sdif, bool exact_dif)
+ const int dif, const int sdif)
{
int score = -1;
@@ -277,15 +277,13 @@ static struct sock *inet_lhash2_lookup(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- bool exact_dif = inet_exact_dif_match(net, skb);
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr,
- dif, sdif, exact_dif);
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
saddr, sport, daddr, hnum);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 948747aac4e2..da1b5038bdfd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -47,32 +47,32 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
unsigned char *iph = skb_network_header(skb);
memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
- memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen);
opt = &(IPCB(skb)->opt);
if (opt->srr)
- memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+ memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
if (!is_frag) {
if (opt->rr_needaddr)
- ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
if (opt->ts_needaddr)
- ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
if (opt->ts_needtime) {
__be32 midtime;
midtime = inet_current_timestamp();
- memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
return;
}
if (opt->rr) {
- memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]);
opt->rr = 0;
opt->rr_needaddr = 0;
}
if (opt->ts) {
- memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]);
opt->ts = 0;
opt->ts_needaddr = opt->ts_needtime = 0;
}
@@ -495,26 +495,29 @@ EXPORT_SYMBOL(ip_options_compile);
void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
- memmove(optptr+7, optptr+3, optptr[1]-7);
- memcpy(optptr+3, &opt->faddr, 4);
+ unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr);
+
+ memmove(optptr + 7, optptr + 3, optptr[1] - 7);
+ memcpy(optptr + 3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr);
+
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
if (opt->ts) {
- unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr);
+
if (opt->ts_needtime) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
- if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ memset(&optptr[optptr[2] - 1], 0, 4);
+ if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC)
optptr[2] -= 4;
}
if (opt->ts_needaddr) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e6f2ada9e7d5..8b200252c655 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -143,7 +143,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
- __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
+ u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
@@ -156,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = inet->tos;
+ iph->tos = tos;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
@@ -997,7 +998,7 @@ static int __ip_append_data(struct sock *sk,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
if (cork->length + length > maxnonfragsize - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1352,7 +1353,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
if (cork->flags & IPCORK_OPT)
opt = cork->opt;
- if (!(rt->dst.dev->features&NETIF_F_SG))
+ if (!(rt->dst.dev->features & NETIF_F_SG))
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1537,7 +1538,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_select_ident(net, skb, sk);
if (opt) {
- iph->ihl += opt->optlen>>2;
+ iph->ihl += opt->optlen >> 2;
ip_options_build(skb, opt, cork->addr, rt, 0);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d2c223554ff7..ec6036713e2c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1124,8 +1124,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
dev_put(dev);
err = -EINVAL;
- if (sk->sk_bound_dev_if &&
- (!midx || midx != sk->sk_bound_dev_if))
+ if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
break;
inet->uc_index = ifindex;
@@ -1189,7 +1188,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
err = -EINVAL;
if (sk->sk_bound_dev_if &&
mreq.imr_ifindex != sk->sk_bound_dev_if &&
- (!midx || midx != sk->sk_bound_dev_if))
+ midx != sk->sk_bound_dev_if)
break;
inet->mc_index = mreq.imr_ifindex;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 876fd6ff1ff9..939792a38814 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt,
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
msg->im_msgtype = assert;
msg->im_mbz = 0;
- if (assert == IGMPMSG_WRVIFWHOLE)
+ if (assert == IGMPMSG_WRVIFWHOLE) {
msg->im_vif = vifi;
- else
+ msg->im_vif_hi = vifi >> 8;
+ } else {
msg->im_vif = mrt->mroute_reg_vif_num;
+ msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8;
+ }
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
ip_hdr(skb)->protocol = 0;
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
+ msg->im_vif_hi = vifi >> 8;
skb_dst_set(skb, dst_clone(skb_dst(pkt)));
/* Add our header */
igmp = skb_put(skb, sizeof(struct igmphdr));
@@ -2396,6 +2400,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
+ nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */
+ nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */
+ nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */
+ + nla_total_size(4) /* IPMRA_CREPORT_TABLE */
/* IPMRA_CREPORT_PKT */
+ nla_total_size(payloadlen)
;
@@ -2427,11 +2432,12 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
rtgenm = nlmsg_data(nlh);
rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
- nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
+ nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
msg->im_src.s_addr) ||
nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
- msg->im_dst.s_addr))
+ msg->im_dst.s_addr) ||
+ nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
goto nla_put_failure;
nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 134e92382275..8c0f17c6863c 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -42,8 +42,8 @@ static int call_nexthop_notifiers(struct net *net,
{
int err;
- err = atomic_notifier_call_chain(&net->nexthop.notifier_chain,
- event_type, nh);
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ event_type, nh);
return notifier_to_errno(err);
}
@@ -133,12 +133,9 @@ static struct nexthop *nexthop_alloc(void)
static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
- size_t sz = offsetof(struct nexthop, nh_grp)
- + sizeof(struct nh_group)
- + sizeof(struct nh_grp_entry) * num_nh;
struct nh_group *nhg;
- nhg = kzalloc(sz, GFP_KERNEL);
+ nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
if (nhg)
nhg->num_nh = num_nh;
@@ -279,7 +276,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
case AF_INET:
fib_nh = &nhi->fib_nh;
if (fib_nh->fib_nh_gw_family &&
- nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+ nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
goto nla_put_failure;
break;
@@ -800,7 +797,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
return;
}
- newg->has_v4 = nhg->has_v4;
+ newg->has_v4 = false;
newg->mpath = nhg->mpath;
newg->fdb_nh = nhg->fdb_nh;
newg->num_nh = nhg->num_nh;
@@ -809,12 +806,18 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
nhges = nhg->nh_entries;
new_nhges = newg->nh_entries;
for (i = 0, j = 0; i < nhg->num_nh; ++i) {
+ struct nh_info *nhi;
+
/* current nexthop getting removed */
if (nhg->nh_entries[i].nh == nh) {
newg->num_nh--;
continue;
}
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ newg->has_v4 = true;
+
list_del(&nhges[i].nh_list);
new_nhges[j].nh_parent = nhges[i].nh_parent;
new_nhges[j].nh = nhges[i].nh;
@@ -867,8 +870,6 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
bool do_flush = false;
struct fib_info *fi;
- call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh);
-
list_for_each_entry(fi, &nh->fi_list, nh_list) {
fi->fib_flags |= RTNH_F_DEAD;
do_flush = true;
@@ -906,6 +907,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh,
static void remove_nexthop(struct net *net, struct nexthop *nh,
struct nl_info *nlinfo)
{
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh);
+
/* remove from the tree */
rb_erase(&nh->rb_node, &net->nexthop.rb_root);
@@ -961,6 +964,23 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
return 0;
}
+static void nh_group_v4_update(struct nh_group *nhg)
+{
+ struct nh_grp_entry *nhges;
+ bool has_v4 = false;
+ int i;
+
+ nhges = nhg->nh_entries;
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ has_v4 = true;
+ }
+ nhg->has_v4 = has_v4;
+}
+
static int replace_nexthop_single(struct net *net, struct nexthop *old,
struct nexthop *new,
struct netlink_ext_ack *extack)
@@ -984,6 +1004,21 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old,
rcu_assign_pointer(old->nh_info, newi);
rcu_assign_pointer(new->nh_info, oldi);
+ /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+ * update IPv4 indication in all the groups using the nexthop.
+ */
+ if (oldi->family == AF_INET && newi->family == AF_INET6) {
+ struct nh_grp_entry *nhge;
+
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nhp->nh_grp);
+ nh_group_v4_update(nhg);
+ }
+ }
+
return 0;
}
@@ -1101,7 +1136,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh,
while (1) {
struct nexthop *nh;
- next = rtnl_dereference(*pp);
+ next = *pp;
if (!next)
break;
@@ -1924,14 +1959,15 @@ static struct notifier_block nh_netdev_notifier = {
int register_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb);
+ return blocking_notifier_chain_register(&net->nexthop.notifier_chain,
+ nb);
}
EXPORT_SYMBOL(register_nexthop_notifier);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain,
- nb);
+ return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
+ nb);
}
EXPORT_SYMBOL(unregister_nexthop_notifier);
@@ -1951,7 +1987,7 @@ static int __net_init nexthop_net_init(struct net *net)
net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
if (!net->nexthop.devhash)
return -ENOMEM;
- ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
+ BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
return 0;
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index df6fbefe44d4..248856b301c4 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -293,7 +293,8 @@ EXPORT_SYMBOL_GPL(ping_close);
/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
- struct sockaddr *uaddr, int addr_len) {
+ struct sockaddr *uaddr, int addr_len)
+{
struct net *net = sock_net(sk);
if (sk->sk_family == AF_INET) {
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
@@ -310,10 +311,10 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
-
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
+ else
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
if ((!inet_can_nonlocal_bind(net, isk) &&
chk_addr_ret != RTN_LOCAL) ||
@@ -383,20 +384,6 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
}
}
-static void ping_clear_saddr(struct sock *sk, int dif)
-{
- sk->sk_bound_dev_if = dif;
- if (sk->sk_family == AF_INET) {
- struct inet_sock *isk = inet_sk(sk);
- isk->inet_rcv_saddr = isk->inet_saddr = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
- memset(&np->saddr, 0, sizeof(np->saddr));
-#endif
- }
-}
/*
* We need our own bind because there are no privileged id's == local ports.
* Moreover, we don't allow binding to multi- and broadcast addresses.
@@ -420,12 +407,13 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
err = -EADDRINUSE;
- ping_set_saddr(sk, uaddr);
snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
if (ping_get_port(sk, snum) != 0) {
- ping_clear_saddr(sk, dif);
+ /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */
+ sk->sk_bound_dev_if = dif;
goto out;
}
+ ping_set_saddr(sk, uaddr);
pr_debug("after bind(): num = %hu, dif = %d\n",
isk->inet_num,
@@ -647,7 +635,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
}
int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
- void *user_icmph, size_t icmph_len) {
+ void *user_icmph, size_t icmph_len)
+{
u8 type, code;
if (len > 0xFFFF)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 407956be7deb..1170653a89cd 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -260,11 +260,12 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
err = EHOSTUNREACH;
if (code > NR_ICMP_UNREACH)
break;
- err = icmp_err_convert[code].errno;
- harderr = icmp_err_convert[code].fatal;
if (code == ICMP_FRAG_NEEDED) {
harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
err = EMSGSIZE;
+ } else {
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
}
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 58642b29a499..d15a78b26dfa 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -623,7 +623,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
u32 hval;
net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
- hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
+ hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
return hash_32(hval, FNHE_HASH_SHIFT);
}
@@ -1016,13 +1016,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
struct net *net = dev_net(dst->dev);
- u32 old_mtu = ipv4_mtu(dst);
struct fib_result res;
bool lock = false;
+ u32 old_mtu;
if (ip_mtu_locked(dst))
return;
+ old_mtu = ipv4_mtu(dst);
if (old_mtu < mtu)
return;
@@ -1066,7 +1067,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
@@ -1083,7 +1084,7 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
@@ -1101,7 +1102,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct dst_entry *odst = NULL;
@@ -1131,7 +1132,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
new = true;
}
- __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
+ __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
@@ -1156,7 +1157,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
@@ -1172,7 +1173,7 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct net *net = sock_net(sk);
@@ -1312,7 +1313,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- const struct rtable *rt = (const struct rtable *) dst;
+ const struct rtable *rt = (const struct rtable *)dst;
unsigned int mtu = rt->rt_pmtu;
if (!mtu || time_after_eq(jiffies, rt->dst.expires))
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f0794f0232ba..c375c126f436 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
struct sock *sk,
struct sk_buff *skb)
{
+ struct tcp_request_sock *treq;
struct request_sock *req;
#ifdef CONFIG_MPTCP
- struct tcp_request_sock *treq;
-
if (sk_is_mptcp(sk))
ops = &mptcp_subflow_request_sock_ops;
#endif
@@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
if (!req)
return NULL;
-#if IS_ENABLED(CONFIG_MPTCP)
treq = tcp_rsk(req);
+ treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
if (treq->is_mptcp) {
int err = mptcp_subflow_init_cookie_req(req, sk, skb);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 54023a46db04..3e5f4f2e705e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &comp_sack_nr_max,
},
{
+ .procname = "tcp_reflect_tos",
+ .data = &init_net.ipv4.sysctl_tcp_reflect_tos,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 31f3b858db81..65057744fac8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -562,7 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= EPOLLIN | EPOLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -574,7 +576,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* pairs with the input side.
*/
smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk))
+ if (__sk_stream_is_writeable(sk, 1))
mask |= EPOLLOUT | EPOLLWRNORM;
}
} else
@@ -1002,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
!tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
tcp_rtx_and_write_queues_empty(sk));
if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
#ifdef CONFIG_TLS_DEVICE
skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
@@ -1026,7 +1028,7 @@ new_segment:
goto new_segment;
}
if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ goto wait_for_space;
if (can_coalesce) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1067,9 +1069,8 @@ new_segment:
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1280,7 +1281,7 @@ restart:
new_segment:
if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
if (unlikely(process_backlog >= 16)) {
process_backlog = 0;
@@ -1291,7 +1292,7 @@ new_segment:
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
first_skb);
if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1324,7 +1325,7 @@ new_segment:
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
- goto wait_for_memory;
+ goto wait_for_space;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
@@ -1338,7 +1339,7 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
@@ -1391,9 +1392,8 @@ new_segment:
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1525,7 +1525,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
@@ -2685,6 +2685,8 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0;
icsk->icsk_probes_out = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
@@ -3207,7 +3209,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
+ /* 0: disable, 1: enable, 2: start from ether_header */
+ if (val < 0 || val > 2)
err = -EINVAL;
else
tp->save_syn = val;
@@ -3788,20 +3791,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
lock_sock(sk);
if (tp->saved_syn) {
- if (len < tp->saved_syn[0]) {
- if (put_user(tp->saved_syn[0], optlen)) {
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
+ if (put_user(tcp_saved_syn_len(tp->saved_syn),
+ optlen)) {
release_sock(sk);
return -EFAULT;
}
release_sock(sk);
return -EINVAL;
}
- len = tp->saved_syn[0];
+ len = tcp_saved_syn_len(tp->saved_syn);
if (put_user(len, optlen)) {
release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ if (copy_to_user(optval, tp->saved_syn->data, len)) {
release_sock(sk);
return -EFAULT;
}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 7aa68f4aae6c..37f4cb2bba5c 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -567,10 +567,9 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
}
-static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
{
- if (sk->sk_family == AF_INET6 &&
- unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
spin_lock_bh(&tcpv6_prot_lock);
if (likely(ops != tcpv6_prot_saved)) {
tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
@@ -603,13 +602,11 @@ struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
- if (!psock->sk_proto) {
- struct proto *ops = READ_ONCE(sk->sk_prot);
-
- if (tcp_bpf_assert_proto_ops(ops))
+ if (sk->sk_family == AF_INET6) {
+ if (tcp_bpf_assert_proto_ops(psock->sk_proto))
return ERR_PTR(-EINVAL);
- tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
}
return &tcp_bpf_prots[family][config];
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 09b62de04eea..af2814c9342a 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -295,7 +295,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
refcount_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
- tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 184ea556f50e..50834e7f958e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -138,6 +138,69 @@ void clean_acked_data_flush(void)
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+ struct bpf_sock_ops_kern sock_ops;
+
+ if (likely(!unknown_opt && !parse_all_opt))
+ return;
+
+ /* The skb will be handled in the
+ * bpf_skops_established() or
+ * bpf_skops_write_hdr_opt().
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_LISTEN:
+ return;
+ }
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = bpf_op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+}
+#endif
+
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -3801,7 +3864,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
-static void smc_parse_options(const struct tcphdr *th,
+static bool smc_parse_options(const struct tcphdr *th,
struct tcp_options_received *opt_rx,
const unsigned char *ptr,
int opsize)
@@ -3810,10 +3873,13 @@ static void smc_parse_options(const struct tcphdr *th,
if (static_branch_unlikely(&tcp_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
+ return true;
+ }
}
#endif
+ return false;
}
/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
@@ -3874,6 +3940,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->saw_unknown = 0;
while (length > 0) {
int opcode = *ptr++;
@@ -3964,15 +4031,21 @@ void tcp_parse_options(const struct net *net,
*/
if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
get_unaligned_be16(ptr) ==
- TCPOPT_FASTOPEN_MAGIC)
+ TCPOPT_FASTOPEN_MAGIC) {
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
- else
- smc_parse_options(th, opt_rx, ptr,
- opsize);
+ break;
+ }
+
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
+ break;
+
+ opt_rx->saw_unknown = 1;
break;
+ default:
+ opt_rx->saw_unknown = 1;
}
ptr += opsize-2;
length -= opsize;
@@ -5259,12 +5332,6 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return true;
}
-/* When incoming ACK allowed to free some skb from write_queue,
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
- * on the exit from tcp input handler.
- *
- * PROBLEM: sndbuf expansion does not work well with largesend.
- */
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5279,16 +5346,13 @@ static void tcp_new_space(struct sock *sk)
static void tcp_check_space(struct sock *sk)
{
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ /* pairs with tcp_poll() */
+ smp_mb();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -5590,6 +5654,8 @@ syn_challenge:
goto discard;
}
+ bpf_skops_parse_hdr(sk, skb);
+
return true;
discard:
@@ -5798,7 +5864,7 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
-void tcp_init_transfer(struct sock *sk, int bpf_op)
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5819,7 +5885,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
tp->snd_cwnd_stamp = tcp_jiffies32;
- tcp_call_bpf(sk, bpf_op, 0, NULL);
+ bpf_skops_established(sk, bpf_op, skb);
tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
@@ -5838,7 +5904,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
}
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -6310,7 +6376,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else {
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+ skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
@@ -6599,13 +6666,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
{
if (tcp_sk(sk)->save_syn) {
u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
- u32 *copy;
+ struct saved_syn *saved_syn;
+ u32 mac_hdrlen;
+ void *base;
+
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
+ base = skb_mac_header(skb);
+ mac_hdrlen = skb_mac_header_len(skb);
+ len += mac_hdrlen;
+ } else {
+ base = skb_network_header(skb);
+ mac_hdrlen = 0;
+ }
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
- if (copy) {
- copy[0] = len;
- memcpy(&copy[1], skb_network_header(skb), len);
- req->saved_syn = copy;
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
+ GFP_ATOMIC);
+ if (saved_syn) {
+ saved_syn->mac_hdrlen = mac_hdrlen;
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+ memcpy(saved_syn->data, base, len);
+ req->saved_syn = saved_syn;
}
}
}
@@ -6744,6 +6825,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
@@ -6752,7 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, TCP_SYNACK_FASTOPEN);
+ &foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
@@ -6770,7 +6852,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
- TCP_SYNACK_COOKIE);
+ TCP_SYNACK_COOKIE,
+ skb);
if (want_cookie) {
reqsk_free(req);
return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5084333b5ab6..ace48b2790ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -575,7 +575,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
@@ -965,18 +965,23 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
+
+ tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -984,7 +989,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ tos & ~INET_ECN_MASK);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -1529,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();
+ /* Set ToS of the new socket based upon the value of incoming SYN. */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 495dda2449fe..56c306e3cd2f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -548,6 +548,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+ bpf_skops_init_child(sk, newsk);
tcp_bpf_clone(sk, newsk);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 85ff417bda7f..386978dcd318 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -438,6 +438,7 @@ struct tcp_out_options {
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
@@ -452,6 +453,145 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
#endif
}
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+ * "req" is also used here for fastopen even the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
+#endif
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -701,7 +843,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -758,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -826,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
+ }
+
return size;
}
@@ -1213,6 +1368,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
#endif
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
tcp_v6_send_check, tcp_v4_send_check,
sk, skb);
@@ -1524,7 +1682,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
skb->truesize -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
}
/* Any change of skb->len requires recalculation of tso factor. */
@@ -3336,20 +3493,20 @@ int tcp_send_synack(struct sock *sk)
}
/**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- * foc: cookie for tcp fast open
- * synack_type: Type of synback to prepare
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3408,8 +3565,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc, synack_type) + sizeof(*th);
+ foc, synack_type,
+ syn_skb) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3441,6 +3601,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_unlock();
#endif
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
skb->skb_mstamp_ns = now;
tcp_add_tx_delay(skb, tp);
@@ -3741,6 +3904,8 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
@@ -3934,7 +4099,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 6cebf412d590..5842081bc8a2 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -10,7 +10,7 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8.
+ * .01 and 7/8.
*/
#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e88efba07551..09f0a23d1a01 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1170,7 +1170,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = inet->uc_index;
} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
/* oif is set, packet is to local broadcast and
- * and uc_index is set. oif is most likely set
+ * uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index eddd973e6575..7a94791efc1a 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -22,10 +22,9 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
prot->close = sock_map_close;
}
-static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
{
- if (sk->sk_family == AF_INET6 &&
- unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
+ if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
spin_lock_bh(&udpv6_prot_lock);
if (likely(ops != udpv6_prot_saved)) {
udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
@@ -46,8 +45,8 @@ struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
{
int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
- if (!psock->sk_proto)
- udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot));
+ if (sk->sk_family == AF_INET6)
+ udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
return &udp_bpf_prots[family];
}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 9ebf3fe0d2b1..c70c192bc91b 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -191,6 +191,13 @@ static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
return -EAFNOSUPPORT;
}
+static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ kfree_skb(skb);
+ return -EAFNOSUPPORT;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
.ipv6_route_input = eafnosupport_ipv6_route_input,
@@ -201,6 +208,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
.fib6_nh_init = eafnosupport_fib6_nh_init,
.ip6_del_rt = eafnosupport_ip6_del_rt,
+ .ipv6_fragment = eafnosupport_ipv6_fragment,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0306509ab063..e648fbebb167 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -661,6 +661,7 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
+ .flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet6_release,
.bind = inet6_bind,
@@ -1026,6 +1027,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
.xfrm6_rcv_encap = xfrm6_rcv_encap,
#endif
.nd_tbl = &nd_tbl,
+ .ipv6_fragment = ip6_fragment,
};
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 2d3add9e6116..55c290d55605 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -94,7 +94,7 @@ EXPORT_SYMBOL(__inet6_lookup_established);
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum,
const struct in6_addr *daddr,
- const int dif, const int sdif, bool exact_dif)
+ const int dif, const int sdif)
{
int score = -1;
@@ -138,15 +138,13 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif, const int sdif)
{
- bool exact_dif = inet6_exact_dif_match(net, skb);
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr, dif, sdif,
- exact_dif);
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
saddr, sport, daddr, hnum);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4a664ad4f4d4..141c0a4c569a 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1812,10 +1812,14 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
children = 0;
child = NULL;
- if (fn_r)
- child = fn_r, children |= 1;
- if (fn_l)
- child = fn_l, children |= 2;
+ if (fn_r) {
+ child = fn_r;
+ children |= 1;
+ }
+ if (fn_l) {
+ child = fn_l;
+ children |= 2;
+ }
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c78e67d7747f..a2a65e327f49 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1492,7 +1492,7 @@ emsgsize:
* Otherwise, we need to reserve fragment header and
* fragment alignment (= 8-15 octects, in total).
*
- * Note that we may need to "move" the data from the tail of
+ * Note that we may need to "move" the data from the tail
* of the buffer to the new fragment when we split
* the message.
*
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index 9ee077bf4f49..787c74aa85e3 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -77,16 +77,43 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
return true;
}
+static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb,
+ struct ipv6hdr *_bounced_hdr)
+{
+ if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ return NULL;
+
+ if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type))
+ return NULL;
+
+ return skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(struct icmp6hdr),
+ sizeof(struct ipv6hdr),
+ _bounced_hdr);
+}
+
static unsigned int
ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, saddr));
return NF_DROP;
}
+
+ /* rewrite dst addr of bounced packet which was sent to dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->daddr);
+ }
+
return XT_CONTINUE;
}
@@ -94,12 +121,24 @@ static unsigned int
ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, daddr));
return NF_DROP;
}
+
+ /* rewrite src addr of bounced packet which was sent from dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->saddr);
+ }
+
return XT_CONTINUE;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb075d9545b9..bde6d48a9942 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5284,9 +5284,10 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
{
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
+ int last_err = 0;
int remaining;
int attrlen;
- int err = 1, last_err = 0;
+ int err;
remaining = cfg->fc_mp_len;
rtnh = (struct rtnexthop *)cfg->fc_mp;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 305870a72352..8db59f4e5f13 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -458,7 +458,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
@@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
@@ -509,13 +510,14 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
+ u8 tclass;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
@@ -527,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
opt = ireq->ipv6_opt;
+ tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tcp_rsk(req)->syn_tos : np->tclass;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+ tclass & ~INET_ECN_MASK,
sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
@@ -958,8 +963,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
- priority);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
+ tclass & ~INET_ECN_MASK, priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -1067,8 +1072,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
- label, priority);
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1,
+ ipv6_get_dsfield(ipv6h), label, priority);
#ifdef CONFIG_TCP_MD5SIG
out:
@@ -1121,7 +1126,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
- 0, 0, sk->sk_priority);
+ ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority);
}
@@ -1309,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ /* Set ToS of the new socket based upon the value of incoming SYN. */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,
diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile
index 399a7e5db2f4..cf8f27071d3f 100644
--- a/net/l2tp/Makefile
+++ b/net/l2tp/Makefile
@@ -5,6 +5,8 @@
obj-$(CONFIG_L2TP) += l2tp_core.o
+CFLAGS_l2tp_core.o += -I$(src)
+
# Build l2tp as modules if L2TP is M
obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o
obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 701fc72ad9f4..7be5103ff2a8 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -61,6 +61,10 @@
#include <linux/atomic.h>
#include "l2tp_core.h"
+#include "trace.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
#define L2TP_DRV_VERSION "V2.0"
@@ -116,11 +120,6 @@ static bool l2tp_sk_is_v6(struct sock *sk)
}
#endif
-static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk)
-{
- return sk->sk_user_data;
-}
-
static inline struct l2tp_net *l2tp_pernet(const struct net *net)
{
return net_generic(net, l2tp_net_id);
@@ -151,23 +150,30 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id)
static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
{
+ trace_free_tunnel(tunnel);
sock_put(tunnel->sock);
/* the tunnel is freed in the socket destructor */
}
static void l2tp_session_free(struct l2tp_session *session)
{
- struct l2tp_tunnel *tunnel = session->tunnel;
+ trace_free_session(session);
+ if (session->tunnel)
+ l2tp_tunnel_dec_refcount(session->tunnel);
+ kfree(session);
+}
- if (tunnel) {
+struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = sk->sk_user_data;
+
+ if (tunnel)
if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC))
- goto out;
- l2tp_tunnel_dec_refcount(tunnel);
- }
+ return NULL;
-out:
- kfree(session);
+ return tunnel;
}
+EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel);
void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel)
{
@@ -381,6 +387,8 @@ int l2tp_session_register(struct l2tp_session *session,
hlist_add_head(&session->hlist, head);
write_unlock_bh(&tunnel->hlist_lock);
+ trace_register_session(session);
+
return 0;
err_tlock_pnlock:
@@ -409,10 +417,6 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk
skb_queue_walk_safe(&session->reorder_q, skbp, tmp) {
if (L2TP_SKB_CB(skbp)->ns > ns) {
__skb_queue_before(&session->reorder_q, skbp, skb);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n",
- session->name, ns, L2TP_SKB_CB(skbp)->ns,
- skb_queue_len(&session->reorder_q));
atomic_long_inc(&session->stats.rx_oos_packets);
goto out;
}
@@ -445,9 +449,7 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *
/* Bump our Nr */
session->nr++;
session->nr &= session->nr_max;
-
- l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated nr to %hu\n",
- session->name, session->nr);
+ trace_session_seqnum_update(session);
}
/* call private receive handler */
@@ -472,37 +474,27 @@ static void l2tp_recv_dequeue(struct l2tp_session *session)
start:
spin_lock_bh(&session->reorder_q.lock);
skb_queue_walk_safe(&session->reorder_q, skb, tmp) {
- if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) {
+ struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb);
+
+ /* If the packet has been pending on the queue for too long, discard it */
+ if (time_after(jiffies, cb->expires)) {
atomic_long_inc(&session->stats.rx_seq_discards);
atomic_long_inc(&session->stats.rx_errors);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ trace_session_pkt_expired(session, cb->ns);
session->reorder_skip = 1;
__skb_unlink(skb, &session->reorder_q);
kfree_skb(skb);
continue;
}
- if (L2TP_SKB_CB(skb)->has_seq) {
+ if (cb->has_seq) {
if (session->reorder_skip) {
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: advancing nr to next pkt: %u -> %u",
- session->name, session->nr,
- L2TP_SKB_CB(skb)->ns);
session->reorder_skip = 0;
- session->nr = L2TP_SKB_CB(skb)->ns;
+ session->nr = cb->ns;
+ trace_session_seqnum_reset(session);
}
- if (L2TP_SKB_CB(skb)->ns != session->nr) {
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: holding oos pkt %u len %d, waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ if (cb->ns != session->nr)
goto out;
- }
}
__skb_unlink(skb, &session->reorder_q);
@@ -535,14 +527,13 @@ static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr)
*/
static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
{
- if (!l2tp_seq_check_rx_window(session, L2TP_SKB_CB(skb)->ns)) {
+ struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb);
+
+ if (!l2tp_seq_check_rx_window(session, cb->ns)) {
/* Packet sequence number is outside allowed window.
* Discard it.
*/
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: pkt %u len %d discarded, outside window, nr=%u\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr);
+ trace_session_pkt_outside_rx_window(session, cb->ns);
goto discard;
}
@@ -559,10 +550,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
* is seen. After nr_oos_count_max in-sequence packets, reset the
* sequence number to re-enable packet reception.
*/
- if (L2TP_SKB_CB(skb)->ns == session->nr) {
+ if (cb->ns == session->nr) {
skb_queue_tail(&session->reorder_q, skb);
} else {
- u32 nr_oos = L2TP_SKB_CB(skb)->ns;
+ u32 nr_oos = cb->ns;
u32 nr_next = (session->nr_oos + 1) & session->nr_max;
if (nr_oos == nr_next)
@@ -573,17 +564,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
session->nr_oos = nr_oos;
if (session->nr_oos_count > session->nr_oos_count_max) {
session->reorder_skip = 1;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: %d oos packets received. Resetting sequence numbers\n",
- session->name, session->nr_oos_count);
}
if (!session->reorder_skip) {
atomic_long_inc(&session->stats.rx_seq_discards);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ trace_session_pkt_oos(session, cb->ns);
goto discard;
}
skb_queue_tail(&session->reorder_q, skb);
@@ -660,16 +644,14 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
int length)
{
struct l2tp_tunnel *tunnel = session->tunnel;
- u32 ns = 0, nr = 0;
int offset;
/* Parse and check optional cookie */
if (session->peer_cookie_len > 0) {
if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: cookie mismatch (%u/%u). Discarding.\n",
- tunnel->name, tunnel->tunnel_id,
- session->session_id);
+ pr_warn_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n",
+ tunnel->name, tunnel->tunnel_id,
+ session->session_id);
atomic_long_inc(&session->stats.rx_cookie_discards);
goto discard;
}
@@ -686,32 +668,21 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
L2TP_SKB_CB(skb)->has_seq = 0;
if (tunnel->version == L2TP_HDR_VER_2) {
if (hdrflags & L2TP_HDRFLAG_S) {
- ns = ntohs(*(__be16 *)ptr);
- ptr += 2;
- nr = ntohs(*(__be16 *)ptr);
- ptr += 2;
-
/* Store L2TP info in the skb */
- L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->ns = ntohs(*(__be16 *)ptr);
L2TP_SKB_CB(skb)->has_seq = 1;
+ ptr += 2;
+ /* Skip past nr in the header */
+ ptr += 2;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: recv data ns=%u, nr=%u, session nr=%u\n",
- session->name, ns, nr, session->nr);
}
} else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
u32 l2h = ntohl(*(__be32 *)ptr);
if (l2h & 0x40000000) {
- ns = l2h & 0x00ffffff;
-
/* Store L2TP info in the skb */
- L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->ns = l2h & 0x00ffffff;
L2TP_SKB_CB(skb)->has_seq = 1;
-
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: recv data ns=%u, session nr=%u\n",
- session->name, ns, session->nr);
}
ptr += 4;
}
@@ -722,9 +693,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* configure it so.
*/
if (!session->lns_mode && !session->send_seq) {
- l2tp_info(session, L2TP_MSG_SEQ,
- "%s: requested to enable seq numbers by LNS\n",
- session->name);
+ trace_session_seqnum_lns_enable(session);
session->send_seq = 1;
l2tp_session_set_header_len(session, tunnel->version);
}
@@ -733,9 +702,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* If user has configured mandatory sequence numbers, discard.
*/
if (session->recv_seq) {
- l2tp_warn(session, L2TP_MSG_SEQ,
- "%s: recv data has no seq numbers when required. Discarding.\n",
- session->name);
+ pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -746,15 +714,12 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* LAC is broken. Discard the frame.
*/
if (!session->lns_mode && session->send_seq) {
- l2tp_info(session, L2TP_MSG_SEQ,
- "%s: requested to disable seq numbers by LNS\n",
- session->name);
+ trace_session_seqnum_lns_disable(session);
session->send_seq = 0;
l2tp_session_set_header_len(session, tunnel->version);
} else if (session->send_seq) {
- l2tp_warn(session, L2TP_MSG_SEQ,
- "%s: recv data has no seq numbers when required. Discarding.\n",
- session->name);
+ pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -816,9 +781,6 @@ static void l2tp_session_queue_purge(struct l2tp_session *session)
{
struct sk_buff *skb = NULL;
- if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
- return;
-
while ((skb = skb_dequeue(&session->reorder_q))) {
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
@@ -847,22 +809,11 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
/* Short packet? */
if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: recv short packet (len=%d)\n",
- tunnel->name, skb->len);
+ pr_warn_ratelimited("%s: recv short packet (len=%d)\n",
+ tunnel->name, skb->len);
goto error;
}
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto error;
-
- pr_debug("%s: recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
- }
-
/* Point to L2TP header */
optr = skb->data;
ptr = skb->data;
@@ -873,9 +824,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
/* Check protocol version */
version = hdrflags & L2TP_HDR_VER_MASK;
if (version != tunnel->version) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: recv protocol version mismatch: got %d expected %d\n",
- tunnel->name, version, tunnel->version);
+ pr_warn_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n",
+ tunnel->name, version, tunnel->version);
goto error;
}
@@ -883,12 +833,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
length = skb->len;
/* If type is control packet, it is handled by userspace. */
- if (hdrflags & L2TP_HDRFLAG_T) {
- l2tp_dbg(tunnel, L2TP_MSG_DATA,
- "%s: recv control packet, len=%d\n",
- tunnel->name, length);
+ if (hdrflags & L2TP_HDRFLAG_T)
goto error;
- }
/* Skip flags */
ptr += 2;
@@ -917,9 +863,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
l2tp_session_dec_refcount(session);
/* Not found? Pass to userspace to deal with */
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: no session found (%u/%u). Passing up.\n",
- tunnel->name, tunnel_id, session_id);
+ pr_warn_ratelimited("%s: no session found (%u/%u). Passing up.\n",
+ tunnel->name, tunnel_id, session_id);
goto error;
}
@@ -949,12 +894,17 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct l2tp_tunnel *tunnel;
+ /* Note that this is called from the encap_rcv hook inside an
+ * RCU-protected region, but without the socket being locked.
+ * Hence we use rcu_dereference_sk_user_data to access the
+ * tunnel data structure rather the usual l2tp_sk_to_tunnel
+ * accessor function.
+ */
tunnel = rcu_dereference_sk_user_data(sk);
if (!tunnel)
goto pass_up;
-
- l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
- tunnel->name, skb->len);
+ if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC))
+ goto pass_up;
if (l2tp_udp_recv_core(tunnel, skb))
goto pass_up;
@@ -993,8 +943,7 @@ static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf)
*bufp++ = 0;
session->ns++;
session->ns &= 0xffff;
- l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated ns to %u\n",
- session->name, session->ns);
+ trace_session_seqnum_update(session);
}
return bufp - optr;
@@ -1030,9 +979,7 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
l2h = 0x40000000 | session->ns;
session->ns++;
session->ns &= 0xffffff;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: updated ns to %u\n",
- session->name, session->ns);
+ trace_session_seqnum_update(session);
}
*((__be32 *)bufp) = htonl(l2h);
@@ -1042,74 +989,39 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
-static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
- struct flowi *fl, size_t data_len)
+/* Queue the packet to IP for output: tunnel socket lock must be held */
+static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl)
{
- struct l2tp_tunnel *tunnel = session->tunnel;
- unsigned int len = skb->len;
- int error;
-
- /* Debug */
- if (session->send_seq)
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
- session->name, data_len, session->ns - 1);
- else
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
- session->name, data_len);
-
- if (session->debug & L2TP_MSG_DATA) {
- int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
- unsigned char *datap = skb->data + uhlen;
-
- pr_debug("%s: xmit\n", session->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET,
- datap, min_t(size_t, 32, len - uhlen));
- }
+ int err;
- /* Queue the packet to IP for output */
skb->ignore_df = 1;
skb_dst_drop(skb);
#if IS_ENABLED(CONFIG_IPV6)
if (l2tp_sk_is_v6(tunnel->sock))
- error = inet6_csk_xmit(tunnel->sock, skb, NULL);
+ err = inet6_csk_xmit(tunnel->sock, skb, NULL);
else
#endif
- error = ip_queue_xmit(tunnel->sock, skb, fl);
+ err = ip_queue_xmit(tunnel->sock, skb, fl);
- /* Update stats */
- if (error >= 0) {
- atomic_long_inc(&tunnel->stats.tx_packets);
- atomic_long_add(len, &tunnel->stats.tx_bytes);
- atomic_long_inc(&session->stats.tx_packets);
- atomic_long_add(len, &session->stats.tx_bytes);
- } else {
- atomic_long_inc(&tunnel->stats.tx_errors);
- atomic_long_inc(&session->stats.tx_errors);
- }
+ return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
-/* If caller requires the skb to have a ppp header, the header must be
- * inserted in the skb data before calling this function.
- */
-int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len)
+static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, unsigned int *len)
{
- int data_len = skb->len;
struct l2tp_tunnel *tunnel = session->tunnel;
+ unsigned int data_len = skb->len;
struct sock *sk = tunnel->sock;
- struct flowi *fl;
- struct udphdr *uh;
- struct inet_sock *inet;
- int headroom;
- int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
- int udp_len;
+ int headroom, uhlen, udp_len;
int ret = NET_XMIT_SUCCESS;
+ struct inet_sock *inet;
+ struct udphdr *uh;
/* Check that there's enough headroom in the skb to insert IP,
* UDP and L2TP headers. If not enough, expand it to
* make room. Adjust truesize.
*/
- headroom = NET_SKB_PAD + sizeof(struct iphdr) +
- uhlen + hdr_len;
+ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(*uh) : 0;
+ headroom = NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len;
if (skb_cow_head(skb, headroom)) {
kfree_skb(skb);
return NET_XMIT_DROP;
@@ -1117,14 +1029,13 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
/* Setup L2TP header */
if (tunnel->version == L2TP_HDR_VER_2)
- l2tp_build_l2tpv2_header(session, __skb_push(skb, hdr_len));
+ l2tp_build_l2tpv2_header(session, __skb_push(skb, session->hdr_len));
else
- l2tp_build_l2tpv3_header(session, __skb_push(skb, hdr_len));
+ l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len));
/* Reset skb netfilter state */
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED);
nf_reset_ct(skb);
bh_lock_sock(sk);
@@ -1143,8 +1054,12 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
goto out_unlock;
}
+ /* Report transmitted length before we add encap header, which keeps
+ * statistics consistent for both UDP and IP encap tx/rx paths.
+ */
+ *len = skb->len;
+
inet = inet_sk(sk);
- fl = &inet->cork.fl;
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
/* Setup UDP header */
@@ -1153,7 +1068,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
uh = udp_hdr(skb);
uh->source = inet->inet_sport;
uh->dest = inet->inet_dport;
- udp_len = uhlen + hdr_len + data_len;
+ udp_len = uhlen + session->hdr_len + data_len;
uh->len = htons(udp_len);
/* Calculate UDP checksum if configured to do so */
@@ -1172,12 +1087,34 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
break;
}
- l2tp_xmit_core(session, skb, fl, data_len);
+ ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl);
+
out_unlock:
bh_unlock_sock(sk);
return ret;
}
+
+/* If caller requires the skb to have a ppp header, the header must be
+ * inserted in the skb data before calling this function.
+ */
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+ unsigned int len = 0;
+ int ret;
+
+ ret = l2tp_xmit_core(session, skb, &len);
+ if (ret == NET_XMIT_SUCCESS) {
+ atomic_long_inc(&session->tunnel->stats.tx_packets);
+ atomic_long_add(len, &session->tunnel->stats.tx_bytes);
+ atomic_long_inc(&session->stats.tx_packets);
+ atomic_long_add(len, &session->stats.tx_bytes);
+ } else {
+ atomic_long_inc(&session->tunnel->stats.tx_errors);
+ atomic_long_inc(&session->stats.tx_errors);
+ }
+ return ret;
+}
EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
/*****************************************************************************
@@ -1190,13 +1127,11 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
*/
static void l2tp_tunnel_destruct(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
if (!tunnel)
goto end;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name);
-
/* Disable udp encapsulation */
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
@@ -1255,34 +1190,16 @@ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
struct hlist_node *tmp;
struct l2tp_session *session;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n",
- tunnel->name);
-
write_lock_bh(&tunnel->hlist_lock);
tunnel->acpt_newsess = false;
for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
again:
hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
session = hlist_entry(walk, struct l2tp_session, hlist);
-
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: closing session\n", session->name);
-
hlist_del_init(&session->hlist);
- if (test_and_set_bit(0, &session->dead))
- goto again;
-
write_unlock_bh(&tunnel->hlist_lock);
-
- l2tp_session_unhash(session);
- l2tp_session_queue_purge(session);
-
- if (session->session_close)
- (*session->session_close)(session);
-
- l2tp_session_dec_refcount(session);
-
+ l2tp_session_delete(session);
write_lock_bh(&tunnel->hlist_lock);
/* Now restart from the beginning of this hash
@@ -1299,7 +1216,7 @@ again:
/* Tunnel socket destroy hook for UDP encapsulation */
static void l2tp_udp_encap_destroy(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
if (tunnel)
l2tp_tunnel_delete(tunnel);
@@ -1464,7 +1381,7 @@ out:
static struct lock_class_key l2tp_socket_class;
-int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id,
+int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id,
struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp)
{
struct l2tp_tunnel *tunnel = NULL;
@@ -1483,16 +1400,12 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
tunnel->version = version;
tunnel->tunnel_id = tunnel_id;
tunnel->peer_tunnel_id = peer_tunnel_id;
- tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS;
tunnel->magic = L2TP_TUNNEL_MAGIC;
sprintf(&tunnel->name[0], "tunl %u", tunnel_id);
rwlock_init(&tunnel->hlist_lock);
tunnel->acpt_newsess = true;
- if (cfg)
- tunnel->debug = cfg->debug;
-
tunnel->encap = encap;
refcount_set(&tunnel->ref_count, 1);
@@ -1597,6 +1510,8 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
"l2tp_sock");
sk->sk_allocation = GFP_ATOMIC;
+ trace_register_tunnel(tunnel);
+
if (tunnel->fd >= 0)
sockfd_put(sock);
@@ -1617,6 +1532,7 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_register);
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
{
if (!test_and_set_bit(0, &tunnel->dead)) {
+ trace_delete_tunnel(tunnel);
l2tp_tunnel_inc_refcount(tunnel);
queue_work(l2tp_wq, &tunnel->del_work);
}
@@ -1628,6 +1544,7 @@ void l2tp_session_delete(struct l2tp_session *session)
if (test_and_set_bit(0, &session->dead))
return;
+ trace_delete_session(session);
l2tp_session_unhash(session);
l2tp_session_queue_purge(session);
if (session->session_close)
@@ -1686,12 +1603,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
INIT_HLIST_NODE(&session->hlist);
INIT_HLIST_NODE(&session->global_hlist);
- /* Inherit debug options from tunnel */
- session->debug = tunnel->debug;
-
if (cfg) {
session->pwtype = cfg->pw_type;
- session->debug = cfg->debug;
session->send_seq = cfg->send_seq;
session->recv_seq = cfg->recv_seq;
session->lns_mode = cfg->lns_mode;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 3468d6b177a0..cb21d906343e 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -51,7 +51,6 @@ struct l2tp_session_cfg {
unsigned int lns_mode:1; /* behave as LNS?
* LAC enables sequence numbers under LNS control.
*/
- int debug; /* bitmask of debug message categories */
u16 l2specific_type; /* Layer 2 specific type */
u8 cookie[8]; /* optional cookie */
int cookie_len; /* 0, 4 or 8 bytes */
@@ -66,6 +65,7 @@ struct l2tp_session_cfg {
* Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into
* an additional per-net ("global") hashlist.
*/
+#define L2TP_SESSION_NAME_MAX 32
struct l2tp_session {
int magic; /* should be L2TP_SESSION_MAGIC */
long dead;
@@ -90,14 +90,13 @@ struct l2tp_session {
struct hlist_node hlist; /* hash list node */
refcount_t ref_count;
- char name[32]; /* for logging */
+ char name[L2TP_SESSION_NAME_MAX]; /* for logging */
char ifname[IFNAMSIZ];
unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */
unsigned int send_seq:1; /* send packets with sequence numbers? */
unsigned int lns_mode:1; /* behave as LNS?
* LAC enables sequence numbers under LNS control.
*/
- int debug; /* bitmask of debug message categories */
int reorder_timeout; /* configured reorder timeout (in jiffies) */
int reorder_skip; /* set if skip to next nr */
enum l2tp_pwtype pwtype;
@@ -131,7 +130,6 @@ struct l2tp_session {
/* L2TP tunnel configuration */
struct l2tp_tunnel_cfg {
- int debug; /* bitmask of debug message categories */
enum l2tp_encap_type encap;
/* Used only for kernel-created sockets */
@@ -154,6 +152,7 @@ struct l2tp_tunnel_cfg {
* Maintains a hashlist of sessions belonging to the tunnel instance.
* Is linked into a per-net list of tunnels.
*/
+#define L2TP_TUNNEL_NAME_MAX 20
struct l2tp_tunnel {
int magic; /* Should be L2TP_TUNNEL_MAGIC */
@@ -170,8 +169,7 @@ struct l2tp_tunnel {
u32 peer_tunnel_id;
int version; /* 2=>L2TPv2, 3=>L2TPv3 */
- char name[20]; /* for logging */
- int debug; /* bitmask of debug message categories */
+ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */
enum l2tp_encap_type encap;
struct l2tp_stats stats;
@@ -237,7 +235,7 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
* Creation of a new instance is a two-step process: create, then register.
* Destruction is triggered using the *_delete functions, and completes asynchronously.
*/
-int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
+int l2tp_tunnel_create(int fd, int version, u32 tunnel_id,
u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
struct l2tp_tunnel **tunnelp);
int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
@@ -263,8 +261,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
/* Transmit path helpers for sending packets over the tunnel socket. */
void l2tp_session_set_header_len(struct l2tp_session *session, int version);
-int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
- int hdr_len);
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb);
/* Pseudowire management.
* Pseudowires should register with l2tp core on module init, and unregister
@@ -276,6 +273,11 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
/* IOCTL helper for IP encap modules. */
int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+/* Extract the tunnel structure from a socket's sk_user_data pointer,
+ * validating the tunnel magic feather.
+ */
+struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk);
+
static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
{
switch (session->l2specific_type) {
@@ -337,19 +339,6 @@ static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, str
return 0;
}
-#define l2tp_printk(ptr, type, func, fmt, ...) \
-do { \
- if (((ptr)->debug) & (type)) \
- func(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define l2tp_warn(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_warn, fmt, ##__VA_ARGS__)
-#define l2tp_info(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_info, fmt, ##__VA_ARGS__)
-#define l2tp_dbg(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__)
-
#define MODULE_ALIAS_L2TP_PWTYPE(type) \
MODULE_ALIAS("net-l2tp-type-" __stringify(type))
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 96cb9601c21b..bca75bef8282 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -167,7 +167,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0,
refcount_read(&tunnel->ref_count));
seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n",
- tunnel->debug,
+ 0,
atomic_long_read(&tunnel->stats.tx_packets),
atomic_long_read(&tunnel->stats.tx_bytes),
atomic_long_read(&tunnel->stats.tx_errors),
@@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
- session->debug,
+ 0,
jiffies_to_msecs(session->reorder_timeout));
seq_printf(m, " offset 0 l2specific %hu/%hu\n",
session->l2specific_type, l2tp_get_l2specific_len(session));
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 7ed2b4eced94..6cd97c75445c 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -76,7 +76,7 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev
struct l2tp_eth *priv = netdev_priv(dev);
struct l2tp_session *session = priv->session;
unsigned int len = skb->len;
- int ret = l2tp_xmit_skb(session, skb, session->hdr_len);
+ int ret = l2tp_xmit_skb(session, skb);
if (likely(ret == NET_XMIT_SUCCESS)) {
atomic_long_add(len, &priv->tx_bytes);
@@ -128,17 +128,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
struct net_device *dev;
struct l2tp_eth *priv;
- if (session->debug & L2TP_MSG_DATA) {
- unsigned int length;
-
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto error;
-
- pr_debug("%s: eth recv\n", session->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
- }
-
if (!pskb_may_pull(skb, ETH_HLEN))
goto error;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index df2a35b5714a..97ae1255fcb6 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -118,7 +118,6 @@ static int l2tp_ip_recv(struct sk_buff *skb)
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
struct iphdr *iph;
- int length;
if (!pskb_may_pull(skb, 4))
goto discard;
@@ -147,20 +146,6 @@ static int l2tp_ip_recv(struct sk_buff *skb)
if (!tunnel)
goto discard_sess;
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto discard_sess;
-
- /* Point to L2TP header */
- optr = skb->data;
- ptr = skb->data;
- ptr += 4;
- pr_debug("%s: ip recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
- }
-
if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
goto discard_sess;
@@ -248,8 +233,8 @@ static void l2tp_ip_close(struct sock *sk, long timeout)
static void l2tp_ip_destroy_sock(struct sock *sk)
{
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
struct sk_buff *skb;
- struct l2tp_tunnel *tunnel = sk->sk_user_data;
while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
kfree_skb(skb);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index bc757bc7e264..e5e5036257b0 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -131,7 +131,6 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
struct ipv6hdr *iph;
- int length;
if (!pskb_may_pull(skb, 4))
goto discard;
@@ -160,20 +159,6 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
if (!tunnel)
goto discard_sess;
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto discard_sess;
-
- /* Point to L2TP header */
- optr = skb->data;
- ptr = skb->data;
- ptr += 4;
- pr_debug("%s: ip recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
- }
-
if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
goto discard_sess;
@@ -262,7 +247,7 @@ static void l2tp_ip6_close(struct sock *sk, long timeout)
static void l2tp_ip6_destroy_sock(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = sk->sk_user_data;
+ struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk);
lock_sock(sk);
ip6_flush_pending_frames(sk);
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index def78eebca4c..83c015f7f20d 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -229,14 +229,11 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
goto out;
}
- if (attrs[L2TP_ATTR_DEBUG])
- cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]);
-
ret = -EINVAL;
switch (cfg.encap) {
case L2TP_ENCAPTYPE_UDP:
case L2TP_ENCAPTYPE_IP:
- ret = l2tp_tunnel_create(net, fd, proto_version, tunnel_id,
+ ret = l2tp_tunnel_create(fd, proto_version, tunnel_id,
peer_tunnel_id, &cfg, &tunnel);
break;
}
@@ -307,9 +304,6 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info
goto out;
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
ret = l2tp_tunnel_notify(&l2tp_nl_family, info,
tunnel, L2TP_CMD_TUNNEL_MODIFY);
@@ -400,7 +394,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) ||
nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
- nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) ||
nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap))
goto nla_put_failure;
@@ -605,9 +599,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -689,9 +680,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
goto out;
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -730,7 +718,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) ||
- nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) ||
nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
goto nla_put_failure;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 13c3153b40d6..aea85f91f059 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -237,17 +237,9 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
- l2tp_dbg(session, L2TP_MSG_DATA,
- "%s: recv %d byte data frame, passing to ppp\n",
- session->name, data_len);
-
po = pppox_sk(sk);
ppp_input(&po->chan, skb);
} else {
- l2tp_dbg(session, L2TP_MSG_DATA,
- "%s: recv %d byte data frame, passing to L2TP socket\n",
- session->name, data_len);
-
if (sock_queue_rcv_skb(sk, skb) < 0) {
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
@@ -259,7 +251,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
no_sock:
rcu_read_unlock();
- l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name);
+ pr_warn_ratelimited("%s: no socket in recv\n", session->name);
kfree_skb(skb);
}
@@ -324,7 +316,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
}
local_bh_disable();
- l2tp_xmit_skb(session, skb, session->hdr_len);
+ l2tp_xmit_skb(session, skb);
local_bh_enable();
sock_put(sk);
@@ -383,7 +375,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
skb->data[1] = PPP_UI;
local_bh_disable();
- l2tp_xmit_skb(session, skb, session->hdr_len);
+ l2tp_xmit_skb(session, skb);
local_bh_enable();
sock_put(sk);
@@ -710,7 +702,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (!tunnel) {
struct l2tp_tunnel_cfg tcfg = {
.encap = L2TP_ENCAPTYPE_UDP,
- .debug = 0,
};
/* Prevent l2tp_tunnel_register() from trying to set up
@@ -721,7 +712,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
- error = l2tp_tunnel_create(sock_net(sk), info.fd,
+ error = l2tp_tunnel_create(info.fd,
info.version,
info.tunnel_id,
info.peer_tunnel_id, &tcfg,
@@ -840,8 +831,6 @@ out_no_ppp:
drop_refcnt = false;
sk->sk_state = PPPOX_CONNECTED;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
- session->name);
end:
if (error) {
@@ -1076,6 +1065,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Not defined for tunnels */
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
@@ -1090,6 +1082,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Not defined for tunnels */
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
@@ -1103,6 +1098,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Session 0 represents the parent tunnel */
if (!session->session_id && !session->peer_session_id) {
u32 session_id;
@@ -1157,9 +1155,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
- tunnel->debug = val;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
- tunnel->name, tunnel->debug);
+ /* Tunnel debug flags option is deprecated */
break;
default:
@@ -1185,9 +1181,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
break;
}
session->recv_seq = !!val;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set recv_seq=%d\n",
- session->name, session->recv_seq);
break;
case PPPOL2TP_SO_SENDSEQ:
@@ -1203,9 +1196,6 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
}
l2tp_session_set_header_len(session, session->tunnel->version);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set send_seq=%d\n",
- session->name, session->send_seq);
break;
case PPPOL2TP_SO_LNSMODE:
@@ -1214,22 +1204,14 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
break;
}
session->lns_mode = !!val;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set lns_mode=%d\n",
- session->name, session->lns_mode);
break;
case PPPOL2TP_SO_DEBUG:
- session->debug = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
- session->name, session->debug);
+ /* Session debug flags option is deprecated */
break;
case PPPOL2TP_SO_REORDERTO:
session->reorder_timeout = msecs_to_jiffies(val);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set reorder_timeout=%d\n",
- session->name, session->reorder_timeout);
break;
default:
@@ -1297,9 +1279,8 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
- *val = tunnel->debug;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n",
- tunnel->name, tunnel->debug);
+ /* Tunnel debug flags option is deprecated */
+ *val = 0;
break;
default:
@@ -1321,32 +1302,23 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_RECVSEQ:
*val = session->recv_seq;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get recv_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_SENDSEQ:
*val = session->send_seq;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get send_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_LNSMODE:
*val = session->lns_mode;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get lns_mode=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_DEBUG:
- *val = session->debug;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n",
- session->name, *val);
+ /* Session debug flags option is deprecated */
+ *val = 0;
break;
case PPPOL2TP_SO_REORDERTO:
*val = (int)jiffies_to_msecs(session->reorder_timeout);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get reorder_timeout=%d\n", session->name, *val);
break;
default:
@@ -1534,7 +1506,7 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
(tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N',
refcount_read(&tunnel->ref_count) - 1);
seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n",
- tunnel->debug,
+ 0,
atomic_long_read(&tunnel->stats.tx_packets),
atomic_long_read(&tunnel->stats.tx_bytes),
atomic_long_read(&tunnel->stats.tx_errors),
@@ -1580,7 +1552,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
- session->debug,
+ 0,
jiffies_to_msecs(session->reorder_timeout));
seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n",
session->nr, session->ns,
diff --git a/net/l2tp/trace.h b/net/l2tp/trace.h
new file mode 100644
index 000000000000..8596eaa12a2e
--- /dev/null
+++ b/net/l2tp/trace.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM l2tp
+
+#if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_L2TP_H
+
+#include <linux/tracepoint.h>
+#include <linux/l2tp.h>
+#include "l2tp_core.h"
+
+#define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e }
+#define show_encap_type_name(val) \
+ __print_symbolic(val, \
+ encap_type_name(UDP), \
+ encap_type_name(IP))
+
+#define pw_type_name(p) { L2TP_PWTYPE_##p, #p }
+#define show_pw_type_name(val) \
+ __print_symbolic(val, \
+ pw_type_name(ETH_VLAN), \
+ pw_type_name(ETH), \
+ pw_type_name(PPP), \
+ pw_type_name(PPP_AC), \
+ pw_type_name(IP))
+
+DECLARE_EVENT_CLASS(tunnel_only_evt,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_TUNNEL_NAME_MAX)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX);
+ ),
+ TP_printk("%s", __entry->name)
+);
+
+DECLARE_EVENT_CLASS(session_only_evt,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ ),
+ TP_printk("%s", __entry->name)
+);
+
+TRACE_EVENT(register_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_TUNNEL_NAME_MAX)
+ __field(int, fd)
+ __field(u32, tid)
+ __field(u32, ptid)
+ __field(int, version)
+ __field(enum l2tp_encap_type, encap)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX);
+ __entry->fd = tunnel->fd;
+ __entry->tid = tunnel->tunnel_id;
+ __entry->ptid = tunnel->peer_tunnel_id;
+ __entry->version = tunnel->version;
+ __entry->encap = tunnel->encap;
+ ),
+ TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d",
+ __entry->name,
+ __entry->fd > 0 ? "managed" : "unmanaged",
+ show_encap_type_name(__entry->encap),
+ __entry->version,
+ __entry->tid,
+ __entry->ptid,
+ __entry->fd)
+);
+
+DEFINE_EVENT(tunnel_only_evt, delete_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel)
+);
+
+DEFINE_EVENT(tunnel_only_evt, free_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel)
+);
+
+TRACE_EVENT(register_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, tid)
+ __field(u32, ptid)
+ __field(u32, sid)
+ __field(u32, psid)
+ __field(enum l2tp_pwtype, pwtype)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0;
+ __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0;
+ __entry->sid = session->session_id;
+ __entry->psid = session->peer_session_id;
+ __entry->pwtype = session->pwtype;
+ ),
+ TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u",
+ __entry->name,
+ show_pw_type_name(__entry->pwtype),
+ __entry->sid,
+ __entry->psid,
+ __entry->sid,
+ __entry->psid)
+);
+
+DEFINE_EVENT(session_only_evt, delete_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, free_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DECLARE_EVENT_CLASS(session_seqnum_evt,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, ns)
+ __field(u32, nr)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->ns = session->ns;
+ __entry->nr = session->nr;
+ ),
+ TP_printk("%s: ns=%u nr=%u",
+ __entry->name,
+ __entry->ns,
+ __entry->nr)
+);
+
+DEFINE_EVENT(session_seqnum_evt, session_seqnum_update,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DECLARE_EVENT_CLASS(session_pkt_discard_evt,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, pkt_ns)
+ __field(u32, my_nr)
+ __field(u32, reorder_q_len)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->pkt_ns = pkt_ns,
+ __entry->my_nr = session->nr;
+ __entry->reorder_q_len = skb_queue_len(&session->reorder_q);
+ ),
+ TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u",
+ __entry->name,
+ __entry->pkt_ns,
+ __entry->my_nr,
+ __entry->reorder_q_len)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+#endif /* _TRACE_L2TP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 313ba97acae3..cd4cf84a7f99 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -350,7 +350,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
sta->sta.addr, tid);
/* We have no API to update the timeout value in the
* driver so reject the timeout update if the timeout
- * changed. If if did not change, i.e., no real update,
+ * changed. If it did not change, i.e., no real update,
* just reply with success.
*/
rcu_read_lock();
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 87fddd84c621..8d75a4045d6e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -826,9 +826,9 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
memcpy(new->data, resp, resp_len);
if (csa)
- memcpy(new->csa_counter_offsets, csa->counter_offsets_presp,
+ memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp,
csa->n_counter_offsets_presp *
- sizeof(new->csa_counter_offsets[0]));
+ sizeof(new->cntdwn_counter_offsets[0]));
rcu_assign_pointer(sdata->u.ap.probe_resp, new);
if (old)
@@ -837,6 +837,59 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
return 0;
}
+static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_fils_discovery *params)
+{
+ struct fils_discovery_data *new, *old = NULL;
+ struct ieee80211_fils_discovery *fd;
+
+ if (!params->tmpl || !params->tmpl_len)
+ return -EINVAL;
+
+ fd = &sdata->vif.bss_conf.fils_discovery;
+ fd->min_interval = params->min_interval;
+ fd->max_interval = params->max_interval;
+
+ old = sdata_dereference(sdata->u.ap.fils_discovery, sdata);
+ new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+ new->len = params->tmpl_len;
+ memcpy(new->data, params->tmpl, params->tmpl_len);
+ rcu_assign_pointer(sdata->u.ap.fils_discovery, new);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ return 0;
+}
+
+static int
+ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_unsol_bcast_probe_resp *params)
+{
+ struct unsol_bcast_probe_resp_data *new, *old = NULL;
+
+ if (!params->tmpl || !params->tmpl_len)
+ return -EINVAL;
+
+ old = sdata_dereference(sdata->u.ap.unsol_bcast_probe_resp, sdata);
+ new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+ new->len = params->tmpl_len;
+ memcpy(new->data, params->tmpl, params->tmpl_len);
+ rcu_assign_pointer(sdata->u.ap.unsol_bcast_probe_resp, new);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ sdata->vif.bss_conf.unsol_bcast_probe_resp_interval =
+ params->interval;
+
+ return 0;
+}
+
static int ieee80211_set_ftm_responder_params(
struct ieee80211_sub_if_data *sdata,
const u8 *lci, size_t lci_len,
@@ -926,10 +979,10 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
new->tail_len = new_tail_len;
if (csa) {
- new->csa_current_counter = csa->count;
- memcpy(new->csa_counter_offsets, csa->counter_offsets_beacon,
+ new->cntdwn_current_counter = csa->count;
+ memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon,
csa->n_counter_offsets_beacon *
- sizeof(new->csa_counter_offsets[0]));
+ sizeof(new->cntdwn_counter_offsets[0]));
}
/* copy in head */
@@ -1099,12 +1152,26 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
}
err = ieee80211_assign_beacon(sdata, &params->beacon, NULL);
- if (err < 0) {
- ieee80211_vif_release_channel(sdata);
- return err;
- }
+ if (err < 0)
+ goto error;
changed |= err;
+ if (params->fils_discovery.max_interval) {
+ err = ieee80211_set_fils_discovery(sdata,
+ &params->fils_discovery);
+ if (err < 0)
+ goto error;
+ changed |= BSS_CHANGED_FILS_DISCOVERY;
+ }
+
+ if (params->unsol_bcast_probe_resp.interval) {
+ err = ieee80211_set_unsol_bcast_probe_resp(sdata,
+ &params->unsol_bcast_probe_resp);
+ if (err < 0)
+ goto error;
+ changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP;
+ }
+
err = drv_start_ap(sdata->local, sdata);
if (err) {
old = sdata_dereference(sdata->u.ap.beacon, sdata);
@@ -1112,8 +1179,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
if (old)
kfree_rcu(old, rcu_head);
RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
- ieee80211_vif_release_channel(sdata);
- return err;
+ goto error;
}
ieee80211_recalc_dtim(local, sdata);
@@ -1124,6 +1190,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
netif_carrier_on(vlan->dev);
return 0;
+
+error:
+ ieee80211_vif_release_channel(sdata);
+ return err;
}
static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
@@ -1160,6 +1230,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
struct ieee80211_local *local = sdata->local;
struct beacon_data *old_beacon;
struct probe_resp *old_probe_resp;
+ struct fils_discovery_data *old_fils_discovery;
+ struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp;
struct cfg80211_chan_def chandef;
sdata_assert_lock(sdata);
@@ -1168,6 +1240,11 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
if (!old_beacon)
return -ENOENT;
old_probe_resp = sdata_dereference(sdata->u.ap.probe_resp, sdata);
+ old_fils_discovery = sdata_dereference(sdata->u.ap.fils_discovery,
+ sdata);
+ old_unsol_bcast_probe_resp =
+ sdata_dereference(sdata->u.ap.unsol_bcast_probe_resp,
+ sdata);
/* abort any running channel switch */
mutex_lock(&local->mtx);
@@ -1191,9 +1268,15 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
/* remove beacon and probe response */
RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
RCU_INIT_POINTER(sdata->u.ap.probe_resp, NULL);
+ RCU_INIT_POINTER(sdata->u.ap.fils_discovery, NULL);
+ RCU_INIT_POINTER(sdata->u.ap.unsol_bcast_probe_resp, NULL);
kfree_rcu(old_beacon, rcu_head);
if (old_probe_resp)
kfree_rcu(old_probe_resp, rcu_head);
+ if (old_fils_discovery)
+ kfree_rcu(old_fils_discovery, rcu_head);
+ if (old_unsol_bcast_probe_resp)
+ kfree_rcu(old_unsol_bcast_probe_resp, rcu_head);
kfree(sdata->vif.bss_conf.ftmr_params);
sdata->vif.bss_conf.ftmr_params = NULL;
@@ -1696,6 +1779,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
__ieee80211_check_fast_rx_iface(vlansdata);
+ drv_sta_set_4addr(local, sta->sdata, &sta->sta, true);
}
if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -3186,9 +3270,9 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
break;
if ((params->n_counter_offsets_beacon >
- IEEE80211_MAX_CSA_COUNTERS_NUM) ||
+ IEEE80211_MAX_CNTDWN_COUNTERS_NUM) ||
(params->n_counter_offsets_presp >
- IEEE80211_MAX_CSA_COUNTERS_NUM))
+ IEEE80211_MAX_CNTDWN_COUNTERS_NUM))
return -EINVAL;
csa.counter_offsets_beacon = params->counter_offsets_beacon;
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 54080290d6e2..90470392fdaa 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -408,6 +408,7 @@ static const char *hw_flag_names[] = {
FLAG(SUPPORTS_MULTI_BSSID),
FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
FLAG(AMPDU_KEYBORDER_SUPPORT),
+ FLAG(SUPPORTS_TX_ENCAP_OFFLOAD),
#undef FLAG
};
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 41d495d73d3a..bcdfd19a596b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1384,4 +1384,33 @@ static inline int drv_reset_tid_config(struct ieee80211_local *local,
return ret;
}
+
+static inline void drv_update_vif_offload(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+ check_sdata_in_driver(sdata);
+
+ if (!local->ops->update_vif_offload)
+ return;
+
+ trace_drv_update_vif_offload(local, sdata);
+ local->ops->update_vif_offload(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_sta_set_4addr(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, bool enabled)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_set_4addr(local, sdata, sta, enabled);
+ if (local->ops->sta_set_4addr)
+ local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled);
+ trace_drv_return_void(local);
+}
+
#endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 53632c2f5217..c0963969a465 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -145,9 +145,9 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
*pos++ = csa_settings->block_tx ? 1 : 0;
*pos++ = ieee80211_frequency_to_channel(
csa_settings->chandef.chan->center_freq);
- presp->csa_counter_offsets[0] = (pos - presp->head);
+ presp->cntdwn_counter_offsets[0] = (pos - presp->head);
*pos++ = csa_settings->count;
- presp->csa_current_counter = csa_settings->count;
+ presp->cntdwn_current_counter = csa_settings->count;
}
/* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 0b1eaec6649f..ecd9229012bf 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -259,15 +259,27 @@ struct beacon_data {
u8 *head, *tail;
int head_len, tail_len;
struct ieee80211_meshconf_ie *meshconf;
- u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM];
- u8 csa_current_counter;
+ u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
+ u8 cntdwn_current_counter;
struct rcu_head rcu_head;
};
struct probe_resp {
struct rcu_head rcu_head;
int len;
- u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM];
+ u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
+ u8 data[];
+};
+
+struct fils_discovery_data {
+ struct rcu_head rcu_head;
+ int len;
+ u8 data[];
+};
+
+struct unsol_bcast_probe_resp_data {
+ struct rcu_head rcu_head;
+ int len;
u8 data[];
};
@@ -286,6 +298,8 @@ struct ps_data {
struct ieee80211_if_ap {
struct beacon_data __rcu *beacon;
struct probe_resp __rcu *probe_resp;
+ struct fils_discovery_data __rcu *fils_discovery;
+ struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp;
/* to be used after channel switch. */
struct cfg80211_beacon_data *next_beacon;
@@ -989,8 +1003,6 @@ struct ieee80211_sub_if_data {
} debugfs;
#endif
- bool hw_80211_encap;
-
/* must be last, dynamically sized area in this! */
struct ieee80211_vif vif;
};
@@ -1767,6 +1779,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local);
bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
bool update_bss);
+void ieee80211_recalc_offload(struct ieee80211_local *local);
static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
{
@@ -2040,8 +2053,6 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t);
void ieee80211_send_nullfunc(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
bool powersave);
-void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_hdr *hdr);
void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
struct ieee80211_hdr *hdr, bool ack, u16 tx_time);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 9740ae8fa697..7ac9af66f545 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -348,439 +348,6 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
return 0;
}
-void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
- const int offset)
-{
- struct ieee80211_local *local = sdata->local;
- u32 flags = sdata->u.mntr.flags;
-
-#define ADJUST(_f, _s) do { \
- if (flags & MONITOR_FLAG_##_f) \
- local->fif_##_s += offset; \
- } while (0)
-
- ADJUST(FCSFAIL, fcsfail);
- ADJUST(PLCPFAIL, plcpfail);
- ADJUST(CONTROL, control);
- ADJUST(CONTROL, pspoll);
- ADJUST(OTHER_BSS, other_bss);
-
-#undef ADJUST
-}
-
-static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
-{
- struct ieee80211_local *local = sdata->local;
- int i;
-
- for (i = 0; i < IEEE80211_NUM_ACS; i++) {
- if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
- sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE;
- else if (local->hw.queues >= IEEE80211_NUM_ACS)
- sdata->vif.hw_queue[i] = i;
- else
- sdata->vif.hw_queue[i] = 0;
- }
- sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
-}
-
-int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
-{
- struct ieee80211_sub_if_data *sdata;
- int ret;
-
- if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
- return 0;
-
- ASSERT_RTNL();
-
- if (local->monitor_sdata)
- return 0;
-
- sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL);
- if (!sdata)
- return -ENOMEM;
-
- /* set up data */
- sdata->local = local;
- sdata->vif.type = NL80211_IFTYPE_MONITOR;
- snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
- wiphy_name(local->hw.wiphy));
- sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
-
- sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
-
- ieee80211_set_default_queues(sdata);
-
- ret = drv_add_interface(local, sdata);
- if (WARN_ON(ret)) {
- /* ok .. stupid driver, it asked for this! */
- kfree(sdata);
- return ret;
- }
-
- ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
- if (ret) {
- kfree(sdata);
- return ret;
- }
-
- mutex_lock(&local->iflist_mtx);
- rcu_assign_pointer(local->monitor_sdata, sdata);
- mutex_unlock(&local->iflist_mtx);
-
- mutex_lock(&local->mtx);
- ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
- IEEE80211_CHANCTX_EXCLUSIVE);
- mutex_unlock(&local->mtx);
- if (ret) {
- mutex_lock(&local->iflist_mtx);
- RCU_INIT_POINTER(local->monitor_sdata, NULL);
- mutex_unlock(&local->iflist_mtx);
- synchronize_net();
- drv_remove_interface(local, sdata);
- kfree(sdata);
- return ret;
- }
-
- skb_queue_head_init(&sdata->skb_queue);
- INIT_WORK(&sdata->work, ieee80211_iface_work);
-
- return 0;
-}
-
-void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
-{
- struct ieee80211_sub_if_data *sdata;
-
- if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
- return;
-
- ASSERT_RTNL();
-
- mutex_lock(&local->iflist_mtx);
-
- sdata = rcu_dereference_protected(local->monitor_sdata,
- lockdep_is_held(&local->iflist_mtx));
- if (!sdata) {
- mutex_unlock(&local->iflist_mtx);
- return;
- }
-
- RCU_INIT_POINTER(local->monitor_sdata, NULL);
- mutex_unlock(&local->iflist_mtx);
-
- synchronize_net();
-
- mutex_lock(&local->mtx);
- ieee80211_vif_release_channel(sdata);
- mutex_unlock(&local->mtx);
-
- drv_remove_interface(local, sdata);
-
- kfree(sdata);
-}
-
-/*
- * NOTE: Be very careful when changing this function, it must NOT return
- * an error on interface type changes that have been pre-checked, so most
- * checks should be in ieee80211_check_concurrent_iface.
- */
-int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct net_device *dev = wdev->netdev;
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- u32 changed = 0;
- int res;
- u32 hw_reconf_flags = 0;
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_WDS:
- if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
- return -ENOLINK;
- break;
- case NL80211_IFTYPE_AP_VLAN: {
- struct ieee80211_sub_if_data *master;
-
- if (!sdata->bss)
- return -ENOLINK;
-
- mutex_lock(&local->mtx);
- list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
- mutex_unlock(&local->mtx);
-
- master = container_of(sdata->bss,
- struct ieee80211_sub_if_data, u.ap);
- sdata->control_port_protocol =
- master->control_port_protocol;
- sdata->control_port_no_encrypt =
- master->control_port_no_encrypt;
- sdata->control_port_over_nl80211 =
- master->control_port_over_nl80211;
- sdata->control_port_no_preauth =
- master->control_port_no_preauth;
- sdata->vif.cab_queue = master->vif.cab_queue;
- memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
- sizeof(sdata->vif.hw_queue));
- sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
-
- mutex_lock(&local->key_mtx);
- sdata->crypto_tx_tailroom_needed_cnt +=
- master->crypto_tx_tailroom_needed_cnt;
- mutex_unlock(&local->key_mtx);
-
- break;
- }
- case NL80211_IFTYPE_AP:
- sdata->bss = &sdata->u.ap;
- break;
- case NL80211_IFTYPE_MESH_POINT:
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_MONITOR:
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_P2P_DEVICE:
- case NL80211_IFTYPE_OCB:
- case NL80211_IFTYPE_NAN:
- /* no special treatment */
- break;
- case NL80211_IFTYPE_UNSPECIFIED:
- case NUM_NL80211_IFTYPES:
- case NL80211_IFTYPE_P2P_CLIENT:
- case NL80211_IFTYPE_P2P_GO:
- /* cannot happen */
- WARN_ON(1);
- break;
- }
-
- if (local->open_count == 0) {
- res = drv_start(local);
- if (res)
- goto err_del_bss;
- /* we're brought up, everything changes */
- hw_reconf_flags = ~0;
- ieee80211_led_radio(local, true);
- ieee80211_mod_tpt_led_trig(local,
- IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
- }
-
- /*
- * Copy the hopefully now-present MAC address to
- * this interface, if it has the special null one.
- */
- if (dev && is_zero_ether_addr(dev->dev_addr)) {
- memcpy(dev->dev_addr,
- local->hw.wiphy->perm_addr,
- ETH_ALEN);
- memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
-
- if (!is_valid_ether_addr(dev->dev_addr)) {
- res = -EADDRNOTAVAIL;
- goto err_stop;
- }
- }
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_AP_VLAN:
- /* no need to tell driver, but set carrier and chanctx */
- if (rtnl_dereference(sdata->bss->beacon)) {
- ieee80211_vif_vlan_copy_chanctx(sdata);
- netif_carrier_on(dev);
- } else {
- netif_carrier_off(dev);
- }
- break;
- case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
- local->cooked_mntrs++;
- break;
- }
-
- if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
- res = drv_add_interface(local, sdata);
- if (res)
- goto err_stop;
- } else if (local->monitors == 0 && local->open_count == 0) {
- res = ieee80211_add_virtual_monitor(local);
- if (res)
- goto err_stop;
- }
-
- /* must be before the call to ieee80211_configure_filter */
- local->monitors++;
- if (local->monitors == 1) {
- local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
- hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
- }
-
- ieee80211_adjust_monitor_flags(sdata, 1);
- ieee80211_configure_filter(local);
- mutex_lock(&local->mtx);
- ieee80211_recalc_idle(local);
- mutex_unlock(&local->mtx);
-
- netif_carrier_on(dev);
- break;
- default:
- if (coming_up) {
- ieee80211_del_virtual_monitor(local);
-
- res = drv_add_interface(local, sdata);
- if (res)
- goto err_stop;
- res = ieee80211_check_queues(sdata,
- ieee80211_vif_type_p2p(&sdata->vif));
- if (res)
- goto err_del_interface;
- }
-
- if (sdata->vif.type == NL80211_IFTYPE_AP) {
- local->fif_pspoll++;
- local->fif_probe_req++;
-
- ieee80211_configure_filter(local);
- } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- local->fif_probe_req++;
- }
-
- if (sdata->vif.probe_req_reg)
- drv_config_iface_filter(local, sdata,
- FIF_PROBE_REQ,
- FIF_PROBE_REQ);
-
- if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
- sdata->vif.type != NL80211_IFTYPE_NAN)
- changed |= ieee80211_reset_erp_info(sdata);
- ieee80211_bss_info_change_notify(sdata, changed);
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_MESH_POINT:
- case NL80211_IFTYPE_OCB:
- netif_carrier_off(dev);
- break;
- case NL80211_IFTYPE_WDS:
- case NL80211_IFTYPE_P2P_DEVICE:
- case NL80211_IFTYPE_NAN:
- break;
- default:
- /* not reached */
- WARN_ON(1);
- }
-
- /*
- * Set default queue parameters so drivers don't
- * need to initialise the hardware if the hardware
- * doesn't start up with sane defaults.
- * Enable QoS for anything but station interfaces.
- */
- ieee80211_set_wmm_default(sdata, true,
- sdata->vif.type != NL80211_IFTYPE_STATION);
- }
-
- set_bit(SDATA_STATE_RUNNING, &sdata->state);
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_WDS:
- /* Create STA entry for the WDS peer */
- sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
- GFP_KERNEL);
- if (!sta) {
- res = -ENOMEM;
- goto err_del_interface;
- }
-
- sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
- sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
- sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);
-
- res = sta_info_insert(sta);
- if (res) {
- /* STA has been freed */
- goto err_del_interface;
- }
-
- rate_control_rate_init(sta);
- netif_carrier_on(dev);
- break;
- case NL80211_IFTYPE_P2P_DEVICE:
- rcu_assign_pointer(local->p2p_sdata, sdata);
- break;
- case NL80211_IFTYPE_MONITOR:
- if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
- break;
- list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
- break;
- default:
- break;
- }
-
- /*
- * set_multicast_list will be invoked by the networking core
- * which will check whether any increments here were done in
- * error and sync them down to the hardware as filter flags.
- */
- if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
- atomic_inc(&local->iff_allmultis);
-
- if (coming_up)
- local->open_count++;
-
- if (hw_reconf_flags)
- ieee80211_hw_config(local, hw_reconf_flags);
-
- ieee80211_recalc_ps(local);
-
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- local->ops->wake_tx_queue) {
- /* XXX: for AP_VLAN, actually track AP queues */
- if (dev)
- netif_tx_start_all_queues(dev);
- } else if (dev) {
- unsigned long flags;
- int n_acs = IEEE80211_NUM_ACS;
- int ac;
-
- if (local->hw.queues < IEEE80211_NUM_ACS)
- n_acs = 1;
-
- spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
- if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE ||
- (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 &&
- skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) {
- for (ac = 0; ac < n_acs; ac++) {
- int ac_queue = sdata->vif.hw_queue[ac];
-
- if (local->queue_stop_reasons[ac_queue] == 0 &&
- skb_queue_empty(&local->pending[ac_queue]))
- netif_start_subqueue(dev, ac);
- }
- }
- spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
- }
-
- return 0;
- err_del_interface:
- drv_remove_interface(local, sdata);
- err_stop:
- if (!local->open_count)
- drv_stop(local);
- err_del_bss:
- sdata->bss = NULL;
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
- mutex_lock(&local->mtx);
- list_del(&sdata->u.vlan.list);
- mutex_unlock(&local->mtx);
- }
- /* might already be clear but that doesn't matter */
- clear_bit(SDATA_STATE_RUNNING, &sdata->state);
- return res;
-}
-
static int ieee80211_open(struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -1227,60 +794,547 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = {
.ndo_get_stats64 = ieee80211_get_stats64,
};
-static void __ieee80211_set_hw_80211_encap(struct ieee80211_sub_if_data *sdata,
- bool enable)
+static bool ieee80211_iftype_supports_encap_offload(enum nl80211_iftype iftype)
{
- sdata->dev->netdev_ops = enable ? &ieee80211_dataif_8023_ops :
- &ieee80211_dataif_ops;
- sdata->hw_80211_encap = enable;
+ switch (iftype) {
+ /* P2P GO and client are mapped to AP/STATION types */
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_STATION:
+ return true;
+ default:
+ return false;
+ }
}
-bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable)
+static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct ieee80211_local *local = sdata->local;
- struct ieee80211_sub_if_data *iter;
- struct ieee80211_key *key;
+ u32 flags;
+
+ flags = sdata->vif.offload_flags;
+
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) &&
+ ieee80211_iftype_supports_encap_offload(sdata->vif.type)) {
+ flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED;
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) &&
+ local->hw.wiphy->frag_threshold != (u32)-1)
+ flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;
+
+ if (local->monitors)
+ flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;
+ } else {
+ flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;
+ }
+
+ if (sdata->vif.offload_flags == flags)
+ return false;
+
+ sdata->vif.offload_flags = flags;
+ return true;
+}
+
+static void ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *bss = sdata;
+ bool enabled;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (!sdata->bss)
+ return;
+
+ bss = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
+ }
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) ||
+ !ieee80211_iftype_supports_encap_offload(bss->vif.type))
+ return;
+
+ enabled = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED;
+ if (sdata->wdev.use_4addr &&
+ !(bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_4ADDR))
+ enabled = false;
+
+ sdata->dev->netdev_ops = enabled ? &ieee80211_dataif_8023_ops :
+ &ieee80211_dataif_ops;
+}
+
+static void ieee80211_recalc_sdata_offload(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *vsdata;
+
+ if (ieee80211_set_sdata_offload_flags(sdata)) {
+ drv_update_vif_offload(local, sdata);
+ ieee80211_set_vif_encap_ops(sdata);
+ }
+
+ list_for_each_entry(vsdata, &local->interfaces, list) {
+ if (vsdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
+ vsdata->bss != &sdata->u.ap)
+ continue;
+
+ ieee80211_set_vif_encap_ops(vsdata);
+ }
+}
+
+void ieee80211_recalc_offload(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD))
+ return;
mutex_lock(&local->iflist_mtx);
- list_for_each_entry(iter, &local->interfaces, list) {
- struct ieee80211_sub_if_data *disable = NULL;
-
- if (vif->type == NL80211_IFTYPE_MONITOR) {
- disable = iter;
- __ieee80211_set_hw_80211_encap(iter, false);
- } else if (iter->vif.type == NL80211_IFTYPE_MONITOR) {
- disable = sdata;
- enable = false;
- }
- if (disable)
- sdata_dbg(disable,
- "disable hw 80211 encap due to mon co-exist\n");
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ ieee80211_recalc_sdata_offload(sdata);
}
+
mutex_unlock(&local->iflist_mtx);
+}
- if (enable == sdata->hw_80211_encap)
- return enable;
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+ const int offset)
+{
+ struct ieee80211_local *local = sdata->local;
+ u32 flags = sdata->u.mntr.flags;
- if (!sdata->dev)
- return false;
+#define ADJUST(_f, _s) do { \
+ if (flags & MONITOR_FLAG_##_f) \
+ local->fif_##_s += offset; \
+ } while (0)
+
+ ADJUST(FCSFAIL, fcsfail);
+ ADJUST(PLCPFAIL, plcpfail);
+ ADJUST(CONTROL, control);
+ ADJUST(CONTROL, pspoll);
+ ADJUST(OTHER_BSS, other_bss);
+
+#undef ADJUST
+}
+
+static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int i;
+
+ for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+ if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
+ sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE;
+ else if (local->hw.queues >= IEEE80211_NUM_ACS)
+ sdata->vif.hw_queue[i] = i;
+ else
+ sdata->vif.hw_queue[i] = 0;
+ }
+ sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
+}
+
+int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int ret;
+
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
+ return 0;
+
+ ASSERT_RTNL();
+
+ if (local->monitor_sdata)
+ return 0;
+
+ sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+
+ /* set up data */
+ sdata->local = local;
+ sdata->vif.type = NL80211_IFTYPE_MONITOR;
+ snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
+ wiphy_name(local->hw.wiphy));
+ sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
+
+ sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
+
+ ieee80211_set_default_queues(sdata);
+
+ ret = drv_add_interface(local, sdata);
+ if (WARN_ON(ret)) {
+ /* ok .. stupid driver, it asked for this! */
+ kfree(sdata);
+ return ret;
+ }
+
+ ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
+ if (ret) {
+ kfree(sdata);
+ return ret;
+ }
+
+ mutex_lock(&local->iflist_mtx);
+ rcu_assign_pointer(local->monitor_sdata, sdata);
+ mutex_unlock(&local->iflist_mtx);
+
+ mutex_lock(&local->mtx);
+ ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
+ IEEE80211_CHANCTX_EXCLUSIVE);
+ mutex_unlock(&local->mtx);
+ if (ret) {
+ mutex_lock(&local->iflist_mtx);
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+ synchronize_net();
+ drv_remove_interface(local, sdata);
+ kfree(sdata);
+ return ret;
+ }
+
+ skb_queue_head_init(&sdata->skb_queue);
+ INIT_WORK(&sdata->work, ieee80211_iface_work);
+
+ return 0;
+}
+
+void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
+ return;
+
+ ASSERT_RTNL();
+
+ mutex_lock(&local->iflist_mtx);
+
+ sdata = rcu_dereference_protected(local->monitor_sdata,
+ lockdep_is_held(&local->iflist_mtx));
+ if (!sdata) {
+ mutex_unlock(&local->iflist_mtx);
+ return;
+ }
+
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+
+ synchronize_net();
+
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+
+ drv_remove_interface(local, sdata);
+
+ kfree(sdata);
+}
+
+/*
+ * NOTE: Be very careful when changing this function, it must NOT return
+ * an error on interface type changes that have been pre-checked, so most
+ * checks should be in ieee80211_check_concurrent_iface.
+ */
+int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct net_device *dev = wdev->netdev;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ u32 changed = 0;
+ int res;
+ u32 hw_reconf_flags = 0;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_WDS:
+ if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
+ return -ENOLINK;
+ break;
+ case NL80211_IFTYPE_AP_VLAN: {
+ struct ieee80211_sub_if_data *master;
+
+ if (!sdata->bss)
+ return -ENOLINK;
+
+ mutex_lock(&local->mtx);
+ list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
+ mutex_unlock(&local->mtx);
+
+ master = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ sdata->control_port_protocol =
+ master->control_port_protocol;
+ sdata->control_port_no_encrypt =
+ master->control_port_no_encrypt;
+ sdata->control_port_over_nl80211 =
+ master->control_port_over_nl80211;
+ sdata->control_port_no_preauth =
+ master->control_port_no_preauth;
+ sdata->vif.cab_queue = master->vif.cab_queue;
+ memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
+ sizeof(sdata->vif.hw_queue));
+ sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
+
+ mutex_lock(&local->key_mtx);
+ sdata->crypto_tx_tailroom_needed_cnt +=
+ master->crypto_tx_tailroom_needed_cnt;
+ mutex_unlock(&local->key_mtx);
+
+ break;
+ }
+ case NL80211_IFTYPE_AP:
+ sdata->bss = &sdata->u.ap;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_NAN:
+ /* no special treatment */
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ /* cannot happen */
+ WARN_ON(1);
+ break;
+ }
+
+ if (local->open_count == 0) {
+ res = drv_start(local);
+ if (res)
+ goto err_del_bss;
+ /* we're brought up, everything changes */
+ hw_reconf_flags = ~0;
+ ieee80211_led_radio(local, true);
+ ieee80211_mod_tpt_led_trig(local,
+ IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
+ }
+
+ /*
+ * Copy the hopefully now-present MAC address to
+ * this interface, if it has the special null one.
+ */
+ if (dev && is_zero_ether_addr(dev->dev_addr)) {
+ memcpy(dev->dev_addr,
+ local->hw.wiphy->perm_addr,
+ ETH_ALEN);
+ memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
+
+ if (!is_valid_ether_addr(dev->dev_addr)) {
+ res = -EADDRNOTAVAIL;
+ goto err_stop;
+ }
+ }
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ /* no need to tell driver, but set carrier and chanctx */
+ if (rtnl_dereference(sdata->bss->beacon)) {
+ ieee80211_vif_vlan_copy_chanctx(sdata);
+ netif_carrier_on(dev);
+ ieee80211_set_vif_encap_ops(sdata);
+ } else {
+ netif_carrier_off(dev);
+ }
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
+ local->cooked_mntrs++;
+ break;
+ }
+
+ if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
+ res = drv_add_interface(local, sdata);
+ if (res)
+ goto err_stop;
+ } else if (local->monitors == 0 && local->open_count == 0) {
+ res = ieee80211_add_virtual_monitor(local);
+ if (res)
+ goto err_stop;
+ }
+
+ /* must be before the call to ieee80211_configure_filter */
+ local->monitors++;
+ if (local->monitors == 1) {
+ local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
+ hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
+ }
+
+ ieee80211_adjust_monitor_flags(sdata, 1);
+ ieee80211_configure_filter(local);
+ ieee80211_recalc_offload(local);
+ mutex_lock(&local->mtx);
+ ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
+ netif_carrier_on(dev);
+ break;
+ default:
+ if (coming_up) {
+ ieee80211_del_virtual_monitor(local);
+ ieee80211_set_sdata_offload_flags(sdata);
+
+ res = drv_add_interface(local, sdata);
+ if (res)
+ goto err_stop;
+
+ ieee80211_set_vif_encap_ops(sdata);
+ res = ieee80211_check_queues(sdata,
+ ieee80211_vif_type_p2p(&sdata->vif));
+ if (res)
+ goto err_del_interface;
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ local->fif_pspoll++;
+ local->fif_probe_req++;
+
+ ieee80211_configure_filter(local);
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ local->fif_probe_req++;
+ }
+
+ if (sdata->vif.probe_req_reg)
+ drv_config_iface_filter(local, sdata,
+ FIF_PROBE_REQ,
+ FIF_PROBE_REQ);
+
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_NAN)
+ changed |= ieee80211_reset_erp_info(sdata);
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ netif_carrier_off(dev);
+ break;
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_NAN:
+ break;
+ default:
+ /* not reached */
+ WARN_ON(1);
+ }
- if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) &&
- (local->hw.wiphy->frag_threshold != (u32)-1))
- enable = false;
+ /*
+ * Set default queue parameters so drivers don't
+ * need to initialise the hardware if the hardware
+ * doesn't start up with sane defaults.
+ * Enable QoS for anything but station interfaces.
+ */
+ ieee80211_set_wmm_default(sdata, true,
+ sdata->vif.type != NL80211_IFTYPE_STATION);
+ }
- mutex_lock(&sdata->local->key_mtx);
- list_for_each_entry(key, &sdata->key_list, list) {
- if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)
- enable = false;
+ set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_WDS:
+ /* Create STA entry for the WDS peer */
+ sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
+ GFP_KERNEL);
+ if (!sta) {
+ res = -ENOMEM;
+ goto err_del_interface;
+ }
+
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
+ sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);
+
+ res = sta_info_insert(sta);
+ if (res) {
+ /* STA has been freed */
+ goto err_del_interface;
+ }
+
+ rate_control_rate_init(sta);
+ netif_carrier_on(dev);
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ rcu_assign_pointer(local->p2p_sdata, sdata);
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
+ break;
+ list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
+ break;
+ default:
+ break;
}
- mutex_unlock(&sdata->local->key_mtx);
- __ieee80211_set_hw_80211_encap(sdata, enable);
+ /*
+ * set_multicast_list will be invoked by the networking core
+ * which will check whether any increments here were done in
+ * error and sync them down to the hardware as filter flags.
+ */
+ if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+ atomic_inc(&local->iff_allmultis);
+
+ if (coming_up)
+ local->open_count++;
- return enable;
+ if (hw_reconf_flags)
+ ieee80211_hw_config(local, hw_reconf_flags);
+
+ ieee80211_recalc_ps(local);
+
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ local->ops->wake_tx_queue) {
+ /* XXX: for AP_VLAN, actually track AP queues */
+ if (dev)
+ netif_tx_start_all_queues(dev);
+ } else if (dev) {
+ unsigned long flags;
+ int n_acs = IEEE80211_NUM_ACS;
+ int ac;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ n_acs = 1;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE ||
+ (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 &&
+ skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) {
+ for (ac = 0; ac < n_acs; ac++) {
+ int ac_queue = sdata->vif.hw_queue[ac];
+
+ if (local->queue_stop_reasons[ac_queue] == 0 &&
+ skb_queue_empty(&local->pending[ac_queue]))
+ netif_start_subqueue(dev, ac);
+ }
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ }
+
+ return 0;
+ err_del_interface:
+ drv_remove_interface(local, sdata);
+ err_stop:
+ if (!local->open_count)
+ drv_stop(local);
+ err_del_bss:
+ sdata->bss = NULL;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ mutex_lock(&local->mtx);
+ list_del(&sdata->u.vlan.list);
+ mutex_unlock(&local->mtx);
+ }
+ /* might already be clear but that doesn't matter */
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+ return res;
}
-EXPORT_SYMBOL(ieee80211_set_hw_80211_encap);
static void ieee80211_if_free(struct net_device *dev)
{
@@ -1484,7 +1538,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.txpower = INT_MIN; /* unset */
sdata->noack_map = 0;
- sdata->hw_80211_encap = false;
/* only monitor/p2p-device differ */
if (sdata->dev) {
@@ -1619,6 +1672,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
ieee80211_teardown_sdata(sdata);
+ ieee80211_set_sdata_offload_flags(sdata);
ret = drv_change_interface(local, sdata, internal_type, p2p);
if (ret)
type = ieee80211_vif_type_p2p(&sdata->vif);
@@ -1631,6 +1685,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
ieee80211_check_queues(sdata, type);
ieee80211_setup_sdata(sdata, type);
+ ieee80211_set_vif_encap_ops(sdata);
err = ieee80211_do_open(&sdata->wdev, false);
WARN(err, "type change: do_open returned %d", err);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 2df636c32432..8c5f829ff6d7 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -177,13 +177,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
}
}
- /* TKIP countermeasures don't work in encap offload mode */
- if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP &&
- sdata->hw_80211_encap) {
- sdata_dbg(sdata, "TKIP is not allowed in hw 80211 encap mode\n");
- return -EINVAL;
- }
-
ret = drv_set_key(key->local, SET_KEY, sdata,
sta ? &sta->sta : NULL, &key->conf);
@@ -219,14 +212,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
case WLAN_CIPHER_SUITE_CCMP_256:
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
- /* We cannot do software crypto of data frames with
- * encapsulation offload enabled. However for 802.11w to
- * function properly we need cmac/gmac keys.
- */
- if (sdata->hw_80211_encap)
- return -EINVAL;
- fallthrough;
-
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index b4a2efe8e83a..523380aed92e 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1168,7 +1168,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT;
}
- local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM;
+ local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CNTDWN_COUNTERS_NUM;
/*
* We use the number of queues for feature tests (QoS, HT) internally
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 7ecd801a943b..ce5825d6f1d1 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -672,7 +672,7 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh)
* @hdr: 802.11 frame header
* @fc: frame control field
* @meshda: destination address in the mesh
- * @meshsa: source address address in the mesh. Same as TA, as frame is
+ * @meshsa: source address in the mesh. Same as TA, as frame is
* locally originated.
*
* Return the length of the 802.11 (does not include a mesh control header)
@@ -864,8 +864,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
*pos++ = 0x0;
*pos++ = ieee80211_frequency_to_channel(
csa->settings.chandef.chan->center_freq);
- bcn->csa_current_counter = csa->settings.count;
- bcn->csa_counter_offsets[0] = hdr_len + 6;
+ bcn->cntdwn_current_counter = csa->settings.count;
+ bcn->cntdwn_counter_offsets[0] = hdr_len + 6;
*pos++ = csa->settings.count;
*pos++ = WLAN_EID_CHAN_SWITCH_PARAM;
*pos++ = 6;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index bec23d2eee7a..313eee12410e 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -212,7 +212,7 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata,
skb->priority = 7;
info->control.vif = &sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
ieee80211_set_qos_hdr(sdata, skb);
ieee80211_mps_set_frame_flags(sdata, NULL, hdr);
}
@@ -1163,7 +1163,7 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata,
if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN)
skb_to_free = skb_dequeue(&mpath->frame_queue);
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
ieee80211_set_qos_hdr(sdata, skb);
skb_queue_tail(&mpath->frame_queue, skb);
if (skb_to_free)
diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c
index 031e905f684a..76d19c09d26e 100644
--- a/net/mac80211/mesh_ps.c
+++ b/net/mac80211/mesh_ps.c
@@ -432,7 +432,7 @@ static void mpsp_qos_null_append(struct sta_info *sta,
info = IEEE80211_SKB_CB(new_skb);
info->control.vif = &sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
__skb_queue_tail(frames, new_skb);
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2e400b0ff696..2489c6c64c2d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2432,23 +2432,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
}
-void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_hdr *hdr)
-{
- /*
- * We can postpone the mgd.timer whenever receiving unicast frames
- * from AP because we know that the connection is working both ways
- * at that time. But multicast frames (and hence also beacons) must
- * be ignored here, because we need to trigger the timer during
- * data idle periods for sending the periodic probe request to the
- * AP we're connected to.
- */
- if (is_multicast_ether_addr(hdr->addr1))
- return;
-
- ieee80211_sta_reset_conn_monitor(sdata);
-}
-
static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -2521,21 +2504,13 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
{
ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
- if (!ieee80211_is_data(hdr->frame_control))
- return;
-
- if (ieee80211_is_any_nullfunc(hdr->frame_control) &&
- sdata->u.mgd.probe_send_count > 0) {
- if (ack)
- ieee80211_sta_reset_conn_monitor(sdata);
- else
- sdata->u.mgd.nullfunc_failed = true;
- ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ if (!ieee80211_is_any_nullfunc(hdr->frame_control) ||
+ !sdata->u.mgd.probe_send_count)
return;
- }
- if (ack)
- ieee80211_sta_reset_conn_monitor(sdata);
+ if (!ack)
+ sdata->u.mgd.nullfunc_failed = true;
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
}
static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata,
@@ -3548,6 +3523,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
goto out;
}
+ if (sdata->wdev.use_4addr)
+ drv_sta_set_4addr(local, sdata, &sta->sta, true);
+
mutex_unlock(&sdata->local->sta_mtx);
/*
@@ -3605,8 +3583,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* Start timer to probe the connection to the AP now.
* Also start the timer that will detect beacon loss.
*/
- ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
ieee80211_sta_reset_beacon_monitor(sdata);
+ ieee80211_sta_reset_conn_monitor(sdata);
ret = true;
out:
@@ -4577,10 +4555,26 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t)
from_timer(sdata, t, u.mgd.conn_mon_timer);
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ unsigned long timeout;
if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn)
return;
+ sta = sta_info_get(sdata, ifmgd->bssid);
+ if (!sta)
+ return;
+
+ timeout = sta->status_stats.last_ack;
+ if (time_before(sta->status_stats.last_ack, sta->rx_stats.last_rx))
+ timeout = sta->rx_stats.last_rx;
+ timeout += IEEE80211_CONNECTION_IDLE_TIME;
+
+ if (time_is_before_jiffies(timeout)) {
+ mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(timeout));
+ return;
+ }
+
ieee80211_queue_work(&local->hw, &ifmgd->monitor_work);
}
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index f470d1a7ce9b..1ac7b8c374c9 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -916,7 +916,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
if (beacon)
for (i = 0; i < params->n_csa_offsets; i++)
data[params->csa_offsets[i]] =
- beacon->csa_current_counter;
+ beacon->cntdwn_current_counter;
rcu_read_unlock();
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a959ebf56852..7f88a1b2215c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1812,9 +1812,6 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
sta->rx_stats.last_rate = sta_stats_encode_rate(status);
}
- if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
- ieee80211_sta_rx_notify(rx->sdata, hdr);
-
sta->rx_stats.fragments++;
u64_stats_update_begin(&rx->sta->rx_stats.syncp);
@@ -2900,7 +2897,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY);
info = IEEE80211_SKB_CB(fwd_skb);
memset(info, 0, sizeof(*info));
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
info->control.vif = &rx->sdata->vif;
info->control.jiffies = jiffies;
if (is_multicast_ether_addr(fwd_hdr->addr1)) {
@@ -4149,7 +4146,6 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
fastrx.expected_ds_bits = 0;
} else {
- fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0;
fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3);
fastrx.expected_ds_bits =
@@ -4379,11 +4375,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
pskb_trim(skb, skb->len - fast_rx->icv_len))
goto drop;
- if (unlikely(fast_rx->sta_notify)) {
- ieee80211_sta_rx_notify(rx->sdata, hdr);
- fast_rx->sta_notify = false;
- }
-
/* statistics part of ieee80211_rx_h_sta_process() */
if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
stats->last_signal = status->signal;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d5010116cf4d..91a61b44b4e0 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -336,7 +336,6 @@ struct ieee80211_fast_tx {
* @expected_ds_bits: from/to DS bits expected
* @icv_len: length of the MIC if present
* @key: bool indicating encryption is expected (key is set)
- * @sta_notify: notify the MLME code (once)
* @internal_forward: forward froms internally on AP/VLAN type interfaces
* @uses_rss: copy of USES_RSS hw flag
* @da_offs: offset of the DA in the header (for header conversion)
@@ -352,7 +351,6 @@ struct ieee80211_fast_rx {
__le16 expected_ds_bits;
u8 icv_len;
u8 key:1,
- sta_notify:1,
internal_forward:1,
uses_rss:1;
u8 da_offs, sa_offs;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 0794396a7988..7fe5bececfd9 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -66,8 +66,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
info->control.jiffies = jiffies;
info->control.vif = &sta->sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
- IEEE80211_TX_INTFL_RETRANSMISSION;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
+ info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
sta->status_stats.filtered++;
@@ -184,18 +184,6 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
struct ieee80211_mgmt *mgmt = (void *) skb->data;
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb);
-
- if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- sta->status_stats.last_ack = jiffies;
- if (txinfo->status.is_valid_ack_signal) {
- sta->status_stats.last_ack_signal =
- (s8)txinfo->status.ack_signal;
- sta->status_stats.ack_signal_filled = true;
- ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
- -txinfo->status.ack_signal);
- }
- }
if (ieee80211_is_data_qos(mgmt->frame_control)) {
struct ieee80211_hdr *hdr = (void *) skb->data;
@@ -890,7 +878,8 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
}
static void __ieee80211_tx_status(struct ieee80211_hw *hw,
- struct ieee80211_tx_status *status)
+ struct ieee80211_tx_status *status,
+ int rates_idx, int retry_count)
{
struct sk_buff *skb = status->skb;
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -899,17 +888,12 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
struct sta_info *sta;
__le16 fc;
struct ieee80211_supported_band *sband;
- int retry_count;
- int rates_idx;
bool send_to_cooked;
bool acked;
bool noack_success;
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
- u16 tx_time_est;
-
- rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
sband = local->hw.wiphy->bands[info->band];
fc = hdr->frame_control;
@@ -987,24 +971,14 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) {
ieee80211_handle_filtered_frame(local, sta, skb);
return;
- } else {
+ } else if (ieee80211_is_data_present(fc)) {
if (!acked && !noack_success)
- sta->status_stats.retry_failed++;
- sta->status_stats.retry_count += retry_count;
+ sta->status_stats.msdu_failed[tid]++;
- if (ieee80211_is_data_present(fc)) {
- if (!acked && !noack_success)
- sta->status_stats.msdu_failed[tid]++;
-
- sta->status_stats.msdu_retries[tid] +=
- retry_count;
- }
+ sta->status_stats.msdu_retries[tid] +=
+ retry_count;
}
- rate_control_tx_status(local, sband, status);
- if (ieee80211_vif_is_mesh(&sta->sdata->vif))
- ieee80211s_update_metric(local, sta, status);
-
if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
ieee80211_frame_acked(sta, skb);
@@ -1012,37 +986,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
acked, info->status.tx_time);
-
- if (info->status.tx_time &&
- wiphy_ext_feature_isset(local->hw.wiphy,
- NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
- ieee80211_sta_register_airtime(&sta->sta, tid,
- info->status.tx_time, 0);
-
- if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) {
- /* Do this here to avoid the expensive lookup of the sta
- * in ieee80211_report_used_skb().
- */
- ieee80211_sta_update_pending_airtime(local, sta,
- skb_get_queue_mapping(skb),
- tx_time_est,
- true);
- ieee80211_info_set_tx_time_est(info, 0);
- }
-
- if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- if (acked) {
- if (sta->status_stats.lost_packets)
- sta->status_stats.lost_packets = 0;
-
- /* Track when last TDLS packet was ACKed */
- sta->status_stats.last_pkt_time = jiffies;
- } else if (noack_success) {
- /* nothing to do here, do not account as lost */
- } else {
- ieee80211_lost_packet(sta, info);
- }
- }
}
/* SNMP counters
@@ -1101,7 +1044,10 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
* with this test...
*/
if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) {
- dev_kfree_skb(skb);
+ if (status->free_list)
+ list_add_tail(&skb->list, status->free_list);
+ else
+ dev_kfree_skb(skb);
return;
}
@@ -1126,7 +1072,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
if (sta)
status.sta = &sta->sta;
- __ieee80211_tx_status(hw, &status);
+ ieee80211_tx_status_ext(hw, &status);
rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_tx_status);
@@ -1137,10 +1083,12 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_tx_info *info = status->info;
struct ieee80211_sta *pubsta = status->sta;
+ struct sk_buff *skb = status->skb;
struct ieee80211_supported_band *sband;
- struct sta_info *sta;
- int retry_count;
+ struct sta_info *sta = NULL;
+ int rates_idx, retry_count;
bool acked, noack_success;
+ u16 tx_time_est;
if (pubsta) {
sta = container_of(pubsta, struct sta_info, sta);
@@ -1149,13 +1097,22 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
sta->tx_stats.last_rate_info = *status->rate;
}
- if (status->skb)
- return __ieee80211_tx_status(hw, status);
+ if (skb && (tx_time_est =
+ ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) {
+ /* Do this here to avoid the expensive lookup of the sta
+ * in ieee80211_report_used_skb().
+ */
+ ieee80211_sta_update_pending_airtime(local, sta,
+ skb_get_queue_mapping(skb),
+ tx_time_est,
+ true);
+ ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0);
+ }
- if (!status->sta)
- return;
+ if (!status->info)
+ goto free;
- ieee80211_tx_get_rates(hw, info, &retry_count);
+ rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
sband = hw->wiphy->bands[info->band];
@@ -1167,20 +1124,30 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
sta->status_stats.retry_failed++;
sta->status_stats.retry_count += retry_count;
- if (acked) {
- sta->status_stats.last_ack = jiffies;
+ if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
+ if (acked) {
+ sta->status_stats.last_ack = jiffies;
- if (sta->status_stats.lost_packets)
- sta->status_stats.lost_packets = 0;
+ if (sta->status_stats.lost_packets)
+ sta->status_stats.lost_packets = 0;
- /* Track when last packet was ACKed */
- sta->status_stats.last_pkt_time = jiffies;
- } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
- return;
- } else if (noack_success) {
- /* nothing to do here, do not account as lost */
- } else {
- ieee80211_lost_packet(sta, info);
+ /* Track when last packet was ACKed */
+ sta->status_stats.last_pkt_time = jiffies;
+
+ if (info->status.is_valid_ack_signal) {
+ sta->status_stats.last_ack_signal =
+ (s8)info->status.ack_signal;
+ sta->status_stats.ack_signal_filled = true;
+ ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
+ -info->status.ack_signal);
+ }
+ } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ return;
+ } else if (noack_success) {
+ /* nothing to do here, do not account as lost */
+ } else {
+ ieee80211_lost_packet(sta, info);
+ }
}
rate_control_tx_status(local, sband, status);
@@ -1188,6 +1155,10 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
ieee80211s_update_metric(local, sta, status);
}
+ if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP))
+ return __ieee80211_tx_status(hw, status, rates_idx,
+ retry_count);
+
if (acked || noack_success) {
I802_DEBUG_INC(local->dot11TransmittedFrameCount);
if (!pubsta)
@@ -1199,6 +1170,16 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
} else {
I802_DEBUG_INC(local->dot11FailedCount);
}
+
+free:
+ if (!skb)
+ return;
+
+ ieee80211_report_used_skb(local, skb, false);
+ if (status->free_list)
+ list_add_tail(&skb->list, status->free_list);
+ else
+ dev_kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_tx_status_ext);
@@ -1225,69 +1206,23 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
struct sk_buff *skb)
{
- struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_sub_if_data *sdata;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_status status = {
+ .skb = skb,
+ .info = IEEE80211_SKB_CB(skb),
+ };
struct sta_info *sta;
- int retry_count;
- int rates_idx;
- bool acked;
sdata = vif_to_sdata(vif);
- acked = info->flags & IEEE80211_TX_STAT_ACK;
- rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
-
rcu_read_lock();
- if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
- goto counters_update;
-
- if (IS_ERR(sta))
- goto counters_update;
-
- if (!acked)
- sta->status_stats.retry_failed++;
-
- if (rates_idx != -1)
- sta->tx_stats.last_rate = info->status.rates[rates_idx];
-
- sta->status_stats.retry_count += retry_count;
-
- if (ieee80211_hw_check(hw, REPORTS_TX_ACK_STATUS)) {
- if (acked && vif->type == NL80211_IFTYPE_STATION)
- ieee80211_sta_reset_conn_monitor(sdata);
-
- sta->status_stats.last_ack = jiffies;
- if (info->flags & IEEE80211_TX_STAT_ACK) {
- if (sta->status_stats.lost_packets)
- sta->status_stats.lost_packets = 0;
+ if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta))
+ status.sta = &sta->sta;
- sta->status_stats.last_pkt_time = jiffies;
- } else {
- ieee80211_lost_packet(sta, info);
- }
- }
+ ieee80211_tx_status_ext(hw, &status);
-counters_update:
rcu_read_unlock();
- ieee80211_led_tx(local);
-
- if (!(info->flags & IEEE80211_TX_STAT_ACK) &&
- !(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED))
- goto skip_stats_update;
-
- I802_DEBUG_INC(local->dot11TransmittedFrameCount);
- if (is_multicast_ether_addr(skb->data))
- I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount);
- if (retry_count > 0)
- I802_DEBUG_INC(local->dot11RetryCount);
- if (retry_count > 1)
- I802_DEBUG_INC(local->dot11MultipleRetryCount);
-
-skip_stats_update:
- ieee80211_report_used_skb(local, skb, false);
- dev_kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_tx_status_8023);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 50ab5b9d8eab..89723907a094 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2734,6 +2734,39 @@ TRACE_EVENT(drv_get_ftm_responder_stats,
)
);
+DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_sta_set_4addr,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, bool enabled),
+
+ TP_ARGS(local, sdata, sta, enabled),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(bool, enabled)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " enabled:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->enabled
+ )
+);
+
#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index dca01d7e6e3e..adc83d830691 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -531,7 +531,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
info->control.jiffies = jiffies;
info->control.vif = &tx->sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb);
spin_unlock(&sta->ps_lock);
@@ -1134,7 +1134,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
tx->sta->sta.addr, tx->sta->sta.aid);
}
info->control.vif = &tx->sdata->vif;
- info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
__skb_queue_tail(&tid_tx->pending, skb);
if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER)
@@ -1179,7 +1179,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
* we are doing the needed processing, so remove the flag
* now.
*/
- info->flags &= ~IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.flags &= ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
hdr = (struct ieee80211_hdr *) skb->data;
@@ -1258,7 +1258,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
return NULL;
- if (!(info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) &&
+ if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
if ((!ieee80211_is_mgmt(hdr->frame_control) ||
ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
@@ -3649,7 +3649,7 @@ begin:
else
info->flags &= ~IEEE80211_TX_CTL_AMPDU;
- if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP)
+ if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)
goto encap_out;
if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
@@ -4190,38 +4190,18 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata,
static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
struct net_device *dev, struct sta_info *sta,
- struct sk_buff *skb)
+ struct ieee80211_key *key, struct sk_buff *skb)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ethhdr *ehdr = (struct ethhdr *)skb->data;
struct ieee80211_local *local = sdata->local;
- bool authorized = false;
- bool multicast;
- unsigned char *ra = ehdr->h_dest;
-
- if (IS_ERR(sta) || (sta && !sta->uploaded))
- sta = NULL;
-
- if (sdata->vif.type == NL80211_IFTYPE_STATION &&
- (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER)))
- ra = sdata->u.mgd.bssid;
+ struct tid_ampdu_tx *tid_tx;
+ u8 tid;
- if (is_zero_ether_addr(ra))
- goto out_free;
-
- multicast = is_multicast_ether_addr(ra);
-
- if (sta)
- authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
-
- if (!multicast && !authorized &&
- (ehdr->h_proto != sdata->control_port_protocol ||
- !ether_addr_equal(sdata->vif.addr, ehdr->h_source)))
- goto out_free;
-
- if (multicast && sdata->vif.type == NL80211_IFTYPE_AP &&
- !atomic_read(&sdata->u.ap.num_mcast_sta))
- goto out_free;
+ if (local->ops->wake_tx_queue) {
+ u16 queue = __ieee80211_select_queue(sdata, sta, skb);
+ skb_set_queue_mapping(skb, queue);
+ skb_get_hash(skb);
+ }
if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) &&
test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
@@ -4229,36 +4209,42 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
memset(info, 0, sizeof(*info));
- if (unlikely(!multicast && skb->sk &&
- skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
- info->ack_frame_id = ieee80211_store_ack_skb(local, skb,
- &info->flags, NULL);
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (tid_tx) {
+ if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+ /* fall back to non-offload slow path */
+ __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL);
+ return;
+ }
- if (unlikely(sdata->control_port_protocol == ehdr->h_proto)) {
- if (sdata->control_port_no_encrypt)
- info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
- info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ if (tid_tx->timeout)
+ tid_tx->last_tx = jiffies;
}
- if (multicast)
- info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ if (unlikely(skb->sk &&
+ skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
+ info->ack_frame_id = ieee80211_store_ack_skb(local, skb,
+ &info->flags, NULL);
info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
ieee80211_tx_stats(dev, skb->len);
- if (sta) {
- sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
- sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
- }
+ sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+ sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(sdata->bss,
struct ieee80211_sub_if_data, u.ap);
- info->control.flags |= IEEE80211_TX_CTRL_HW_80211_ENCAP;
+ info->flags |= IEEE80211_TX_CTL_HW_80211_ENCAP;
info->control.vif = &sdata->vif;
+ if (key)
+ info->control.hw_key = &key->conf;
+
ieee80211_tx_8023(sdata, skb, skb->len, sta, false);
return;
@@ -4271,12 +4257,10 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ethhdr *ehdr = (struct ethhdr *)skb->data;
+ struct ieee80211_key *key;
struct sta_info *sta;
-
- if (WARN_ON(!sdata->hw_80211_encap)) {
- kfree_skb(skb);
- return NETDEV_TX_OK;
- }
+ bool offload = true;
if (unlikely(skb->len < ETH_HLEN)) {
kfree_skb(skb);
@@ -4285,11 +4269,26 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
rcu_read_lock();
- if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
kfree_skb(skb);
+ goto out;
+ }
+
+ if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded ||
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+ sdata->control_port_protocol == ehdr->h_proto))
+ offload = false;
+ else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) &&
+ (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) ||
+ key->conf.cipher == WLAN_CIPHER_SUITE_TKIP))
+ offload = false;
+
+ if (offload)
+ ieee80211_8023_xmit(sdata, dev, sta, key, skb);
else
- ieee80211_8023_xmit(sdata, dev, sta, skb);
+ ieee80211_subif_start_xmit(skb, dev);
+out:
rcu_read_unlock();
return NETDEV_TX_OK;
@@ -4365,7 +4364,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
sdata = vif_to_sdata(info->control.vif);
- if (info->flags & IEEE80211_TX_INTFL_NEED_TXPROCESSING) {
+ if (info->control.flags & IEEE80211_TX_INTCFL_NEED_TXPROCESSING) {
chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
if (unlikely(!chanctx_conf)) {
dev_kfree_skb(skb);
@@ -4373,7 +4372,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
}
info->band = chanctx_conf->def.chan->band;
result = ieee80211_tx(sdata, NULL, skb, true);
- } else if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) {
+ } else if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) {
if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
dev_kfree_skb(skb);
return true;
@@ -4538,14 +4537,14 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
return 0;
}
-static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata,
- struct beacon_data *beacon)
+static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata,
+ struct beacon_data *beacon)
{
struct probe_resp *resp;
u8 *beacon_data;
size_t beacon_data_len;
int i;
- u8 count = beacon->csa_current_counter;
+ u8 count = beacon->cntdwn_current_counter;
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
@@ -4565,36 +4564,36 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata,
}
rcu_read_lock();
- for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; ++i) {
+ for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; ++i) {
resp = rcu_dereference(sdata->u.ap.probe_resp);
- if (beacon->csa_counter_offsets[i]) {
- if (WARN_ON_ONCE(beacon->csa_counter_offsets[i] >=
+ if (beacon->cntdwn_counter_offsets[i]) {
+ if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[i] >=
beacon_data_len)) {
rcu_read_unlock();
return;
}
- beacon_data[beacon->csa_counter_offsets[i]] = count;
+ beacon_data[beacon->cntdwn_counter_offsets[i]] = count;
}
if (sdata->vif.type == NL80211_IFTYPE_AP && resp)
- resp->data[resp->csa_counter_offsets[i]] = count;
+ resp->data[resp->cntdwn_counter_offsets[i]] = count;
}
rcu_read_unlock();
}
-static u8 __ieee80211_csa_update_counter(struct beacon_data *beacon)
+static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon)
{
- beacon->csa_current_counter--;
+ beacon->cntdwn_current_counter--;
/* the counter should never reach 0 */
- WARN_ON_ONCE(!beacon->csa_current_counter);
+ WARN_ON_ONCE(!beacon->cntdwn_current_counter);
- return beacon->csa_current_counter;
+ return beacon->cntdwn_current_counter;
}
-u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
+u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct beacon_data *beacon = NULL;
@@ -4612,15 +4611,15 @@ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
if (!beacon)
goto unlock;
- count = __ieee80211_csa_update_counter(beacon);
+ count = __ieee80211_beacon_update_cntdwn(beacon);
unlock:
rcu_read_unlock();
return count;
}
-EXPORT_SYMBOL(ieee80211_csa_update_counter);
+EXPORT_SYMBOL(ieee80211_beacon_update_cntdwn);
-void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
+void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct beacon_data *beacon = NULL;
@@ -4637,15 +4636,15 @@ void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
if (!beacon)
goto unlock;
- if (counter < beacon->csa_current_counter)
- beacon->csa_current_counter = counter;
+ if (counter < beacon->cntdwn_current_counter)
+ beacon->cntdwn_current_counter = counter;
unlock:
rcu_read_unlock();
}
-EXPORT_SYMBOL(ieee80211_csa_set_counter);
+EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn);
-bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
+bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct beacon_data *beacon = NULL;
@@ -4688,20 +4687,21 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
goto out;
}
- if (!beacon->csa_counter_offsets[0])
+ if (!beacon->cntdwn_counter_offsets[0])
goto out;
- if (WARN_ON_ONCE(beacon->csa_counter_offsets[0] > beacon_data_len))
+ if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[0] > beacon_data_len))
goto out;
- if (beacon_data[beacon->csa_counter_offsets[0]] == 1)
+ if (beacon_data[beacon->cntdwn_counter_offsets[0]] == 1)
ret = true;
+
out:
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL(ieee80211_csa_is_complete);
+EXPORT_SYMBOL(ieee80211_beacon_cntdwn_is_complete);
static int ieee80211_beacon_protect(struct sk_buff *skb,
struct ieee80211_local *local,
@@ -4761,11 +4761,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
beacon = rcu_dereference(ap->beacon);
if (beacon) {
- if (beacon->csa_counter_offsets[0]) {
+ if (beacon->cntdwn_counter_offsets[0]) {
if (!is_template)
- __ieee80211_csa_update_counter(beacon);
+ ieee80211_beacon_update_cntdwn(vif);
- ieee80211_set_csa(sdata, beacon);
+ ieee80211_set_beacon_cntdwn(sdata, beacon);
}
/*
@@ -4809,11 +4809,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (!beacon)
goto out;
- if (beacon->csa_counter_offsets[0]) {
+ if (beacon->cntdwn_counter_offsets[0]) {
if (!is_template)
- __ieee80211_csa_update_counter(beacon);
+ __ieee80211_beacon_update_cntdwn(beacon);
- ieee80211_set_csa(sdata, beacon);
+ ieee80211_set_beacon_cntdwn(sdata, beacon);
}
skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
@@ -4833,16 +4833,16 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (!beacon)
goto out;
- if (beacon->csa_counter_offsets[0]) {
+ if (beacon->cntdwn_counter_offsets[0]) {
if (!is_template)
/* TODO: For mesh csa_counter is in TU, so
* decrementing it by one isn't correct, but
* for now we leave it consistent with overall
* mac80211's behavior.
*/
- __ieee80211_csa_update_counter(beacon);
+ __ieee80211_beacon_update_cntdwn(beacon);
- ieee80211_set_csa(sdata, beacon);
+ ieee80211_set_beacon_cntdwn(sdata, beacon);
}
if (ifmsh->sync_ops)
@@ -4874,13 +4874,13 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (offs && beacon) {
int i;
- for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; i++) {
- u16 csa_off = beacon->csa_counter_offsets[i];
+ for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) {
+ u16 csa_off = beacon->cntdwn_counter_offsets[i];
if (!csa_off)
continue;
- offs->csa_counter_offs[i] = csa_off_base + csa_off;
+ offs->cntdwn_counter_offs[i] = csa_off_base + csa_off;
}
}
@@ -4999,6 +4999,63 @@ out:
}
EXPORT_SYMBOL(ieee80211_proberesp_get);
+struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct sk_buff *skb = NULL;
+ struct fils_discovery_data *tmpl = NULL;
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return NULL;
+
+ rcu_read_lock();
+ tmpl = rcu_dereference(sdata->u.ap.fils_discovery);
+ if (!tmpl) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len);
+ if (skb) {
+ skb_reserve(skb, sdata->local->hw.extra_tx_headroom);
+ skb_put_data(skb, tmpl->data, tmpl->len);
+ }
+
+ rcu_read_unlock();
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl);
+
+struct sk_buff *
+ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct sk_buff *skb = NULL;
+ struct unsol_bcast_probe_resp_data *tmpl = NULL;
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return NULL;
+
+ rcu_read_lock();
+ tmpl = rcu_dereference(sdata->u.ap.unsol_bcast_probe_resp);
+ if (!tmpl) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len);
+ if (skb) {
+ skb_reserve(skb, sdata->local->hw.extra_tx_headroom);
+ skb_put_data(skb, tmpl->data, tmpl->len);
+ }
+
+ rcu_read_unlock();
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl);
+
struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
struct ieee80211_vif *vif)
{
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index d1b64d0751f2..fb0e3a657d2d 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -315,10 +315,6 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
- /* If HT IE reported 3839 bytes only, stay with that size. */
- if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839)
- return;
-
switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 0a6a15f3456d..056986c7a228 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -22,6 +22,11 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
+ SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
+ SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE),
+ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
+ SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index d7de340fc997..937a177729f1 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -15,6 +15,11 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */
+ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */
+ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */
+ MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */
+ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 770da3627848..b4a9624d7bf2 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -23,8 +23,6 @@ static int pm_nl_pernet_id;
struct mptcp_pm_addr_entry {
struct list_head list;
- unsigned int flags;
- int ifindex;
struct mptcp_addr_info addr;
struct rcu_head rcu;
};
@@ -129,7 +127,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
rcu_read_lock();
spin_lock_bh(&msk->join_list_lock);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
/* avoid any address already in use by subflows and
@@ -160,7 +158,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
if (i++ == pos) {
ret = entry;
@@ -220,8 +218,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
msk->pm.subflows++;
check_work_pending(msk);
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect(sk, local->ifindex,
- &local->addr, &remote);
+ __mptcp_subflow_connect(sk, &local->addr, &remote);
spin_lock_bh(&msk->pm.lock);
return;
}
@@ -267,13 +264,13 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
local.family = remote.family;
spin_unlock_bh(&msk->pm.lock);
- __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
+ __mptcp_subflow_connect((struct sock *)msk, &local, &remote);
spin_lock_bh(&msk->pm.lock);
}
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
- return (entry->flags &
+ return (entry->addr.flags &
(MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
@@ -303,9 +300,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max++;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max++;
entry->addr.id = pernet->next_id++;
@@ -358,8 +355,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
if (!entry)
return -ENOMEM;
- entry->flags = 0;
entry->addr = skc_local;
+ entry->addr.ifindex = 0;
+ entry->addr.flags = 0;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -397,8 +395,8 @@ mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
[MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
[MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
[MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
- [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr), },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
[MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
[MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
@@ -473,14 +471,17 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
skip_family:
- if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
- entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
+ u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ entry->addr.ifindex = val;
+ }
if (tb[MPTCP_PM_ADDR_ATTR_ID])
entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
- entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+ entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
return 0;
}
@@ -548,9 +549,9 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
ret = -EINVAL;
goto out;
}
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
pernet->add_addr_signal_max--;
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
pernet->local_addr_max--;
pernet->addrs--;
@@ -606,10 +607,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
- if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags))
goto nla_put_failure;
- if (entry->ifindex &&
- nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ if (entry->addr.ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex))
goto nla_put_failure;
if (addr->family == AF_INET &&
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 365ba96c84b0..386cd4e60250 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -24,8 +24,6 @@
#include "protocol.h"
#include "mib.h"
-#define MPTCP_SAME_STATE TCP_MAX_STATES
-
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
struct mptcp_sock msk;
@@ -34,6 +32,8 @@ struct mptcp6_sock {
#endif
struct mptcp_skb_cb {
+ u64 map_seq;
+ u64 end_seq;
u32 offset;
};
@@ -112,64 +112,205 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
return 0;
}
-static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb,
- unsigned int offset, size_t copy_len)
+static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (MPTCP_SKB_CB(from)->offset ||
+ !skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ kfree_skb_partial(from, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return true;
+}
+
+static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
+ return false;
+
+ return mptcp_try_coalesce((struct sock *)msk, to, from);
+}
+
+/* "inspired" by tcp_data_queue_ofo(), main differences:
+ * - use mptcp seqs
+ * - don't cope with sacks
+ */
+static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
+ struct rb_node **p, *parent;
+ u64 seq, end_seq, max_seq;
+ struct sk_buff *skb1;
+ int space;
+
+ seq = MPTCP_SKB_CB(skb)->map_seq;
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ space = tcp_space(sk);
+ max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;
+
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ if (after64(seq, max_seq)) {
+ /* out of window */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+ return;
+ }
- __skb_unlink(skb, &ssk->sk_receive_queue);
+ p = &msk->out_of_order_queue.rb_node;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
+ if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+ msk->ooo_last_skb = skb;
+ goto end;
+ }
- skb_ext_reset(skb);
- skb_orphan(skb);
- msk->ack_seq += copy_len;
+ /* with 2 subflows, adding at end of ooo queue is quite likely
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ return;
+ }
- tail = skb_peek_tail(&sk->sk_receive_queue);
- if (offset == 0 && tail) {
- bool fragstolen;
- int delta;
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ parent = &msk->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
- if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
- kfree_skb_partial(skb, fragstolen);
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ return;
+ }
+ if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ /* partial overlap:
+ * | skb |
+ * | skb1 |
+ * continue traversing
+ */
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ goto merge_right;
+ }
+ } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
return;
}
+ p = &parent->rb_right;
}
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- MPTCP_SKB_CB(skb)->offset = offset;
-}
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
-static void mptcp_stop_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ break;
+ rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ }
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!skb1)
+ msk->ooo_last_skb = skb;
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
- mptcp_sk(sk)->timer_ival = 0;
+end:
+ skb_condense(skb);
+ skb_set_owner_r(skb, sk);
}
-/* both sockets must be locked */
-static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
- struct sock *ssk)
+static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb, unsigned int offset,
+ size_t copy_len)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
- /* revalidate data sequence number.
- *
- * mptcp_subflow_data_available() is usually called
- * without msk lock. Its unlikely (but possible)
- * that msk->ack_seq has been advanced since the last
- * call found in-sequence data.
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
+ * value
*/
- if (likely(dsn == msk->ack_seq))
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+
+ if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ /* in sequence */
+ msk->ack_seq += copy_len;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail && mptcp_try_coalesce(sk, tail, skb))
+ return true;
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
+ } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
+ mptcp_data_queue_ofo(msk, skb);
+ return false;
+ }
- subflow->data_avail = 0;
- return mptcp_subflow_data_available(ssk);
+ /* old data, keep it simple and drop the whole pkt, sender
+ * will retransmit as needed, if needed.
+ */
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ mptcp_drop(sk, skb);
+ return false;
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
}
static void mptcp_check_data_fin_ack(struct sock *sk)
@@ -315,11 +456,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
bool done = false;
- if (!mptcp_subflow_dsn_valid(msk, ssk)) {
- *bytes = 0;
- return false;
- }
-
+ pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
do {
u32 map_remaining, offset;
@@ -357,9 +494,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (tp->urg_data)
done = true;
- __mptcp_move_skb(msk, ssk, skb, offset, len);
+ if (__mptcp_move_skb(msk, ssk, skb, offset, len))
+ moved += len;
seq += len;
- moved += len;
if (WARN_ON_ONCE(map_remaining < len))
break;
@@ -378,20 +515,56 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
} while (more_data_avail);
- *bytes = moved;
-
- /* If the moves have caught up with the DATA_FIN sequence number
- * it's time to ack the DATA_FIN and change socket state, but
- * this is not a good place to change state. Let the workqueue
- * do it.
- */
- if (mptcp_pending_data_fin(sk, NULL) &&
- schedule_work(&msk->work))
- sock_hold(sk);
+ *bytes += moved;
+ if (moved)
+ tcp_cleanup_rbuf(ssk, moved);
return done;
}
+static bool mptcp_ofo_queue(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb, *tail;
+ bool moved = false;
+ struct rb_node *p;
+ u64 end_seq;
+
+ p = rb_first(&msk->out_of_order_queue);
+ pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ while (p) {
+ skb = rb_to_skb(p);
+ if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
+ break;
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &msk->out_of_order_queue);
+
+ if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
+ msk->ack_seq))) {
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ continue;
+ }
+
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
+ int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ /* skip overlapping data, if any */
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
+ delta);
+ MPTCP_SKB_CB(skb)->offset += delta;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+ msk->ack_seq = end_seq;
+ moved = true;
+ }
+ return moved;
+}
+
/* In most cases we will be able to lock the mptcp socket. If its already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
@@ -407,8 +580,19 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return false;
/* must re-check after taking the lock */
- if (!READ_ONCE(sk->sk_lock.owned))
+ if (!READ_ONCE(sk->sk_lock.owned)) {
__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_ofo_queue(msk);
+
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+ }
spin_unlock_bh(&sk->sk_lock.slock);
@@ -417,9 +601,17 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool wake;
- set_bit(MPTCP_DATA_READY, &msk->flags);
+ /* move_skbs_to_msk below can legitly clear the data_avail flag,
+ * but we will need later to properly woke the reader, cache its
+ * value
+ */
+ wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
+ if (wake)
+ set_bit(MPTCP_DATA_READY, &msk->flags);
if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
move_skbs_to_msk(msk, ssk))
@@ -440,7 +632,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
move_skbs_to_msk(msk, ssk);
}
wake:
- sk->sk_data_ready(sk);
+ if (wake)
+ sk->sk_data_ready(sk);
}
static void __mptcp_flush_join_list(struct mptcp_sock *msk)
@@ -474,7 +667,7 @@ void mptcp_data_acked(struct sock *sk)
{
mptcp_reset_timer(sk);
- if ((!sk_stream_is_writeable(sk) ||
+ if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
(inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
schedule_work(&mptcp_sk(sk)->work))
sock_hold(sk);
@@ -569,6 +762,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
put_page(dfrag->page);
}
+static bool mptcp_is_writeable(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!sk_stream_is_writeable((struct sock *)msk))
+ return false;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (sk_stream_is_writeable(subflow->tcp_sock))
+ return true;
+ }
+ return false;
+}
+
static void mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -611,8 +818,15 @@ out:
sk_mem_reclaim_partial(sk);
/* Only wake up writers if a subflow is ready */
- if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ if (mptcp_is_writeable(msk)) {
+ set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
+ smp_mb__after_atomic();
+
+ /* set SEND_SPACE before sk_stream_write_space clears
+ * NOSPACE
+ */
sk_stream_write_space(sk);
+ }
}
}
@@ -803,60 +1017,128 @@ out:
return ret;
}
-static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+static void mptcp_nospace(struct mptcp_sock *msk)
{
+ struct mptcp_subflow_context *subflow;
+
clear_bit(MPTCP_SEND_SPACE, &msk->flags);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct socket *sock = READ_ONCE(ssk->sk_socket);
+
+ /* enables ssk->write_space() callbacks */
+ if (sock)
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
}
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ /* only send if our side has not closed yet */
+ return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+ struct sock *ssk;
+ u64 ratio;
+};
+
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
+ u32 *sndbuf)
+{
+ struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
- struct sock *backup = NULL;
+ int i, nr_active = 0;
+ struct sock *ssk;
+ u64 ratio;
+ u32 pace;
- sock_owned_by_me((const struct sock *)msk);
+ sock_owned_by_me((struct sock *)msk);
+ *sndbuf = 0;
if (!mptcp_ext_cache_refill(msk))
return NULL;
+ if (__mptcp_check_fallback(msk)) {
+ if (!msk->first)
+ return NULL;
+ *sndbuf = msk->first->sk_sndbuf;
+ return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ }
+
+ /* re-use last subflow, if the burst allow that */
+ if (msk->last_snd && msk->snd_burst > 0 &&
+ sk_stream_memory_free(msk->last_snd) &&
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ }
+ return msk->last_snd;
+ }
+
+ /* pick the subflow with the lower wmem/wspace ratio */
+ for (i = 0; i < 2; ++i) {
+ send_info[i].ssk = NULL;
+ send_info[i].ratio = -1;
+ }
mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
- if (!sk_stream_memory_free(ssk)) {
- struct socket *sock = ssk->sk_socket;
+ nr_active += !subflow->backup;
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+ if (!sk_stream_memory_free(subflow->tcp_sock))
+ continue;
- if (sock)
- mptcp_nospace(msk, sock);
+ pace = READ_ONCE(ssk->sk_pacing_rate);
+ if (!pace)
+ continue;
- return NULL;
+ ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+ pace);
+ if (ratio < send_info[subflow->backup].ratio) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].ratio = ratio;
}
+ }
- if (subflow->backup) {
- if (!backup)
- backup = ssk;
+ pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+ msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+ send_info[1].ssk, send_info[1].ratio);
- continue;
- }
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[0].ssk = send_info[1].ssk;
- return ssk;
+ if (send_info[0].ssk) {
+ msk->last_snd = send_info[0].ssk;
+ msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+ sk_stream_wspace(msk->last_snd));
+ return msk->last_snd;
}
-
- return backup;
+ return NULL;
}
-static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+static void ssk_check_wmem(struct mptcp_sock *msk)
{
- struct socket *sock;
-
- if (likely(sk_stream_is_writeable(ssk)))
- return;
-
- sock = READ_ONCE(ssk->sk_socket);
- if (sock)
- mptcp_nospace(msk, sock);
+ if (unlikely(!mptcp_is_writeable(msk)))
+ mptcp_nospace(msk);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
@@ -866,6 +1148,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct page_frag *pfrag;
size_t copied = 0;
struct sock *ssk;
+ u32 sndbuf;
bool tx_ok;
long timeo;
@@ -892,7 +1175,7 @@ restart:
}
__mptcp_flush_join_list(msk);
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
while (!sk_stream_memory_free(sk) ||
!ssk ||
!mptcp_page_frag_refill(ssk, pfrag)) {
@@ -909,19 +1192,25 @@ restart:
mptcp_reset_timer(sk);
}
+ mptcp_nospace(msk);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
mptcp_clean_una(sk);
- ssk = mptcp_subflow_get_send(msk);
+ ssk = mptcp_subflow_get_send(msk, &sndbuf);
if (list_empty(&msk->conn_list)) {
ret = -ENOTCONN;
goto out;
}
}
+ /* do auto tuning */
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+ sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
@@ -938,6 +1227,10 @@ restart:
break;
}
+ /* burst can be negative, we will try move to the next subflow
+ * at selection time, if possible.
+ */
+ msk->snd_burst -= ret;
copied += ret;
tx_ok = msg_data_left(msg);
@@ -947,7 +1240,6 @@ restart:
if (!sk_stream_memory_free(ssk) ||
!mptcp_page_frag_refill(ssk, pfrag) ||
!mptcp_ext_cache_refill(msk)) {
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
tcp_push(ssk, msg->msg_flags, mss_now,
tcp_sk(ssk)->nonagle, size_goal);
mptcp_set_timeout(sk, ssk);
@@ -995,9 +1287,9 @@ restart:
mptcp_reset_timer(sk);
}
- ssk_check_wmem(msk, ssk);
release_sock(ssk);
out:
+ ssk_check_wmem(msk);
release_sock(sk);
return copied ? : ret;
}
@@ -1135,10 +1427,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
*/
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk;
+ bool slow;
ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
tcp_sk(ssk)->window_clamp = window_clamp;
+ tcp_cleanup_rbuf(ssk, 1);
+ unlock_sock_fast(ssk, slow);
}
}
}
@@ -1154,6 +1450,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool done;
+ /* avoid looping forever below on racing close */
+ if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+ return false;
+
+ __mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
@@ -1165,7 +1466,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
release_sock(ssk);
} while (!done);
- return moved > 0;
+ if (mptcp_ofo_queue(msk) || moved > 0) {
+ mptcp_check_data_fin((struct sock *)msk);
+ return true;
+ }
+ return false;
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -1261,6 +1566,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
set_bit(MPTCP_DATA_READY, &msk->flags);
}
out_err:
+ pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
+ msk, test_bit(MPTCP_DATA_READY, &msk->flags),
+ skb_queue_empty(&sk->sk_receive_queue), copied);
mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
@@ -1311,9 +1619,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (__mptcp_check_fallback(msk))
+ return msk->first;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
/* still data outstanding at TCP level? Don't retransmit. */
if (!tcp_write_queue_empty(ssk))
return NULL;
@@ -1474,6 +1788,7 @@ static int __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker);
+ msk->out_of_order_queue = RB_ROOT;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -1507,7 +1822,7 @@ static int mptcp_init_sock(struct sock *sk)
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
return 0;
}
@@ -1813,6 +2128,7 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_token_destroy(msk);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
@@ -2288,13 +2604,13 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
+ pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
if (state == TCP_LISTEN)
return mptcp_check_readable(msk);
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
- if (sk_stream_is_writeable(sk) &&
- test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
mask |= EPOLLOUT | EPOLLWRNORM;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 60b27d44c184..493bd2c13bc6 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -140,6 +140,8 @@ struct mptcp_addr_info {
sa_family_t family;
__be16 port;
u8 id;
+ u8 flags;
+ int ifindex;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -194,6 +196,8 @@ struct mptcp_sock {
u64 write_seq;
u64 ack_seq;
u64 rcv_data_fin_seq;
+ struct sock *last_snd;
+ int snd_burst;
atomic64_t snd_una;
unsigned long timer_ival;
u32 token;
@@ -204,6 +208,8 @@ struct mptcp_sock {
bool snd_data_fin_enable;
spinlock_t join_list_lock;
struct work_struct work;
+ struct sk_buff *ooo_last_skb;
+ struct rb_root out_of_order_queue;
struct list_head conn_list;
struct list_head rtx_queue;
struct list_head join_list;
@@ -268,6 +274,12 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
return (struct mptcp_subflow_request_sock *)rsk;
}
+enum mptcp_data_avail {
+ MPTCP_SUBFLOW_NODATA,
+ MPTCP_SUBFLOW_DATA_AVAIL,
+ MPTCP_SUBFLOW_OOO_DATA
+};
+
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -292,10 +304,10 @@ struct mptcp_subflow_context {
map_valid : 1,
mpc_map : 1,
backup : 1,
- data_avail : 1,
rx_eof : 1,
use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
can_ack : 1; /* only after processing the remote a key */
+ enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -350,8 +362,7 @@ bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
/* called with sk socket lock held */
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
@@ -464,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9ead43f79023..141d555b7bd2 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,6 +20,7 @@
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
+#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
@@ -434,6 +435,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
+ skb_rbtree_purge(&mptcp_sk(sk)->out_of_order_queue);
mptcp_token_destroy(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -804,16 +806,25 @@ validate_seq:
return MAPPING_OK;
}
-static int subflow_read_actor(read_descriptor_t *desc,
- struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
+ u64 limit)
{
- size_t copy_len = min(desc->count, len);
-
- desc->count -= copy_len;
-
- pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ u32 incr;
+
+ incr = limit >= skb->len ? skb->len + fin : limit;
+
+ pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
+ subflow->map_subflow_seq);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
+ tcp_sk(ssk)->copied_seq += incr;
+ if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
+ sk_eat_skb(ssk, skb);
+ if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
+ subflow->map_valid = 0;
+ if (incr)
+ tcp_cleanup_rbuf(ssk, incr);
}
static bool subflow_check_data_avail(struct sock *ssk)
@@ -825,13 +836,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (!skb_peek(&ssk->sk_receive_queue))
+ subflow->data_avail = 0;
if (subflow->data_avail)
return true;
msk = mptcp_sk(subflow->conn);
for (;;) {
- u32 map_remaining;
- size_t delta;
u64 ack_seq;
u64 old_ack;
@@ -849,6 +860,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
subflow->ssn_offset;
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
return true;
}
@@ -876,42 +888,18 @@ static bool subflow_check_data_avail(struct sock *ssk)
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
ack_seq);
- if (ack_seq == old_ack)
+ if (ack_seq == old_ack) {
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
break;
+ } else if (after64(ack_seq, old_ack)) {
+ subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
+ break;
+ }
/* only accept in-sequence mapping. Old values are spurious
- * retransmission; we can hit "future" values on active backup
- * subflow switch, we relay on retransmissions to get
- * in-sequence data.
- * Cuncurrent subflows support will require subflow data
- * reordering
+ * retransmission
*/
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- if (before64(ack_seq, old_ack))
- delta = min_t(size_t, old_ack - ack_seq, map_remaining);
- else
- delta = min_t(size_t, ack_seq - old_ack, map_remaining);
-
- /* discard mapped data */
- pr_debug("discarding %zu bytes, current map len=%d", delta,
- map_remaining);
- if (delta) {
- read_descriptor_t desc = {
- .count = delta,
- };
- int ret;
-
- ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
- if (ret < 0) {
- ssk->sk_err = -ret;
- goto fatal;
- }
- if (ret < delta)
- return false;
- if (delta == map_remaining)
- subflow->map_valid = 0;
- }
+ mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
}
return true;
@@ -922,13 +910,13 @@ fatal:
ssk->sk_error_report(ssk);
tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
+ subflow->data_avail = 0;
return false;
}
bool mptcp_subflow_data_available(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sk_buff *skb;
/* check if current mapping is still valid */
if (subflow->map_valid &&
@@ -941,15 +929,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
subflow->map_data_len);
}
- if (!subflow_check_data_avail(sk)) {
- subflow->data_avail = 0;
- return false;
- }
-
- skb = skb_peek(&sk->sk_receive_queue);
- subflow->data_avail = skb &&
- before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
- return subflow->data_avail;
+ return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
@@ -996,8 +976,10 @@ static void subflow_write_space(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- sk_stream_write_space(sk);
- if (sk_stream_is_writeable(sk)) {
+ if (!sk_stream_is_writeable(sk))
+ return;
+
+ if (sk_stream_is_writeable(parent)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
@@ -1056,8 +1038,7 @@ static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1102,7 +1083,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- ssk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = loc->ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
@@ -1114,7 +1095,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
subflow->local_id = local_id;
subflow->remote_id = remote_id;
subflow->request_join = 1;
- subflow->request_bkup = 1;
+ subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 2c1593089ede..eb0e329f9b8d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -29,7 +29,6 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
- select IP6_NF_IPTABLES
select NF_DEFRAG_IPV6
help
Add IPv6 support to IPVS.
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 678c5b14841c..8dbfd84322a8 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2508,6 +2508,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
+ } else if (!len) {
+ /* No more commands with len == 0 below */
+ ret = -EINVAL;
+ goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2584,9 +2588,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
break;
case IP_VS_SO_SET_DELDEST:
ret = ip_vs_del_dest(svc, &udest);
- break;
- default:
- ret = -EINVAL;
}
out_unlock:
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5b97d233f89b..234b7cab37c3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -859,7 +859,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
out:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return -EEXIST;
}
@@ -909,6 +908,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}
+/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
{
@@ -922,23 +922,21 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
if (nf_ct_is_dying(ct))
return NF_DROP;
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- return NF_DROP;
-
if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
nf_ct_match(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);
+ nf_conntrack_get(&ct->ct_general);
+
nf_ct_acct_merge(ct, ctinfo, loser_ct);
nf_ct_add_to_dying_list(loser_ct);
nf_conntrack_put(&loser_ct->ct_general);
nf_ct_set(skb, ct, ctinfo);
- NF_CT_STAT_INC(net, insert_failed);
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
- nf_ct_put(ct);
return NF_DROP;
}
@@ -998,6 +996,8 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
&nf_conntrack_hash[repl_idx]);
+
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
@@ -1027,10 +1027,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
*
* Failing that, the new, unconfirmed conntrack is still added to the table
* provided that the collision only occurs in the ORIGINAL direction.
- * The new entry will be added after the existing one in the hash list,
+ * The new entry will be added only in the non-clashing REPLY direction,
* so packets in the ORIGINAL direction will continue to match the existing
* entry. The new entry will also have a fixed timeout so it expires --
- * due to the collision, it will not see bidirectional traffic.
+ * due to the collision, it will only see reply traffic.
*
* Returns NF_DROP if the clash could not be resolved.
*/
@@ -1725,10 +1725,8 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl,
else
return NF_ACCEPT;
- if (ret <= 0) {
+ if (ret <= 0)
NF_CT_STAT_INC_ATOMIC(state->net, error);
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- }
return ret;
}
@@ -1802,10 +1800,8 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
- ctinfo == IP_CT_UNTRACKED) {
- NF_CT_STAT_INC_ATOMIC(state->net, ignore);
+ ctinfo == IP_CT_UNTRACKED)
return NF_ACCEPT;
- }
skb->_nfct = 0;
}
@@ -1813,7 +1809,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {
pr_debug("not prepared to track yet or error occurred\n");
- NF_CT_STAT_INC_ATOMIC(state->net, error);
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c3a4214dc958..3d0fd33be018 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2497,7 +2497,6 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
- nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
@@ -2505,7 +2504,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
- htonl(st->search_restart)))
+ htonl(st->search_restart)) ||
+ nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
+ htonl(st->clash_resolve)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index a604f43e3e6b..0ff39740797d 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -435,11 +435,11 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- 0,
+ st->clash_resolve, /* was: searched */
st->found,
0,
st->invalid,
- st->ignore,
+ 0,
0,
0,
st->insert,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4603b667973a..97fb6f776114 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -650,6 +650,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
[NFTA_TABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN }
};
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -676,6 +678,11 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
NFTA_TABLE_PAD))
goto nla_put_failure;
+ if (table->udata) {
+ if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
+ goto nla_put_failure;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -988,8 +995,9 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
int family = nfmsg->nfgen_family;
const struct nlattr *attr;
struct nft_table *table;
- u32 flags = 0;
struct nft_ctx ctx;
+ u32 flags = 0;
+ u16 udlen = 0;
int err;
lockdep_assert_held(&net->nft.commit_mutex);
@@ -1025,6 +1033,16 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
if (table->name == NULL)
goto err_strdup;
+ if (nla[NFTA_TABLE_USERDATA]) {
+ udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
+ table->udata = kzalloc(udlen, GFP_KERNEL);
+ if (table->udata == NULL)
+ goto err_table_udata;
+
+ nla_memcpy(table->udata, nla[NFTA_TABLE_USERDATA], udlen);
+ table->udlen = udlen;
+ }
+
err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;
@@ -1047,6 +1065,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
+ kfree(table->udata);
+err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
@@ -5737,6 +5757,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
[NFTA_OBJ_HANDLE] = { .type = NLA_U64},
+ [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -5884,6 +5906,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
struct nft_object *obj;
struct nft_ctx ctx;
u32 objtype;
+ u16 udlen;
int err;
if (!nla[NFTA_OBJ_TYPE] ||
@@ -5928,7 +5951,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err1;
+ goto err_init;
}
obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
@@ -5936,32 +5959,44 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
if (!obj->key.name) {
err = -ENOMEM;
- goto err2;
+ goto err_strdup;
+ }
+
+ if (nla[NFTA_OBJ_USERDATA]) {
+ udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
+ obj->udata = kzalloc(udlen, GFP_KERNEL);
+ if (obj->udata == NULL)
+ goto err_userdata;
+
+ nla_memcpy(obj->udata, nla[NFTA_OBJ_USERDATA], udlen);
+ obj->udlen = udlen;
}
err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
if (err < 0)
- goto err3;
+ goto err_trans;
err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
nft_objname_ht_params);
if (err < 0)
- goto err4;
+ goto err_obj_ht;
list_add_tail_rcu(&obj->list, &table->objects);
table->use++;
return 0;
-err4:
+err_obj_ht:
/* queued in transaction log */
INIT_LIST_HEAD(&obj->list);
return err;
-err3:
+err_trans:
kfree(obj->key.name);
-err2:
+err_userdata:
+ kfree(obj->udata);
+err_strdup:
if (obj->ops->destroy)
obj->ops->destroy(&ctx, obj);
kfree(obj);
-err1:
+err_init:
module_put(type->owner);
return err;
}
@@ -5993,6 +6028,10 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
NFTA_OBJ_PAD))
goto nla_put_failure;
+ if (obj->udata &&
+ nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 637ce3e8c575..a28aca5124ce 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -14,6 +14,25 @@ struct nft_socket {
};
};
+static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
+ struct nft_regs *regs, struct sock *sk,
+ u32 *dest)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr));
+ break;
+#endif
+ default:
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+}
+
static void nft_socket_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -59,6 +78,13 @@ static void nft_socket_eval(const struct nft_expr *expr,
return;
}
break;
+ case NFT_SOCKET_WILDCARD:
+ if (!sk_fullsock(sk)) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ nft_socket_wildcard(pkt, regs, sk, dest);
+ break;
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
@@ -97,6 +123,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
+ case NFT_SOCKET_WILDCARD:
len = sizeof(u8);
break;
case NFT_SOCKET_MARK:
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
index 713fb38541df..8928ec56c388 100644
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -276,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
return 0;
/* follow-up fragments don't contain ports, skip all fragments */
- if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ if (ip_is_fragment(ip))
return 0;
hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 249da67d50a2..1a98247ab148 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -426,7 +426,7 @@ void calipso_doi_free(struct calipso_doi *doi_def)
/**
* calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
@@ -595,7 +595,7 @@ int calipso_req_setattr(struct request_sock *req,
/**
* calipso_req_delattr - Delete the CALIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
*
* Description:
* Removes the CALIPSO option from a request socket, if present.
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f73a8382c275..dc8c39f51f7d 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -612,9 +612,8 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
if (audit_buf != NULL) {
audit_log_format(audit_buf,
- " nlbl_domain=%s res=%u",
- entry->domain ? entry->domain : "(default)",
- ret_val == 0 ? 1 : 0);
+ " nlbl_domain=%s res=1",
+ entry->domain ? entry->domain : "(default)");
audit_log_end(audit_buf);
}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d2d1448274f5..4ec4c0f72f61 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -848,7 +848,7 @@ retry:
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap in the user namespace @user_ns.
+ * message has the capability @cap in the user namespace @user_ns.
*/
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
struct user_namespace *user_ns, int cap)
@@ -867,7 +867,7 @@ EXPORT_SYMBOL(__netlink_ns_capable);
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap in the user namespace @user_ns.
+ * message has the capability @cap in the user namespace @user_ns.
*/
bool netlink_ns_capable(const struct sk_buff *skb,
struct user_namespace *user_ns, int cap)
@@ -883,7 +883,7 @@ EXPORT_SYMBOL(netlink_ns_capable);
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap in all user namespaces.
+ * message has the capability @cap in all user namespaces.
*/
bool netlink_capable(const struct sk_buff *skb, int cap)
{
@@ -898,7 +898,7 @@ EXPORT_SYMBOL(netlink_capable);
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
- * message has has the capability @cap over the network namespace of
+ * message has the capability @cap over the network namespace of
* the socket we received the message from.
*/
bool netlink_net_capable(const struct sk_buff *skb, int cap)
@@ -1853,7 +1853,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
struct scm_cookie scm;
u32 netlink_skb_flags = 0;
- if (msg->msg_flags&MSG_OOB)
+ if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
err = scm_send(sock, msg, &scm, true);
@@ -1916,7 +1916,7 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
refcount_inc(&skb->users);
netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
}
- err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
+ err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);
out:
scm_destroy(&scm);
@@ -1929,12 +1929,12 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
- int noblock = flags&MSG_DONTWAIT;
+ int noblock = flags & MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
- if (flags&MSG_OOB)
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 1eb65a7a27fd..3a718e327515 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -222,7 +222,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
family->mcgrp_offset = first_id;
- /* if still initializing, can't and don't need to to realloc bitmaps */
+ /* if still initializing, can't and don't need to realloc bitmaps */
if (!init_net.genl_sock)
return 0;
diff --git a/net/netlink/policy.c b/net/netlink/policy.c
index 641ffbdd977a..62f977fa645a 100644
--- a/net/netlink/policy.c
+++ b/net/netlink/policy.c
@@ -254,12 +254,6 @@ send_attribute:
pt->bitfield32_valid))
goto nla_put_failure;
break;
- case NLA_EXACT_LEN:
- type = NL_ATTR_TYPE_BINARY;
- if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) ||
- nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len))
- goto nla_put_failure;
- break;
case NLA_STRING:
case NLA_NUL_STRING:
case NLA_BINARY:
@@ -269,14 +263,27 @@ send_attribute:
type = NL_ATTR_TYPE_NUL_STRING;
else
type = NL_ATTR_TYPE_BINARY;
- if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH,
- pt->len))
- goto nla_put_failure;
- break;
- case NLA_MIN_LEN:
- type = NL_ATTR_TYPE_BINARY;
- if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len))
+
+ if (pt->validation_type == NLA_VALIDATE_RANGE ||
+ pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG) {
+ struct netlink_range_validation range;
+
+ nla_get_range_unsigned(pt, &range);
+
+ if (range.min &&
+ nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH,
+ range.min))
+ goto nla_put_failure;
+
+ if (range.max < U16_MAX &&
+ nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH,
+ range.max))
+ goto nla_put_failure;
+ } else if (pt->len &&
+ nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH,
+ pt->len)) {
goto nla_put_failure;
+ }
break;
case NLA_FLAG:
type = NL_ATTR_TYPE_FLAG;
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index 304b1a9bb18a..5971fb6f51cc 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -38,9 +38,6 @@
#define DIGITAL_GB_BIT 0x02
-#define DIGITAL_NFC_DEP_REQ_RES_HEADROOM 2 /* SoD: [SB (NFC-A)] + LEN */
-#define DIGITAL_NFC_DEP_REQ_RES_TAILROOM 2 /* EoD: 2-byte CRC */
-
#define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0)
#define DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT 0x10
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2611657f40ca..855f2c155956 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -9,7 +9,6 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
-#include <linux/netfilter_ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -742,7 +741,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
-static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ovs_vport_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
struct vport *vport = data->vport;
@@ -848,13 +848,9 @@ static void ovs_fragment(struct net *net, struct vport *vport,
ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else if (key->eth.type == htons(ETH_P_IPV6)) {
- const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
- if (!v6ops)
- goto err;
-
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
@@ -866,7 +862,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
skb_dst_set_noref(skb, &ovs_rt.dst);
IP6CB(skb)->frag_max_size = mru;
- v6ops->fragment(net, skb->sk, skb, ovs_vport_output);
+ ipv6_stub->ipv6_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
@@ -925,7 +921,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
upcall.mru = OVS_CB(skb)->mru;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
- a = nla_next(a, &rem)) {
+ a = nla_next(a, &rem)) {
switch (nla_type(a)) {
case OVS_USERSPACE_ATTR_USERDATA:
upcall.userdata = a;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index a3f1204f1ed2..e86b9601f5b1 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1901,8 +1901,8 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
lockdep_ovsl_is_held())
kfree_rcu(ct_limit, rcu);
}
- kfree(ovs_net->ct_limit_info->limits);
- kfree(ovs_net->ct_limit_info);
+ kfree(info->limits);
+ kfree(info);
}
static struct sk_buff *
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6e47ef7ef036..00df39b736ed 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -182,7 +182,7 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
head = vport_hash_bucket(dp, port_no);
hlist_for_each_entry_rcu(vport, head, dp_hash_node,
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (vport->port_no == port_no)
return vport;
}
@@ -254,7 +254,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
error = ovs_execute_actions(dp, skb, sf_acts, key);
if (unlikely(error))
net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n",
- ovs_dp_name(dp), error);
+ ovs_dp_name(dp), error);
stats_counter = &stats->n_hit;
@@ -302,7 +302,7 @@ err:
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info,
- uint32_t cutlen)
+ uint32_t cutlen)
{
unsigned int gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
@@ -1080,11 +1080,12 @@ error:
}
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
-static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net,
- const struct nlattr *a,
- const struct sw_flow_key *key,
- const struct sw_flow_mask *mask,
- bool log)
+static noinline_for_stack
+struct sw_flow_actions *get_flow_actions(struct net *net,
+ const struct nlattr *a,
+ const struct sw_flow_key *key,
+ const struct sw_flow_mask *mask,
+ bool log)
{
struct sw_flow_actions *acts;
struct sw_flow_key masked_key;
@@ -1383,7 +1384,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
ovs_notify(&dp_flow_genl_family, reply, info);
} else {
- netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply));
+ netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0,
+ PTR_ERR(reply));
}
}
@@ -1513,7 +1515,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
int err;
ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
- flags, cmd);
+ flags, cmd);
if (!ovs_header)
goto error;
@@ -1572,11 +1574,13 @@ static struct datapath *lookup_datapath(struct net *net,
return dp ? dp : ERR_PTR(-ENODEV);
}
-static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
+static void ovs_dp_reset_user_features(struct sk_buff *skb,
+ struct genl_info *info)
{
struct datapath *dp;
- dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr,
+ info->attrs);
if (IS_ERR(dp))
return;
@@ -2075,7 +2079,7 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
dev = vport->dev;
dev_headroom = netdev_get_fwd_headroom(dev);
if (dev_headroom > max_headroom)
@@ -2093,10 +2097,11 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
int i;
dp->max_headroom = new_headroom;
- for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
- lockdep_ovsl_is_held())
+ lockdep_ovsl_is_held())
netdev_set_rx_headroom(vport->dev, new_headroom);
+ }
}
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
@@ -2476,13 +2481,19 @@ error:
static int __net_init ovs_init_net(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ int err;
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance);
+
+ err = ovs_ct_init(net);
+ if (err)
+ return err;
+
schedule_delayed_work(&ovs_net->masks_rebalance,
msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL));
- return ovs_ct_init(net);
+ return 0;
}
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2551,7 +2562,8 @@ static int __init dp_init(void)
{
int err;
- BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) >
+ sizeof_field(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath\n");
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e2235849a57e..87c286ad660e 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -111,12 +111,16 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
- ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
+ flow->sf_acts);
/* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask))
+ for (cpu = 0; cpu < nr_cpu_ids;
+ cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
(struct sw_flow_stats __force *)flow->stats[cpu]);
+ }
+
kmem_cache_free(flow_cache, flow);
}
@@ -164,7 +168,6 @@ static struct table_instance *table_instance_alloc(int new_size)
ti->n_buckets = new_size;
ti->node_ver = 0;
- ti->keep_flows = false;
get_random_bytes(&ti->hash_seed, sizeof(u32));
return ti;
@@ -192,7 +195,7 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
* zero based counter we store the value at reset, and subtract it
* later when processing.
*/
- for (i = 0; i < ma->max; i++) {
+ for (i = 0; i < ma->max; i++) {
ma->masks_usage_zero_cntr[i] = 0;
for_each_possible_cpu(cpu) {
@@ -273,7 +276,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
if (ma_count >= ma->max) {
err = tbl_mask_array_realloc(tbl, ma->max +
- MASK_ARRAY_SIZE_MIN);
+ MASK_ARRAY_SIZE_MIN);
if (err)
return err;
@@ -288,7 +291,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
BUG_ON(ovsl_dereference(ma->masks[ma_count]));
rcu_assign_pointer(ma->masks[ma_count], new);
- WRITE_ONCE(ma->count, ma_count +1);
+ WRITE_ONCE(ma->count, ma_count + 1);
return 0;
}
@@ -309,10 +312,10 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl,
return;
found:
- WRITE_ONCE(ma->count, ma_count -1);
+ WRITE_ONCE(ma->count, ma_count - 1);
- rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]);
- RCU_INIT_POINTER(ma->masks[ma_count -1], NULL);
+ rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
+ RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);
kfree_rcu(mask, rcu);
@@ -448,26 +451,23 @@ free_mask_cache:
static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
- struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+ struct table_instance *ti;
+ ti = container_of(rcu, struct table_instance, rcu);
__table_instance_destroy(ti);
}
static void table_instance_flow_free(struct flow_table *table,
- struct table_instance *ti,
- struct table_instance *ufid_ti,
- struct sw_flow *flow,
- bool count)
+ struct table_instance *ti,
+ struct table_instance *ufid_ti,
+ struct sw_flow *flow)
{
hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
- if (count)
- table->count--;
+ table->count--;
if (ovs_identifier_is_ufid(&flow->id)) {
hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
-
- if (count)
- table->ufid_count--;
+ table->ufid_count--;
}
flow_mask_remove(table, flow->mask);
@@ -480,22 +480,25 @@ void table_instance_flow_flush(struct flow_table *table,
{
int i;
- if (ti->keep_flows)
- return;
-
for (i = 0; i < ti->n_buckets; i++) {
- struct sw_flow *flow;
struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
+ struct sw_flow *flow;
hlist_for_each_entry_safe(flow, n, head,
flow_table.node[ti->node_ver]) {
table_instance_flow_free(table, ti, ufid_ti,
- flow, false);
+ flow);
ovs_flow_free(flow, true);
}
}
+
+ if (WARN_ON(table->count != 0 ||
+ table->ufid_count != 0)) {
+ table->count = 0;
+ table->ufid_count = 0;
+ }
}
static void table_instance_destroy(struct table_instance *ti,
@@ -596,8 +599,6 @@ static void flow_table_copy_flows(struct table_instance *old,
lockdep_ovsl_is_held())
table_instance_insert(new, flow);
}
-
- old->keep_flows = true;
}
static struct table_instance *table_instance_rehash(struct table_instance *ti,
@@ -632,8 +633,6 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
rcu_assign_pointer(flow_table->ti, new_ti);
rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
flow_table->last_rehash = jiffies;
- flow_table->count = 0;
- flow_table->ufid_count = 0;
table_instance_flow_flush(flow_table, old_ti, old_ufid_ti);
table_instance_destroy(old_ti, old_ufid_ti);
@@ -661,7 +660,7 @@ static int flow_key_start(const struct sw_flow_key *key)
return 0;
else
return rounddown(offsetof(struct sw_flow_key, phy),
- sizeof(long));
+ sizeof(long));
}
static bool cmp_key(const struct sw_flow_key *key1,
@@ -673,7 +672,7 @@ static bool cmp_key(const struct sw_flow_key *key1,
long diffs = 0;
int i;
- for (i = key_start; i < key_end; i += sizeof(long))
+ for (i = key_start; i < key_end; i += sizeof(long))
diffs |= *cp1++ ^ *cp2++;
return diffs == 0;
@@ -713,7 +712,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
(*n_mask_hit)++;
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
@@ -897,7 +896,8 @@ static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
}
-bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
+bool ovs_flow_cmp(const struct sw_flow *flow,
+ const struct sw_flow_match *match)
{
if (ovs_identifier_is_ufid(&flow->id))
return flow_cmp_masked_key(flow, match->key, &match->range);
@@ -916,7 +916,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
hash = ufid_hash(ufid);
head = find_bucket(ti, hash);
hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
- lockdep_ovsl_is_held()) {
+ lockdep_ovsl_is_held()) {
if (flow->ufid_table.hash == hash &&
ovs_flow_cmp_ufid(flow, ufid))
return flow;
@@ -950,7 +950,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
BUG_ON(table->count == 0);
- table_instance_flow_free(table, ti, ufid_ti, flow, true);
+ table_instance_flow_free(table, ti, ufid_ti, flow);
}
static struct sw_flow_mask *mask_alloc(void)
@@ -1107,7 +1107,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
if (!masks_and_count)
return;
- for (i = 0; i < ma->max; i++) {
+ for (i = 0; i < ma->max; i++) {
struct sw_flow_mask *mask;
unsigned int start;
int cpu;
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 6e7d4ac59353..d8fb7a3a3dfd 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -53,7 +53,6 @@ struct table_instance {
struct rcu_head rcu;
int node_ver;
u32 hash_seed;
- bool keep_flows;
};
struct flow_table {
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 0d44c5c013fa..82d801f063b7 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -98,7 +98,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node,
- lockdep_ovsl_is_held())
+ lockdep_ovsl_is_held())
if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -118,7 +118,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
* vport_free().
*/
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
- const struct vport_parms *parms)
+ const struct vport_parms *parms)
{
struct vport *vport;
size_t alloc_size;
@@ -397,7 +397,8 @@ int ovs_vport_get_upcall_portids(const struct vport *vport,
*
* Returns the portid of the target socket. Must be called with rcu_read_lock.
*/
-u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
+u32 ovs_vport_find_upcall_portid(const struct vport *vport,
+ struct sk_buff *skb)
{
struct vport_portids *ids;
u32 ids_index;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2b33e977a905..cefbd50c1090 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -93,52 +93,56 @@
/*
Assumptions:
- - if device has no dev->hard_header routine, it adds and removes ll header
- inside itself. In this case ll header is invisible outside of device,
- but higher levels still should reserve dev->hard_header_len.
- Some devices are enough clever to reallocate skb, when header
- will not fit to reserved space (tunnel), another ones are silly
- (PPP).
+ - If the device has no dev->header_ops, there is no LL header visible
+ above the device. In this case, its hard_header_len should be 0.
+ The device may prepend its own header internally. In this case, its
+ needed_headroom should be set to the space needed for it to add its
+ internal header.
+ For example, a WiFi driver pretending to be an Ethernet driver should
+ set its hard_header_len to be the Ethernet header length, and set its
+ needed_headroom to be (the real WiFi header length - the fake Ethernet
+ header length).
- packet socket receives packets with pulled ll header,
so that SOCK_RAW should push it back.
On receive:
-----------
-Incoming, dev->hard_header!=NULL
+Incoming, dev->header_ops != NULL
mac_header -> ll header
data -> data
-Outgoing, dev->hard_header!=NULL
+Outgoing, dev->header_ops != NULL
mac_header -> ll header
data -> ll header
-Incoming, dev->hard_header==NULL
- mac_header -> UNKNOWN position. It is very likely, that it points to ll
- header. PPP makes it, that is wrong, because introduce
- assymetry between rx and tx paths.
+Incoming, dev->header_ops == NULL
+ mac_header -> data
+ However drivers often make it point to the ll header.
+ This is incorrect because the ll header should be invisible to us.
data -> data
-Outgoing, dev->hard_header==NULL
- mac_header -> data. ll header is still not built!
+Outgoing, dev->header_ops == NULL
+ mac_header -> data. ll header is invisible to us.
data -> data
Resume
- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+ If dev->header_ops == NULL we are unable to restore the ll header,
+ because it is invisible to us.
On transmit:
------------
-dev->hard_header != NULL
+dev->header_ops != NULL
mac_header -> ll header
data -> ll header
-dev->hard_header == NULL (ll header is added by device, we cannot control it)
+dev->header_ops == NULL (ll header is invisible to us)
mac_header -> data
data -> data
- We should set nh.raw on output to correct posistion,
+ We should set network_header on output to the correct position,
packet classifier depends on it.
*/
@@ -177,7 +181,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
-#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/rds/cong.c b/net/rds/cong.c
index ccdff09a79c8..8b689ebbd5b5 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -236,7 +236,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* tcp_setsockopt and/or tcp_sendmsg will deadlock
* when it tries to get the sock_lock())
* 2. Interrupts are masked so that we can mark the
- * the port congested from both send and recv paths.
+ * port congested from both send and recv paths.
* (See comment around declaration of rdc_cong_lock).
* An attempt to get the sock_lock() here will
* therefore trigger warnings.
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index c3319ff3ee11..06603dd1c8aa 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -711,7 +711,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
- * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
* We really should have changed this for OFED 1.3 :-(
*/
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index ccdd304eae0a..1d0afb1dd77b 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -269,7 +269,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
} else {
nents = ret;
- sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
if (!sg) {
ret = -ENOMEM;
goto out;
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 186c8a889b16..0a2f4817ec6c 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -308,9 +308,10 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
key = NULL; /* a no-security key */
memset(&p, 0, sizeof(p));
- p.user_call_ID = user_call_ID;
- p.tx_total_len = tx_total_len;
- p.interruptibility = interruptibility;
+ p.user_call_ID = user_call_ID;
+ p.tx_total_len = tx_total_len;
+ p.interruptibility = interruptibility;
+ p.kernel = true;
memset(&cp, 0, sizeof(cp));
cp.local = rx->local;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 884cff7bb169..19f714386654 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -76,14 +76,12 @@ struct rxrpc_net {
struct work_struct service_conn_reaper;
struct timer_list service_conn_reap_timer;
- unsigned int nr_client_conns;
- unsigned int nr_active_client_conns;
- bool kill_all_client_conns;
bool live;
+
+ bool kill_all_client_conns;
+ atomic_t nr_client_conns;
spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */
spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */
- struct list_head waiting_client_conns;
- struct list_head active_client_conns;
struct list_head idle_client_conns;
struct work_struct client_conn_reaper;
struct timer_list client_conn_reap_timer;
@@ -275,8 +273,8 @@ struct rxrpc_local {
struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
struct sk_buff_head reject_queue; /* packets awaiting rejection */
struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
- struct rb_root client_conns; /* Client connections by socket params */
- spinlock_t client_conns_lock; /* Lock for client_conns */
+ struct rb_root client_bundles; /* Client connection bundles by socket params */
+ spinlock_t client_bundles_lock; /* Lock for client_bundles */
spinlock_t lock; /* access lock */
rwlock_t services_lock; /* lock for services list */
int debug_id; /* debug ID for printks */
@@ -353,10 +351,7 @@ struct rxrpc_conn_parameters {
enum rxrpc_conn_flag {
RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */
RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */
- RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */
- RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */
RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */
- RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */
RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */
RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */
RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */
@@ -377,19 +372,6 @@ enum rxrpc_conn_event {
};
/*
- * The connection cache state.
- */
-enum rxrpc_conn_cache_state {
- RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */
- RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */
- RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */
- RXRPC_CONN_CLIENT_UPGRADE, /* Conn is on active list, probing for upgrade */
- RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */
- RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */
- RXRPC_CONN__NR_CACHE_STATES
-};
-
-/*
* The connection protocol state.
*/
enum rxrpc_conn_proto_state {
@@ -405,6 +387,23 @@ enum rxrpc_conn_proto_state {
};
/*
+ * RxRPC client connection bundle.
+ */
+struct rxrpc_bundle {
+ struct rxrpc_conn_parameters params;
+ atomic_t usage;
+ unsigned int debug_id;
+ bool try_upgrade; /* True if the bundle is attempting upgrade */
+ bool alloc_conn; /* True if someone's getting a conn */
+ short alloc_error; /* Error from last conn allocation */
+ spinlock_t channel_lock;
+ struct rb_node local_node; /* Node in local->client_conns */
+ struct list_head waiting_calls; /* Calls waiting for channels */
+ unsigned long avail_chans; /* Mask of available channels */
+ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */
+};
+
+/*
* RxRPC connection definition
* - matched by { local, peer, epoch, conn_id, direction }
* - each connection can only handle four simultaneous calls
@@ -417,10 +416,7 @@ struct rxrpc_connection {
struct rcu_head rcu;
struct list_head cache_link;
- spinlock_t channel_lock;
- unsigned char active_chans; /* Mask of active channels */
-#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1)
- struct list_head waiting_calls; /* Calls waiting for channels */
+ unsigned char act_chans; /* Mask of active channels */
struct rxrpc_channel {
unsigned long final_ack_at; /* Time at which to issue final ACK */
struct rxrpc_call __rcu *call; /* Active call */
@@ -437,10 +433,8 @@ struct rxrpc_connection {
struct timer_list timer; /* Conn event timer */
struct work_struct processor; /* connection event processor */
- union {
- struct rb_node client_node; /* Node in local->client_conns */
- struct rb_node service_node; /* Node in peer->service_conns */
- };
+ struct rxrpc_bundle *bundle; /* Client connection bundle */
+ struct rb_node service_node; /* Node in peer->service_conns */
struct list_head proc_link; /* link in procfs list */
struct list_head link; /* link in master connection list */
struct sk_buff_head rx_queue; /* received conn-level packets */
@@ -452,7 +446,6 @@ struct rxrpc_connection {
unsigned long events;
unsigned long idle_timestamp; /* Time at which last became idle */
spinlock_t state_lock; /* state-change lock */
- enum rxrpc_conn_cache_state cache_state;
enum rxrpc_conn_proto_state state; /* current state of connection */
u32 abort_code; /* Abort code of connection abort */
int debug_id; /* debug ID for printks */
@@ -464,6 +457,7 @@ struct rxrpc_connection {
u8 security_size; /* security header size */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
+ u8 bundle_shift; /* Index into bundle->avail_chans */
short error; /* Local error code */
};
@@ -493,6 +487,8 @@ enum rxrpc_call_flag {
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
+ RXRPC_CALL_KERNEL, /* The call was made by the kernel */
+ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */
};
/*
@@ -577,7 +573,7 @@ struct rxrpc_call {
struct work_struct processor; /* Event processor */
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
struct list_head link; /* link in master call list */
- struct list_head chan_wait_link; /* Link in conn->waiting_calls */
+ struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */
struct hlist_node error_link; /* link in error distribution list */
struct list_head accept_link; /* Link in rx->acceptq */
struct list_head recvmsg_link; /* Link in rx->recvmsg_q */
@@ -727,6 +723,7 @@ struct rxrpc_call_params {
u32 normal; /* Max time since last call packet (msec) */
} timeouts;
u8 nr_timeouts; /* Number of timeouts specified */
+ bool kernel; /* T if kernel is making the call */
enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? */
};
@@ -815,18 +812,19 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call)
/*
* conn_client.c
*/
-extern unsigned int rxrpc_max_client_connections;
extern unsigned int rxrpc_reap_client_connections;
extern unsigned long rxrpc_conn_idle_client_expiry;
extern unsigned long rxrpc_conn_idle_client_fast_expiry;
extern struct idr rxrpc_client_conn_ids;
void rxrpc_destroy_client_conn_ids(void);
+struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *);
+void rxrpc_put_bundle(struct rxrpc_bundle *);
int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *,
struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *,
gfp_t);
void rxrpc_expose_client_call(struct rxrpc_call *);
-void rxrpc_disconnect_client_call(struct rxrpc_call *);
+void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *);
void rxrpc_put_client_conn(struct rxrpc_connection *);
void rxrpc_discard_expired_client_conns(struct work_struct *);
void rxrpc_destroy_all_client_connections(struct rxrpc_net *);
@@ -852,7 +850,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *);
void rxrpc_kill_connection(struct rxrpc_connection *);
bool rxrpc_queue_conn(struct rxrpc_connection *);
void rxrpc_see_connection(struct rxrpc_connection *);
-void rxrpc_get_connection(struct rxrpc_connection *);
+struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *);
struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *);
void rxrpc_put_service_conn(struct rxrpc_connection *);
void rxrpc_service_connection_reaper(struct work_struct *);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index a40fae013942..c8015c76a81c 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -41,6 +41,11 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
struct kmem_cache *rxrpc_call_jar;
+static struct semaphore rxrpc_call_limiter =
+ __SEMAPHORE_INITIALIZER(rxrpc_call_limiter, 1000);
+static struct semaphore rxrpc_kernel_call_limiter =
+ __SEMAPHORE_INITIALIZER(rxrpc_kernel_call_limiter, 1000);
+
static void rxrpc_call_timer_expired(struct timer_list *t)
{
struct rxrpc_call *call = from_timer(call, t, timer);
@@ -210,6 +215,34 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
}
/*
+ * Wait for a call slot to become available.
+ */
+static struct semaphore *rxrpc_get_call_slot(struct rxrpc_call_params *p, gfp_t gfp)
+{
+ struct semaphore *limiter = &rxrpc_call_limiter;
+
+ if (p->kernel)
+ limiter = &rxrpc_kernel_call_limiter;
+ if (p->interruptibility == RXRPC_UNINTERRUPTIBLE) {
+ down(limiter);
+ return limiter;
+ }
+ return down_interruptible(limiter) < 0 ? NULL : limiter;
+}
+
+/*
+ * Release a call slot.
+ */
+static void rxrpc_put_call_slot(struct rxrpc_call *call)
+{
+ struct semaphore *limiter = &rxrpc_call_limiter;
+
+ if (test_bit(RXRPC_CALL_KERNEL, &call->flags))
+ limiter = &rxrpc_kernel_call_limiter;
+ up(limiter);
+}
+
+/*
* Set up a call for the given parameters.
* - Called with the socket lock held, which it must release.
* - If it returns a call, the call's lock will need releasing by the caller.
@@ -225,15 +258,21 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
{
struct rxrpc_call *call, *xcall;
struct rxrpc_net *rxnet;
+ struct semaphore *limiter;
struct rb_node *parent, **pp;
const void *here = __builtin_return_address(0);
int ret;
_enter("%p,%lx", rx, p->user_call_ID);
+ limiter = rxrpc_get_call_slot(p, gfp);
+ if (!limiter)
+ return ERR_PTR(-ERESTARTSYS);
+
call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id);
if (IS_ERR(call)) {
release_sock(&rx->sk);
+ up(limiter);
_leave(" = %ld", PTR_ERR(call));
return call;
}
@@ -243,6 +282,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
trace_rxrpc_call(call->debug_id, rxrpc_call_new_client,
atomic_read(&call->usage),
here, (const void *)p->user_call_ID);
+ if (p->kernel)
+ __set_bit(RXRPC_CALL_KERNEL, &call->flags);
/* We need to protect a partially set up call against the user as we
* will be acting outside the socket lock.
@@ -471,6 +512,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
BUG();
spin_unlock_bh(&call->lock);
+ rxrpc_put_call_slot(call);
+
del_timer_sync(&call->timer);
/* Make sure we don't get any more notifications */
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 159e3eda7914..78c845a4f1ad 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -1,63 +1,15 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Client connection-specific management code.
*
- * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2016, 2020 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* Client connections need to be cached for a little while after they've made a
* call so as to handle retransmitted DATA packets in case the server didn't
* receive the final ACK or terminating ABORT we sent it.
*
- * Client connections can be in one of a number of cache states:
- *
- * (1) INACTIVE - The connection is not held in any list and may not have been
- * exposed to the world. If it has been previously exposed, it was
- * discarded from the idle list after expiring.
- *
- * (2) WAITING - The connection is waiting for the number of client conns to
- * drop below the maximum capacity. Calls may be in progress upon it from
- * when it was active and got culled.
- *
- * The connection is on the rxrpc_waiting_client_conns list which is kept
- * in to-be-granted order. Culled conns with waiters go to the back of
- * the queue just like new conns.
- *
- * (3) ACTIVE - The connection has at least one call in progress upon it, it
- * may freely grant available channels to new calls and calls may be
- * waiting on it for channels to become available.
- *
- * The connection is on the rxnet->active_client_conns list which is kept
- * in activation order for culling purposes.
- *
- * rxrpc_nr_active_client_conns is held incremented also.
- *
- * (4) UPGRADE - As for ACTIVE, but only one call may be in progress and is
- * being used to probe for service upgrade.
- *
- * (5) CULLED - The connection got summarily culled to try and free up
- * capacity. Calls currently in progress on the connection are allowed to
- * continue, but new calls will have to wait. There can be no waiters in
- * this state - the conn would have to go to the WAITING state instead.
- *
- * (6) IDLE - The connection has no calls in progress upon it and must have
- * been exposed to the world (ie. the EXPOSED flag must be set). When it
- * expires, the EXPOSED flag is cleared and the connection transitions to
- * the INACTIVE state.
- *
- * The connection is on the rxnet->idle_client_conns list which is kept in
- * order of how soon they'll expire.
- *
* There are flags of relevance to the cache:
*
- * (1) EXPOSED - The connection ID got exposed to the world. If this flag is
- * set, an extra ref is added to the connection preventing it from being
- * reaped when it has no calls outstanding. This flag is cleared and the
- * ref dropped when a conn is discarded from the idle list.
- *
- * This allows us to move terminal call state retransmission to the
- * connection and to discard the call immediately we think it is done
- * with. It also give us a chance to reuse the connection.
- *
* (2) DONT_REUSE - The connection should be discarded as soon as possible and
* should not be reused. This is set when an exclusive connection is used
* or a call ID counter overflows.
@@ -78,7 +30,6 @@
#include "ar-internal.h"
-__read_mostly unsigned int rxrpc_max_client_connections = 1000;
__read_mostly unsigned int rxrpc_reap_client_connections = 900;
__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
@@ -89,8 +40,6 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
DEFINE_IDR(rxrpc_client_conn_ids);
static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
-static void rxrpc_cull_active_client_conns(struct rxrpc_net *);
-
/*
* Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The
@@ -162,13 +111,50 @@ void rxrpc_destroy_client_conn_ids(void)
}
/*
+ * Allocate a connection bundle.
+ */
+static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp,
+ gfp_t gfp)
+{
+ struct rxrpc_bundle *bundle;
+
+ bundle = kzalloc(sizeof(*bundle), gfp);
+ if (bundle) {
+ bundle->params = *cp;
+ rxrpc_get_peer(bundle->params.peer);
+ atomic_set(&bundle->usage, 1);
+ spin_lock_init(&bundle->channel_lock);
+ INIT_LIST_HEAD(&bundle->waiting_calls);
+ }
+ return bundle;
+}
+
+struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle)
+{
+ atomic_inc(&bundle->usage);
+ return bundle;
+}
+
+void rxrpc_put_bundle(struct rxrpc_bundle *bundle)
+{
+ unsigned int d = bundle->debug_id;
+ unsigned int u = atomic_dec_return(&bundle->usage);
+
+ _debug("PUT B=%x %u", d, u);
+ if (u == 0) {
+ rxrpc_put_peer(bundle->params.peer);
+ kfree(bundle);
+ }
+}
+
+/*
* Allocate a client connection.
*/
static struct rxrpc_connection *
-rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
+rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp)
{
struct rxrpc_connection *conn;
- struct rxrpc_net *rxnet = cp->local->rxnet;
+ struct rxrpc_net *rxnet = bundle->params.local->rxnet;
int ret;
_enter("");
@@ -180,15 +166,11 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
}
atomic_set(&conn->usage, 1);
- if (cp->exclusive)
- __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
- if (cp->upgrade)
- __set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags);
-
- conn->params = *cp;
+ conn->bundle = bundle;
+ conn->params = bundle->params;
conn->out_clientflag = RXRPC_CLIENT_INITIATED;
conn->state = RXRPC_CONN_CLIENT;
- conn->service_id = cp->service_id;
+ conn->service_id = conn->params.service_id;
ret = rxrpc_get_client_connection_id(conn, gfp);
if (ret < 0)
@@ -207,14 +189,16 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
write_unlock(&rxnet->conn_lock);
- /* We steal the caller's peer ref. */
- cp->peer = NULL;
+ rxrpc_get_bundle(bundle);
+ rxrpc_get_peer(conn->params.peer);
rxrpc_get_local(conn->params.local);
key_get(conn->params.key);
trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client,
atomic_read(&conn->usage),
__builtin_return_address(0));
+
+ atomic_inc(&rxnet->nr_client_conns);
trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
_leave(" = %p", conn);
return conn;
@@ -234,13 +218,18 @@ error_0:
*/
static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_net *rxnet = conn->params.local->rxnet;
+ struct rxrpc_net *rxnet;
int id_cursor, id, distance, limit;
+ if (!conn)
+ goto dont_reuse;
+
+ rxnet = conn->params.local->rxnet;
if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags))
goto dont_reuse;
- if (conn->proto.epoch != rxnet->epoch)
+ if (conn->state != RXRPC_CONN_CLIENT ||
+ conn->proto.epoch != rxnet->epoch)
goto mark_dont_reuse;
/* The IDR tree gets very expensive on memory if the connection IDs are
@@ -254,7 +243,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
distance = id - id_cursor;
if (distance < 0)
distance = -distance;
- limit = max(rxrpc_max_client_connections * 4, 1024U);
+ limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024);
if (distance > limit)
goto mark_dont_reuse;
@@ -267,277 +256,247 @@ dont_reuse:
}
/*
- * Create or find a client connection to use for a call.
- *
- * If we return with a connection, the call will be on its waiting list. It's
- * left to the caller to assign a channel and wake up the call.
+ * Look up the conn bundle that matches the connection parameters, adding it if
+ * it doesn't yet exist.
*/
-static int rxrpc_get_client_conn(struct rxrpc_sock *rx,
- struct rxrpc_call *call,
- struct rxrpc_conn_parameters *cp,
- struct sockaddr_rxrpc *srx,
- gfp_t gfp)
+static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *cp,
+ gfp_t gfp)
{
- struct rxrpc_connection *conn, *candidate = NULL;
+ static atomic_t rxrpc_bundle_id;
+ struct rxrpc_bundle *bundle, *candidate;
struct rxrpc_local *local = cp->local;
struct rb_node *p, **pp, *parent;
long diff;
- int ret = -ENOMEM;
- _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+ _enter("{%px,%x,%u,%u}",
+ cp->peer, key_serial(cp->key), cp->security_level, cp->upgrade);
- cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp);
- if (!cp->peer)
- goto error;
+ if (cp->exclusive)
+ return rxrpc_alloc_bundle(cp, gfp);
- call->cong_cwnd = cp->peer->cong_cwnd;
- if (call->cong_cwnd >= call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- else
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ /* First, see if the bundle is already there. */
+ _debug("search 1");
+ spin_lock(&local->client_bundles_lock);
+ p = local->client_bundles.rb_node;
+ while (p) {
+ bundle = rb_entry(p, struct rxrpc_bundle, local_node);
- /* If the connection is not meant to be exclusive, search the available
- * connections to see if the connection we want to use already exists.
- */
- if (!cp->exclusive) {
- _debug("search 1");
- spin_lock(&local->client_conns_lock);
- p = local->client_conns.rb_node;
- while (p) {
- conn = rb_entry(p, struct rxrpc_connection, client_node);
-
-#define cmp(X) ((long)conn->params.X - (long)cp->X)
- diff = (cmp(peer) ?:
- cmp(key) ?:
- cmp(security_level) ?:
- cmp(upgrade));
+#define cmp(X) ((long)bundle->params.X - (long)cp->X)
+ diff = (cmp(peer) ?:
+ cmp(key) ?:
+ cmp(security_level) ?:
+ cmp(upgrade));
#undef cmp
- if (diff < 0) {
- p = p->rb_left;
- } else if (diff > 0) {
- p = p->rb_right;
- } else {
- if (rxrpc_may_reuse_conn(conn) &&
- rxrpc_get_connection_maybe(conn))
- goto found_extant_conn;
- /* The connection needs replacing. It's better
- * to effect that when we have something to
- * replace it with so that we don't have to
- * rebalance the tree twice.
- */
- break;
- }
- }
- spin_unlock(&local->client_conns_lock);
- }
-
- /* There wasn't a connection yet or we need an exclusive connection.
- * We need to create a candidate and then potentially redo the search
- * in case we're racing with another thread also trying to connect on a
- * shareable connection.
- */
- _debug("new conn");
- candidate = rxrpc_alloc_client_connection(cp, gfp);
- if (IS_ERR(candidate)) {
- ret = PTR_ERR(candidate);
- goto error_peer;
+ if (diff < 0)
+ p = p->rb_left;
+ else if (diff > 0)
+ p = p->rb_right;
+ else
+ goto found_bundle;
}
+ spin_unlock(&local->client_bundles_lock);
+ _debug("not found");
- /* Add the call to the new connection's waiting list in case we're
- * going to have to wait for the connection to come live. It's our
- * connection, so we want first dibs on the channel slots. We would
- * normally have to take channel_lock but we do this before anyone else
- * can see the connection.
- */
- list_add(&call->chan_wait_link, &candidate->waiting_calls);
-
- if (cp->exclusive) {
- call->conn = candidate;
- call->security = candidate->security;
- call->security_ix = candidate->security_ix;
- call->service_id = candidate->service_id;
- _leave(" = 0 [exclusive %d]", candidate->debug_id);
- return 0;
- }
+ /* It wasn't. We need to add one. */
+ candidate = rxrpc_alloc_bundle(cp, gfp);
+ if (!candidate)
+ return NULL;
- /* Publish the new connection for userspace to find. We need to redo
- * the search before doing this lest we race with someone else adding a
- * conflicting instance.
- */
_debug("search 2");
- spin_lock(&local->client_conns_lock);
-
- pp = &local->client_conns.rb_node;
+ spin_lock(&local->client_bundles_lock);
+ pp = &local->client_bundles.rb_node;
parent = NULL;
while (*pp) {
parent = *pp;
- conn = rb_entry(parent, struct rxrpc_connection, client_node);
+ bundle = rb_entry(parent, struct rxrpc_bundle, local_node);
-#define cmp(X) ((long)conn->params.X - (long)candidate->params.X)
+#define cmp(X) ((long)bundle->params.X - (long)cp->X)
diff = (cmp(peer) ?:
cmp(key) ?:
cmp(security_level) ?:
cmp(upgrade));
#undef cmp
- if (diff < 0) {
+ if (diff < 0)
pp = &(*pp)->rb_left;
- } else if (diff > 0) {
+ else if (diff > 0)
pp = &(*pp)->rb_right;
- } else {
- if (rxrpc_may_reuse_conn(conn) &&
- rxrpc_get_connection_maybe(conn))
- goto found_extant_conn;
- /* The old connection is from an outdated epoch. */
- _debug("replace conn");
- clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags);
- rb_replace_node(&conn->client_node,
- &candidate->client_node,
- &local->client_conns);
- trace_rxrpc_client(conn, -1, rxrpc_client_replace);
- goto candidate_published;
- }
+ else
+ goto found_bundle_free;
}
- _debug("new conn");
- rb_link_node(&candidate->client_node, parent, pp);
- rb_insert_color(&candidate->client_node, &local->client_conns);
-
-candidate_published:
- set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
- call->conn = candidate;
- call->security = candidate->security;
- call->security_ix = candidate->security_ix;
- call->service_id = candidate->service_id;
- spin_unlock(&local->client_conns_lock);
- _leave(" = 0 [new %d]", candidate->debug_id);
- return 0;
+ _debug("new bundle");
+ candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id);
+ rb_link_node(&candidate->local_node, parent, pp);
+ rb_insert_color(&candidate->local_node, &local->client_bundles);
+ rxrpc_get_bundle(candidate);
+ spin_unlock(&local->client_bundles_lock);
+ _leave(" = %u [new]", candidate->debug_id);
+ return candidate;
+
+found_bundle_free:
+ kfree(candidate);
+found_bundle:
+ rxrpc_get_bundle(bundle);
+ spin_unlock(&local->client_bundles_lock);
+ _leave(" = %u [found]", bundle->debug_id);
+ return bundle;
+}
- /* We come here if we found a suitable connection already in existence.
- * Discard any candidate we may have allocated, and try to get a
- * channel on this one.
- */
-found_extant_conn:
- _debug("found conn");
- spin_unlock(&local->client_conns_lock);
+/*
+ * Create or find a client bundle to use for a call.
+ *
+ * If we return with a connection, the call will be on its waiting list. It's
+ * left to the caller to assign a channel and wake up the call.
+ */
+static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct rxrpc_conn_parameters *cp,
+ struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ struct rxrpc_bundle *bundle;
- if (candidate) {
- trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
- rxrpc_put_connection(candidate);
- candidate = NULL;
- }
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
- spin_lock(&conn->channel_lock);
- call->conn = conn;
- call->security = conn->security;
- call->security_ix = conn->security_ix;
- call->service_id = conn->service_id;
- list_add_tail(&call->chan_wait_link, &conn->waiting_calls);
- spin_unlock(&conn->channel_lock);
- _leave(" = 0 [extant %d]", conn->debug_id);
- return 0;
+ cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp);
+ if (!cp->peer)
+ goto error;
+
+ call->cong_cwnd = cp->peer->cong_cwnd;
+ if (call->cong_cwnd >= call->cong_ssthresh)
+ call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ else
+ call->cong_mode = RXRPC_CALL_SLOW_START;
+ if (cp->upgrade)
+ __set_bit(RXRPC_CALL_UPGRADE, &call->flags);
+
+ /* Find the client connection bundle. */
+ bundle = rxrpc_look_up_bundle(cp, gfp);
+ if (!bundle)
+ goto error;
+
+ /* Get this call queued. Someone else may activate it whilst we're
+ * lining up a new connection, but that's fine.
+ */
+ spin_lock(&bundle->channel_lock);
+ list_add_tail(&call->chan_wait_link, &bundle->waiting_calls);
+ spin_unlock(&bundle->channel_lock);
+
+ _leave(" = [B=%x]", bundle->debug_id);
+ return bundle;
-error_peer:
- rxrpc_put_peer(cp->peer);
- cp->peer = NULL;
error:
- _leave(" = %d", ret);
- return ret;
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
}
/*
- * Activate a connection.
+ * Allocate a new connection and add it into a bundle.
*/
-static void rxrpc_activate_conn(struct rxrpc_net *rxnet,
- struct rxrpc_connection *conn)
+static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp)
+ __releases(bundle->channel_lock)
{
- if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_upgrade);
- conn->cache_state = RXRPC_CONN_CLIENT_UPGRADE;
- } else {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_active);
- conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
- }
- rxnet->nr_active_client_conns++;
- list_move_tail(&conn->cache_link, &rxnet->active_client_conns);
-}
+ struct rxrpc_connection *candidate = NULL, *old = NULL;
+ bool conflict;
+ int i;
-/*
- * Attempt to animate a connection for a new call.
- *
- * If it's not exclusive, the connection is in the endpoint tree, and we're in
- * the conn's list of those waiting to grab a channel. There is, however, a
- * limit on the number of live connections allowed at any one time, so we may
- * have to wait for capacity to become available.
- *
- * Note that a connection on the waiting queue might *also* have active
- * channels if it has been culled to make space and then re-requested by a new
- * call.
- */
-static void rxrpc_animate_client_conn(struct rxrpc_net *rxnet,
- struct rxrpc_connection *conn)
-{
- unsigned int nr_conns;
+ _enter("");
- _enter("%d,%d", conn->debug_id, conn->cache_state);
+ conflict = bundle->alloc_conn;
+ if (!conflict)
+ bundle->alloc_conn = true;
+ spin_unlock(&bundle->channel_lock);
+ if (conflict) {
+ _leave(" [conf]");
+ return;
+ }
- if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE ||
- conn->cache_state == RXRPC_CONN_CLIENT_UPGRADE)
- goto out;
+ candidate = rxrpc_alloc_client_connection(bundle, gfp);
- spin_lock(&rxnet->client_conn_cache_lock);
+ spin_lock(&bundle->channel_lock);
+ bundle->alloc_conn = false;
- nr_conns = rxnet->nr_client_conns;
- if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_count);
- rxnet->nr_client_conns = nr_conns + 1;
+ if (IS_ERR(candidate)) {
+ bundle->alloc_error = PTR_ERR(candidate);
+ spin_unlock(&bundle->channel_lock);
+ _leave(" [err %ld]", PTR_ERR(candidate));
+ return;
}
- switch (conn->cache_state) {
- case RXRPC_CONN_CLIENT_ACTIVE:
- case RXRPC_CONN_CLIENT_UPGRADE:
- case RXRPC_CONN_CLIENT_WAITING:
- break;
-
- case RXRPC_CONN_CLIENT_INACTIVE:
- case RXRPC_CONN_CLIENT_CULLED:
- case RXRPC_CONN_CLIENT_IDLE:
- if (nr_conns >= rxrpc_max_client_connections)
- goto wait_for_capacity;
- goto activate_conn;
+ bundle->alloc_error = 0;
+
+ for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) {
+ unsigned int shift = i * RXRPC_MAXCALLS;
+ int j;
+
+ old = bundle->conns[i];
+ if (!rxrpc_may_reuse_conn(old)) {
+ if (old)
+ trace_rxrpc_client(old, -1, rxrpc_client_replace);
+ candidate->bundle_shift = shift;
+ bundle->conns[i] = candidate;
+ for (j = 0; j < RXRPC_MAXCALLS; j++)
+ set_bit(shift + j, &bundle->avail_chans);
+ candidate = NULL;
+ break;
+ }
- default:
- BUG();
+ old = NULL;
}
-out_unlock:
- spin_unlock(&rxnet->client_conn_cache_lock);
-out:
- _leave(" [%d]", conn->cache_state);
- return;
+ spin_unlock(&bundle->channel_lock);
-activate_conn:
- _debug("activate");
- rxrpc_activate_conn(rxnet, conn);
- goto out_unlock;
-
-wait_for_capacity:
- _debug("wait");
- trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
- conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
- list_move_tail(&conn->cache_link, &rxnet->waiting_client_conns);
- goto out_unlock;
+ if (candidate) {
+ _debug("discard C=%x", candidate->debug_id);
+ trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate);
+ rxrpc_put_connection(candidate);
+ }
+
+ rxrpc_put_connection(old);
+ _leave("");
}
/*
- * Deactivate a channel.
+ * Add a connection to a bundle if there are no usable connections or we have
+ * connections waiting for extra capacity.
*/
-static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn,
- unsigned int channel)
+static void rxrpc_maybe_add_conn(struct rxrpc_bundle *bundle, gfp_t gfp)
{
- struct rxrpc_channel *chan = &conn->channels[channel];
+ struct rxrpc_call *call;
+ int i, usable;
- rcu_assign_pointer(chan->call, NULL);
- conn->active_chans &= ~(1 << channel);
+ _enter("");
+
+ spin_lock(&bundle->channel_lock);
+
+ /* See if there are any usable connections. */
+ usable = 0;
+ for (i = 0; i < ARRAY_SIZE(bundle->conns); i++)
+ if (rxrpc_may_reuse_conn(bundle->conns[i]))
+ usable++;
+
+ if (!usable && !list_empty(&bundle->waiting_calls)) {
+ call = list_first_entry(&bundle->waiting_calls,
+ struct rxrpc_call, chan_wait_link);
+ if (test_bit(RXRPC_CALL_UPGRADE, &call->flags))
+ bundle->try_upgrade = true;
+ }
+
+ if (!usable)
+ goto alloc_conn;
+
+ if (!bundle->avail_chans &&
+ !bundle->try_upgrade &&
+ !list_empty(&bundle->waiting_calls) &&
+ usable < ARRAY_SIZE(bundle->conns))
+ goto alloc_conn;
+
+ spin_unlock(&bundle->channel_lock);
+ _leave("");
+ return;
+
+alloc_conn:
+ return rxrpc_add_conn_to_bundle(bundle, gfp);
}
/*
@@ -549,35 +508,42 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
unsigned int channel)
{
struct rxrpc_channel *chan = &conn->channels[channel];
- struct rxrpc_call *call = list_entry(conn->waiting_calls.next,
+ struct rxrpc_bundle *bundle = conn->bundle;
+ struct rxrpc_call *call = list_entry(bundle->waiting_calls.next,
struct rxrpc_call, chan_wait_link);
u32 call_id = chan->call_counter + 1;
+ _enter("C=%x,%u", conn->debug_id, channel);
+
trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
/* Cancel the final ACK on the previous call if it hasn't been sent yet
* as the DATA packet will implicitly ACK it.
*/
clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags);
-
- write_lock_bh(&call->state_lock);
- call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
- write_unlock_bh(&call->state_lock);
+ clear_bit(conn->bundle_shift + channel, &bundle->avail_chans);
rxrpc_see_call(call);
list_del_init(&call->chan_wait_link);
- conn->active_chans |= 1 << channel;
call->peer = rxrpc_get_peer(conn->params.peer);
+ call->conn = rxrpc_get_connection(conn);
call->cid = conn->proto.cid | channel;
call->call_id = call_id;
+ call->security = conn->security;
+ call->security_ix = conn->security_ix;
+ call->service_id = conn->service_id;
trace_rxrpc_connect_call(call);
_net("CONNECT call %08x:%08x as call %d on conn %d",
call->cid, call->call_id, call->debug_id, conn->debug_id);
- /* Paired with the read barrier in rxrpc_wait_for_channel(). This
- * orders cid and epoch in the connection wrt to call_id without the
- * need to take the channel_lock.
+ write_lock_bh(&call->state_lock);
+ call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+ write_unlock_bh(&call->state_lock);
+
+ /* Paired with the read barrier in rxrpc_connect_call(). This orders
+ * cid and epoch in the connection wrt to call_id without the need to
+ * take the channel_lock.
*
* We provisionally assign a callNumber at this point, but we don't
* confirm it until the call is about to be exposed.
@@ -586,101 +552,137 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
* at the call ID through a connection channel.
*/
smp_wmb();
- chan->call_id = call_id;
- chan->call_debug_id = call->debug_id;
+
+ chan->call_id = call_id;
+ chan->call_debug_id = call->debug_id;
rcu_assign_pointer(chan->call, call);
wake_up(&call->waitq);
}
/*
+ * Remove a connection from the idle list if it's on it.
+ */
+static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn)
+{
+ struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+ bool drop_ref;
+
+ if (!list_empty(&conn->cache_link)) {
+ drop_ref = false;
+ spin_lock(&rxnet->client_conn_cache_lock);
+ if (!list_empty(&conn->cache_link)) {
+ list_del_init(&conn->cache_link);
+ drop_ref = true;
+ }
+ spin_unlock(&rxnet->client_conn_cache_lock);
+ if (drop_ref)
+ rxrpc_put_connection(conn);
+ }
+}
+
+/*
* Assign channels and callNumbers to waiting calls with channel_lock
* held by caller.
*/
-static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn)
+static void rxrpc_activate_channels_locked(struct rxrpc_bundle *bundle)
{
- u8 avail, mask;
-
- switch (conn->cache_state) {
- case RXRPC_CONN_CLIENT_ACTIVE:
- mask = RXRPC_ACTIVE_CHANS_MASK;
- break;
- case RXRPC_CONN_CLIENT_UPGRADE:
- mask = 0x01;
- break;
- default:
- return;
- }
+ struct rxrpc_connection *conn;
+ unsigned long avail, mask;
+ unsigned int channel, slot;
- while (!list_empty(&conn->waiting_calls) &&
- (avail = ~conn->active_chans,
- avail &= mask,
- avail != 0))
- rxrpc_activate_one_channel(conn, __ffs(avail));
+ if (bundle->try_upgrade)
+ mask = 1;
+ else
+ mask = ULONG_MAX;
+
+ while (!list_empty(&bundle->waiting_calls)) {
+ avail = bundle->avail_chans & mask;
+ if (!avail)
+ break;
+ channel = __ffs(avail);
+ clear_bit(channel, &bundle->avail_chans);
+
+ slot = channel / RXRPC_MAXCALLS;
+ conn = bundle->conns[slot];
+ if (!conn)
+ break;
+
+ if (bundle->try_upgrade)
+ set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags);
+ rxrpc_unidle_conn(bundle, conn);
+
+ channel &= (RXRPC_MAXCALLS - 1);
+ conn->act_chans |= 1 << channel;
+ rxrpc_activate_one_channel(conn, channel);
+ }
}
/*
* Assign channels and callNumbers to waiting calls.
*/
-static void rxrpc_activate_channels(struct rxrpc_connection *conn)
+static void rxrpc_activate_channels(struct rxrpc_bundle *bundle)
{
- _enter("%d", conn->debug_id);
+ _enter("B=%x", bundle->debug_id);
- trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans);
+ trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans);
- if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK)
+ if (!bundle->avail_chans)
return;
- spin_lock(&conn->channel_lock);
- rxrpc_activate_channels_locked(conn);
- spin_unlock(&conn->channel_lock);
+ spin_lock(&bundle->channel_lock);
+ rxrpc_activate_channels_locked(bundle);
+ spin_unlock(&bundle->channel_lock);
_leave("");
}
/*
* Wait for a callNumber and a channel to be granted to a call.
*/
-static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
+static int rxrpc_wait_for_channel(struct rxrpc_bundle *bundle,
+ struct rxrpc_call *call, gfp_t gfp)
{
+ DECLARE_WAITQUEUE(myself, current);
int ret = 0;
_enter("%d", call->debug_id);
- if (!call->call_id) {
- DECLARE_WAITQUEUE(myself, current);
+ if (!gfpflags_allow_blocking(gfp)) {
+ rxrpc_maybe_add_conn(bundle, gfp);
+ rxrpc_activate_channels(bundle);
+ ret = bundle->alloc_error ?: -EAGAIN;
+ goto out;
+ }
- if (!gfpflags_allow_blocking(gfp)) {
- ret = -EAGAIN;
- goto out;
+ add_wait_queue_exclusive(&call->waitq, &myself);
+ for (;;) {
+ rxrpc_maybe_add_conn(bundle, gfp);
+ rxrpc_activate_channels(bundle);
+ ret = bundle->alloc_error;
+ if (ret < 0)
+ break;
+
+ switch (call->interruptibility) {
+ case RXRPC_INTERRUPTIBLE:
+ case RXRPC_PREINTERRUPTIBLE:
+ set_current_state(TASK_INTERRUPTIBLE);
+ break;
+ case RXRPC_UNINTERRUPTIBLE:
+ default:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ break;
}
-
- add_wait_queue_exclusive(&call->waitq, &myself);
- for (;;) {
- switch (call->interruptibility) {
- case RXRPC_INTERRUPTIBLE:
- case RXRPC_PREINTERRUPTIBLE:
- set_current_state(TASK_INTERRUPTIBLE);
- break;
- case RXRPC_UNINTERRUPTIBLE:
- default:
- set_current_state(TASK_UNINTERRUPTIBLE);
- break;
- }
- if (call->call_id)
- break;
- if ((call->interruptibility == RXRPC_INTERRUPTIBLE ||
- call->interruptibility == RXRPC_PREINTERRUPTIBLE) &&
- signal_pending(current)) {
- ret = -ERESTARTSYS;
- break;
- }
- schedule();
+ if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_AWAIT_CONN)
+ break;
+ if ((call->interruptibility == RXRPC_INTERRUPTIBLE ||
+ call->interruptibility == RXRPC_PREINTERRUPTIBLE) &&
+ signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
}
- remove_wait_queue(&call->waitq, &myself);
- __set_current_state(TASK_RUNNING);
+ schedule();
}
-
- /* Paired with the write barrier in rxrpc_activate_one_channel(). */
- smp_rmb();
+ remove_wait_queue(&call->waitq, &myself);
+ __set_current_state(TASK_RUNNING);
out:
_leave(" = %d", ret);
@@ -697,52 +699,50 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
struct sockaddr_rxrpc *srx,
gfp_t gfp)
{
+ struct rxrpc_bundle *bundle;
struct rxrpc_net *rxnet = cp->local->rxnet;
- int ret;
+ int ret = 0;
_enter("{%d,%lx},", call->debug_id, call->user_call_ID);
rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper);
- rxrpc_cull_active_client_conns(rxnet);
- ret = rxrpc_get_client_conn(rx, call, cp, srx, gfp);
- if (ret < 0)
+ bundle = rxrpc_prep_call(rx, call, cp, srx, gfp);
+ if (IS_ERR(bundle)) {
+ ret = PTR_ERR(bundle);
goto out;
+ }
- rxrpc_animate_client_conn(rxnet, call->conn);
- rxrpc_activate_channels(call->conn);
-
- ret = rxrpc_wait_for_channel(call, gfp);
- if (ret < 0) {
- trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed);
- rxrpc_disconnect_client_call(call);
- goto out;
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_CONN) {
+ ret = rxrpc_wait_for_channel(bundle, call, gfp);
+ if (ret < 0)
+ goto wait_failed;
}
- spin_lock_bh(&call->conn->params.peer->lock);
- hlist_add_head_rcu(&call->error_link,
- &call->conn->params.peer->error_targets);
- spin_unlock_bh(&call->conn->params.peer->lock);
+granted_channel:
+ /* Paired with the write barrier in rxrpc_activate_one_channel(). */
+ smp_rmb();
+out_put_bundle:
+ rxrpc_put_bundle(bundle);
out:
_leave(" = %d", ret);
return ret;
-}
-/*
- * Note that a connection is about to be exposed to the world. Once it is
- * exposed, we maintain an extra ref on it that stops it from being summarily
- * discarded before it's (a) had a chance to deal with retransmission and (b)
- * had a chance at re-use (the per-connection security negotiation is
- * expensive).
- */
-static void rxrpc_expose_client_conn(struct rxrpc_connection *conn,
- unsigned int channel)
-{
- if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
- rxrpc_get_connection(conn);
+wait_failed:
+ spin_lock(&bundle->channel_lock);
+ list_del_init(&call->chan_wait_link);
+ spin_unlock(&bundle->channel_lock);
+
+ if (call->state != RXRPC_CALL_CLIENT_AWAIT_CONN) {
+ ret = 0;
+ goto granted_channel;
}
+
+ trace_rxrpc_client(call->conn, ret, rxrpc_client_chan_wait_failed);
+ rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret);
+ rxrpc_disconnect_client_call(bundle, call);
+ goto out_put_bundle;
}
/*
@@ -764,7 +764,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
chan->call_counter++;
if (chan->call_counter >= INT_MAX)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
- rxrpc_expose_client_conn(conn, channel);
+ trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
}
}
@@ -773,62 +773,56 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
*/
static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet)
{
- unsigned long now = jiffies;
- unsigned long reap_at = now + rxrpc_conn_idle_client_expiry;
+ if (!rxnet->kill_all_client_conns) {
+ unsigned long now = jiffies;
+ unsigned long reap_at = now + rxrpc_conn_idle_client_expiry;
- if (rxnet->live)
- timer_reduce(&rxnet->client_conn_reap_timer, reap_at);
+ if (rxnet->live)
+ timer_reduce(&rxnet->client_conn_reap_timer, reap_at);
+ }
}
/*
* Disconnect a client call.
*/
-void rxrpc_disconnect_client_call(struct rxrpc_call *call)
+void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call *call)
{
- struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_connection *conn;
struct rxrpc_channel *chan = NULL;
- struct rxrpc_net *rxnet = conn->params.local->rxnet;
- unsigned int channel = -1;
+ struct rxrpc_net *rxnet = bundle->params.local->rxnet;
+ unsigned int channel;
+ bool may_reuse;
u32 cid;
- spin_lock(&conn->channel_lock);
- set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
+ _enter("c=%x", call->debug_id);
- cid = call->cid;
- if (cid) {
- channel = cid & RXRPC_CHANNELMASK;
- chan = &conn->channels[channel];
- }
- trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
+ spin_lock(&bundle->channel_lock);
+ set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
/* Calls that have never actually been assigned a channel can simply be
- * discarded. If the conn didn't get used either, it will follow
- * immediately unless someone else grabs it in the meantime.
+ * discarded.
*/
- if (!list_empty(&call->chan_wait_link)) {
+ conn = call->conn;
+ if (!conn) {
_debug("call is waiting");
ASSERTCMP(call->call_id, ==, 0);
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
list_del_init(&call->chan_wait_link);
-
- trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted);
-
- /* We must deactivate or idle the connection if it's now
- * waiting for nothing.
- */
- spin_lock(&rxnet->client_conn_cache_lock);
- if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING &&
- list_empty(&conn->waiting_calls) &&
- !conn->active_chans)
- goto idle_connection;
goto out;
}
+ cid = call->cid;
+ channel = cid & RXRPC_CHANNELMASK;
+ chan = &conn->channels[channel];
+ trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
+
if (rcu_access_pointer(chan->call) != call) {
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&bundle->channel_lock);
BUG();
}
+ may_reuse = rxrpc_may_reuse_conn(conn);
+
/* If a client call was exposed to the world, we save the result for
* retransmission.
*
@@ -841,14 +835,21 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
_debug("exposed %u,%u", call->call_id, call->abort_code);
__rxrpc_disconnect_call(conn, call);
+
+ if (test_and_clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_active);
+ bundle->try_upgrade = false;
+ if (may_reuse)
+ rxrpc_activate_channels_locked(bundle);
+ }
+
}
/* See if we can pass the channel directly to another call. */
- if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE &&
- !list_empty(&conn->waiting_calls)) {
+ if (may_reuse && !list_empty(&bundle->waiting_calls)) {
trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
rxrpc_activate_one_channel(conn, channel);
- goto out_2;
+ goto out;
}
/* Schedule the final ACK to be transmitted in a short while so that it
@@ -865,128 +866,95 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
rxrpc_reduce_conn_timer(conn, final_ack_at);
}
- /* Things are more complex and we need the cache lock. We might be
- * able to simply idle the conn or it might now be lurking on the wait
- * list. It might even get moved back to the active list whilst we're
- * waiting for the lock.
- */
- spin_lock(&rxnet->client_conn_cache_lock);
-
- switch (conn->cache_state) {
- case RXRPC_CONN_CLIENT_UPGRADE:
- /* Deal with termination of a service upgrade probe. */
- if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags);
- trace_rxrpc_client(conn, channel, rxrpc_client_to_active);
- conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE;
- rxrpc_activate_channels_locked(conn);
- }
- fallthrough;
- case RXRPC_CONN_CLIENT_ACTIVE:
- if (list_empty(&conn->waiting_calls)) {
- rxrpc_deactivate_one_channel(conn, channel);
- if (!conn->active_chans) {
- rxnet->nr_active_client_conns--;
- goto idle_connection;
- }
- goto out;
- }
-
- trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass);
- rxrpc_activate_one_channel(conn, channel);
- goto out;
+ /* Deactivate the channel. */
+ rcu_assign_pointer(chan->call, NULL);
+ set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans);
+ conn->act_chans &= ~(1 << channel);
- case RXRPC_CONN_CLIENT_CULLED:
- rxrpc_deactivate_one_channel(conn, channel);
- ASSERT(list_empty(&conn->waiting_calls));
- if (!conn->active_chans)
- goto idle_connection;
- goto out;
+ /* If no channels remain active, then put the connection on the idle
+ * list for a short while. Give it a ref to stop it going away if it
+ * becomes unbundled.
+ */
+ if (!conn->act_chans) {
+ trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
+ conn->idle_timestamp = jiffies;
- case RXRPC_CONN_CLIENT_WAITING:
- rxrpc_deactivate_one_channel(conn, channel);
- goto out;
+ rxrpc_get_connection(conn);
+ spin_lock(&rxnet->client_conn_cache_lock);
+ list_move_tail(&conn->cache_link, &rxnet->idle_client_conns);
+ spin_unlock(&rxnet->client_conn_cache_lock);
- default:
- BUG();
+ rxrpc_set_client_reap_timer(rxnet);
}
out:
- spin_unlock(&rxnet->client_conn_cache_lock);
-out_2:
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&bundle->channel_lock);
_leave("");
return;
+}
-idle_connection:
- /* As no channels remain active, the connection gets deactivated
- * immediately or moved to the idle list for a short while.
- */
- if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) {
- trace_rxrpc_client(conn, channel, rxrpc_client_to_idle);
- conn->idle_timestamp = jiffies;
- conn->cache_state = RXRPC_CONN_CLIENT_IDLE;
- list_move_tail(&conn->cache_link, &rxnet->idle_client_conns);
- if (rxnet->idle_client_conns.next == &conn->cache_link &&
- !rxnet->kill_all_client_conns)
- rxrpc_set_client_reap_timer(rxnet);
- } else {
- trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
- conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
- list_del_init(&conn->cache_link);
+/*
+ * Remove a connection from a bundle.
+ */
+static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
+{
+ struct rxrpc_bundle *bundle = conn->bundle;
+ struct rxrpc_local *local = bundle->params.local;
+ unsigned int bindex;
+ bool need_drop = false;
+ int i;
+
+ _enter("C=%x", conn->debug_id);
+
+ spin_lock(&bundle->channel_lock);
+ bindex = conn->bundle_shift / RXRPC_MAXCALLS;
+ if (bundle->conns[bindex] == conn) {
+ _debug("clear slot %u", bindex);
+ bundle->conns[bindex] = NULL;
+ for (i = 0; i < RXRPC_MAXCALLS; i++)
+ clear_bit(conn->bundle_shift + i, &bundle->avail_chans);
+ need_drop = true;
+ }
+ spin_unlock(&bundle->channel_lock);
+
+ /* If there are no more connections, remove the bundle */
+ if (!bundle->avail_chans) {
+ _debug("maybe unbundle");
+ spin_lock(&local->client_bundles_lock);
+
+ for (i = 0; i < ARRAY_SIZE(bundle->conns); i++)
+ if (bundle->conns[i])
+ break;
+ if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) {
+ _debug("erase bundle");
+ rb_erase(&bundle->local_node, &local->client_bundles);
+ }
+
+ spin_unlock(&local->client_bundles_lock);
+ if (i == ARRAY_SIZE(bundle->conns))
+ rxrpc_put_bundle(bundle);
}
- goto out;
+
+ if (need_drop)
+ rxrpc_put_connection(conn);
+ _leave("");
}
/*
* Clean up a dead client connection.
*/
-static struct rxrpc_connection *
-rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
+static void rxrpc_kill_client_conn(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *next = NULL;
struct rxrpc_local *local = conn->params.local;
struct rxrpc_net *rxnet = local->rxnet;
- unsigned int nr_conns;
- trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+ _enter("C=%x", conn->debug_id);
- if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
- spin_lock(&local->client_conns_lock);
- if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
- &conn->flags))
- rb_erase(&conn->client_node, &local->client_conns);
- spin_unlock(&local->client_conns_lock);
- }
+ trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+ atomic_dec(&rxnet->nr_client_conns);
rxrpc_put_client_connection_id(conn);
-
- ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
-
- if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
- spin_lock(&rxnet->client_conn_cache_lock);
- nr_conns = --rxnet->nr_client_conns;
-
- if (nr_conns < rxrpc_max_client_connections &&
- !list_empty(&rxnet->waiting_client_conns)) {
- next = list_entry(rxnet->waiting_client_conns.next,
- struct rxrpc_connection, cache_link);
- rxrpc_get_connection(next);
- rxrpc_activate_conn(rxnet, next);
- }
-
- spin_unlock(&rxnet->client_conn_cache_lock);
- }
-
rxrpc_kill_connection(conn);
- if (next)
- rxrpc_activate_channels(next);
-
- /* We need to get rid of the temporary ref we took upon next, but we
- * can't call rxrpc_put_connection() recursively.
- */
- return next;
}
/*
@@ -998,63 +966,12 @@ void rxrpc_put_client_conn(struct rxrpc_connection *conn)
unsigned int debug_id = conn->debug_id;
int n;
- do {
- n = atomic_dec_return(&conn->usage);
- trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here);
- if (n > 0)
- return;
+ n = atomic_dec_return(&conn->usage);
+ trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here);
+ if (n <= 0) {
ASSERTCMP(n, >=, 0);
-
- conn = rxrpc_put_one_client_conn(conn);
- } while (conn);
-}
-
-/*
- * Kill the longest-active client connections to make room for new ones.
- */
-static void rxrpc_cull_active_client_conns(struct rxrpc_net *rxnet)
-{
- struct rxrpc_connection *conn;
- unsigned int nr_conns = rxnet->nr_client_conns;
- unsigned int nr_active, limit;
-
- _enter("");
-
- ASSERTCMP(nr_conns, >=, 0);
- if (nr_conns < rxrpc_max_client_connections) {
- _leave(" [ok]");
- return;
- }
- limit = rxrpc_reap_client_connections;
-
- spin_lock(&rxnet->client_conn_cache_lock);
- nr_active = rxnet->nr_active_client_conns;
-
- while (nr_active > limit) {
- ASSERT(!list_empty(&rxnet->active_client_conns));
- conn = list_entry(rxnet->active_client_conns.next,
- struct rxrpc_connection, cache_link);
- ASSERTIFCMP(conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE,
- conn->cache_state, ==, RXRPC_CONN_CLIENT_UPGRADE);
-
- if (list_empty(&conn->waiting_calls)) {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
- conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
- list_del_init(&conn->cache_link);
- } else {
- trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
- conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
- list_move_tail(&conn->cache_link,
- &rxnet->waiting_client_conns);
- }
-
- nr_active--;
+ rxrpc_kill_client_conn(conn);
}
-
- rxnet->nr_active_client_conns = nr_active;
- spin_unlock(&rxnet->client_conn_cache_lock);
- ASSERTCMP(nr_active, >=, 0);
- _leave(" [culled]");
}
/*
@@ -1088,7 +1005,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
/* We keep an estimate of what the number of conns ought to be after
* we've discarded some so that we don't overdo the discarding.
*/
- nr_conns = rxnet->nr_client_conns;
+ nr_conns = atomic_read(&rxnet->nr_client_conns);
next:
spin_lock(&rxnet->client_conn_cache_lock);
@@ -1098,7 +1015,6 @@ next:
conn = list_entry(rxnet->idle_client_conns.next,
struct rxrpc_connection, cache_link);
- ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags));
if (!rxnet->kill_all_client_conns) {
/* If the number of connections is over the reap limit, we
@@ -1120,18 +1036,13 @@ next:
}
trace_rxrpc_client(conn, -1, rxrpc_client_discard);
- if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
- BUG();
- conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
list_del_init(&conn->cache_link);
spin_unlock(&rxnet->client_conn_cache_lock);
- /* When we cleared the EXPOSED flag, we took on responsibility for the
- * reference that that had on the usage count. We deal with that here.
- * If someone re-sets the flag and re-gets the ref, that's fine.
- */
- rxrpc_put_connection(conn);
+ rxrpc_unbundle_conn(conn);
+ rxrpc_put_connection(conn); /* Drop the ->cache_link ref */
+
nr_conns--;
goto next;
@@ -1145,8 +1056,7 @@ not_yet_expired:
*/
_debug("not yet");
if (!rxnet->kill_all_client_conns)
- timer_reduce(&rxnet->client_conn_reap_timer,
- conn_expires_at);
+ timer_reduce(&rxnet->client_conn_reap_timer, conn_expires_at);
out:
spin_unlock(&rxnet->client_conn_cache_lock);
@@ -1181,37 +1091,27 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local)
{
struct rxrpc_connection *conn, *tmp;
struct rxrpc_net *rxnet = local->rxnet;
- unsigned int nr_active;
LIST_HEAD(graveyard);
_enter("");
spin_lock(&rxnet->client_conn_cache_lock);
- nr_active = rxnet->nr_active_client_conns;
list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns,
cache_link) {
if (conn->params.local == local) {
- ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_IDLE);
-
trace_rxrpc_client(conn, -1, rxrpc_client_discard);
- if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
- BUG();
- conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
list_move(&conn->cache_link, &graveyard);
- nr_active--;
}
}
- rxnet->nr_active_client_conns = nr_active;
spin_unlock(&rxnet->client_conn_cache_lock);
- ASSERTCMP(nr_active, >=, 0);
while (!list_empty(&graveyard)) {
conn = list_entry(graveyard.next,
struct rxrpc_connection, cache_link);
list_del_init(&conn->cache_link);
-
+ rxrpc_unbundle_conn(conn);
rxrpc_put_connection(conn);
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 447f55ca6886..0628dad2bdea 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -157,12 +157,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
_enter("{%d},%x", conn->debug_id, conn->abort_code);
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
for (i = 0; i < RXRPC_MAXCALLS; i++) {
call = rcu_dereference_protected(
conn->channels[i].call,
- lockdep_is_held(&conn->channel_lock));
+ lockdep_is_held(&conn->bundle->channel_lock));
if (call) {
if (compl == RXRPC_CALL_LOCALLY_ABORTED)
trace_rxrpc_abort(call->debug_id,
@@ -179,7 +179,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
}
}
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
_leave("");
}
@@ -210,6 +210,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
conn->error = error;
conn->abort_code = abort_code;
conn->state = RXRPC_CONN_LOCALLY_ABORTED;
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
spin_unlock_bh(&conn->state_lock);
msg.msg_name = &conn->params.peer->srx.transport;
@@ -319,6 +320,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
conn->error = -ECONNABORTED;
conn->abort_code = abort_code;
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
+ set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial);
return -ECONNABORTED;
@@ -339,7 +341,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (ret < 0)
return ret;
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
spin_lock(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) {
@@ -349,12 +351,12 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
rxrpc_call_is_secure(
rcu_dereference_protected(
conn->channels[loop].call,
- lockdep_is_held(&conn->channel_lock)));
+ lockdep_is_held(&conn->bundle->channel_lock)));
} else {
spin_unlock(&conn->state_lock);
}
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
return 0;
default:
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 8cbe0bf20ed5..3bcbe0665f91 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -41,8 +41,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
if (conn) {
INIT_LIST_HEAD(&conn->cache_link);
- spin_lock_init(&conn->channel_lock);
- INIT_LIST_HEAD(&conn->waiting_calls);
timer_setup(&conn->timer, &rxrpc_connection_timer, 0);
INIT_WORK(&conn->processor, &rxrpc_process_connection);
INIT_LIST_HEAD(&conn->proc_link);
@@ -219,11 +217,11 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
}
if (rxrpc_is_client_call(call))
- return rxrpc_disconnect_client_call(call);
+ return rxrpc_disconnect_client_call(conn->bundle, call);
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
__rxrpc_disconnect_call(conn, call);
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
conn->idle_timestamp = jiffies;
@@ -292,12 +290,13 @@ void rxrpc_see_connection(struct rxrpc_connection *conn)
/*
* Get a ref on a connection.
*/
-void rxrpc_get_connection(struct rxrpc_connection *conn)
+struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn)
{
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(&conn->usage);
trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here);
+ return conn;
}
/*
@@ -365,6 +364,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
conn->security->clear(conn);
key_put(conn->params.key);
key_put(conn->server_key);
+ rxrpc_put_bundle(conn->bundle);
rxrpc_put_peer(conn->params.peer);
if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns))
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index 21da48e3d2e5..6c847720494f 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -8,6 +8,12 @@
#include <linux/slab.h>
#include "ar-internal.h"
+static struct rxrpc_bundle rxrpc_service_dummy_bundle = {
+ .usage = ATOMIC_INIT(1),
+ .debug_id = UINT_MAX,
+ .channel_lock = __SPIN_LOCK_UNLOCKED(&rxrpc_service_dummy_bundle.channel_lock),
+};
+
/*
* Find a service connection under RCU conditions.
*
@@ -127,6 +133,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
*/
conn->state = RXRPC_CONN_SERVICE_PREALLOC;
atomic_set(&conn->usage, 2);
+ conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle);
atomic_inc(&rxnet->nr_conns);
write_lock(&rxnet->conn_lock);
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index ede058f9cc15..8c2881054266 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -86,8 +86,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
init_rwsem(&local->defrag_sem);
skb_queue_head_init(&local->reject_queue);
skb_queue_head_init(&local->event_queue);
- local->client_conns = RB_ROOT;
- spin_lock_init(&local->client_conns_lock);
+ local->client_bundles = RB_ROOT;
+ spin_lock_init(&local->client_bundles_lock);
spin_lock_init(&local->lock);
rwlock_init(&local->services_lock);
local->debug_id = atomic_inc_return(&rxrpc_debug_id);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index b312aab80fed..25bbc4cc8b13 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -62,13 +62,10 @@ static __net_init int rxrpc_init_net(struct net *net)
timer_setup(&rxnet->service_conn_reap_timer,
rxrpc_service_conn_reap_timeout, 0);
- rxnet->nr_client_conns = 0;
- rxnet->nr_active_client_conns = 0;
+ atomic_set(&rxnet->nr_client_conns, 0);
rxnet->kill_all_client_conns = false;
spin_lock_init(&rxnet->client_conn_cache_lock);
spin_lock_init(&rxnet->client_conn_discard_lock);
- INIT_LIST_HEAD(&rxnet->waiting_client_conns);
- INIT_LIST_HEAD(&rxnet->active_client_conns);
INIT_LIST_HEAD(&rxnet->idle_client_conns);
INIT_WORK(&rxnet->client_conn_reaper,
rxrpc_discard_expired_client_conns);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 3cfff7922ba8..10f2bf2e9068 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -357,6 +357,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
_enter(",{%d}", skb->len);
+ if (hlist_unhashed(&call->error_link)) {
+ spin_lock_bh(&call->peer->lock);
+ hlist_add_head_rcu(&call->error_link, &call->peer->error_targets);
+ spin_unlock_bh(&call->peer->lock);
+ }
+
/* Each transmission of a Tx packet needs a new serial number */
serial = atomic_inc_return(&conn->serial);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 543afd9bd664..e2f990754f88 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -165,7 +165,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
"Proto Local "
" Remote "
" SvID ConnID End Use State Key "
- " Serial ISerial\n"
+ " Serial ISerial CallId0 CallId1 CallId2 CallId3\n"
);
return 0;
}
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index 1221b0637a7e..4e565eeab426 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -14,7 +14,6 @@
#define RXRPC_RTO_MAX ((unsigned)(120 * HZ))
#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */
-#define rxrpc_min_rtt_wlen 300 /* As sysctl_tcp_min_rtt_wlen */
static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
{
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index e08130e5746b..f114dc2af5cf 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -1169,7 +1169,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
if (response->encrypted.checksum != csum)
goto protocol_error_free;
- spin_lock(&conn->channel_lock);
+ spin_lock(&conn->bundle->channel_lock);
for (i = 0; i < RXRPC_MAXCALLS; i++) {
struct rxrpc_call *call;
u32 call_id = ntohl(response->encrypted.call_id[i]);
@@ -1186,13 +1186,13 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
if (call_id > conn->channels[i].call_counter) {
call = rcu_dereference_protected(
conn->channels[i].call,
- lockdep_is_held(&conn->channel_lock));
+ lockdep_is_held(&conn->bundle->channel_lock));
if (call && call->state < RXRPC_CALL_COMPLETE)
goto protocol_error_unlock;
conn->channels[i].call_counter = call_id;
}
}
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
eproto = tracepoint_string("rxkad_rsp_seq");
abort_code = RXKADOUTOFSEQUENCE;
@@ -1219,7 +1219,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
return 0;
protocol_error_unlock:
- spin_unlock(&conn->channel_lock);
+ spin_unlock(&conn->bundle->channel_lock);
protocol_error_free:
kfree(ticket);
protocol_error:
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index e91acc95ff28..540351d6a5f4 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -74,21 +74,13 @@ static struct ctl_table rxrpc_sysctl_table[] = {
/* Non-time values */
{
- .procname = "max_client_conns",
- .data = &rxrpc_max_client_connections,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&rxrpc_reap_client_connections,
- },
- {
.procname = "reap_client_conns",
.data = &rxrpc_reap_client_connections,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)SYSCTL_ONE,
- .extra2 = (void *)&rxrpc_max_client_connections,
+ .extra2 = (void *)&n_65535,
},
{
.procname = "max_backlog",
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 063d8aaf2900..f64af9d9dfee 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -976,7 +976,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
#endif
NL_SET_ERR_MSG(extack, "Failed to load TC action module");
err = -ENOENT;
- goto err_out;
+ goto err_free;
}
/* backward compatibility for policer */
@@ -1013,11 +1013,12 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
err_mod:
module_put(a_o->owner);
-err_out:
+err_free:
if (cookie) {
kfree(cookie->data);
kfree(cookie);
}
+err_out:
return ERR_PTR(err);
}
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 2c3619165680..47f9128ecb8f 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1039,7 +1039,7 @@ drop:
static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
[TCA_CT_ACTION] = { .type = NLA_U16 },
- [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+ [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
[TCA_CT_ZONE] = { .type = NLA_U16 },
[TCA_CT_MARK] = { .type = NLA_U32 },
[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
@@ -1049,10 +1049,8 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
.len = 128 / BITS_PER_BYTE },
[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
- [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr) },
- [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
};
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index b5042f3ea079..e88fa19ea8a9 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -144,9 +144,8 @@ out:
}
static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
- [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct
- tc_ctinfo) },
+ [TCA_CTINFO_ACT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)),
[TCA_CTINFO_ZONE] = { .type = NLA_U16 },
[TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
[TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index 1fb8d428d2c1..f5dd4d1d274e 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -159,8 +159,8 @@ static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = {
};
static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = {
- [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate),
- .type = NLA_EXACT_LEN },
+ [TCA_GATE_PARMS] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)),
[TCA_GATE_PRIORITY] = { .type = NLA_S32 },
[TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED },
[TCA_GATE_BASE_TIME] = { .type = NLA_U64 },
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 8d735461fa19..fdb69d46276d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1351,7 +1351,7 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc)
}
/* We did not find anything useful for a possible retransmission
- * path; either primary path that we found is the the same as
+ * path; either primary path that we found is the same as
* the current one, or we didn't generally find an active one.
*/
if (trans_sec == NULL)
@@ -1537,7 +1537,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
/* If we've reached or overflowed our receive buffer, announce
* a 0 rwnd if rwnd would still be positive. Store the
- * the potential pressure overflow so that the window can be restored
+ * potential pressure overflow so that the window can be restored
* back to original value.
*/
if (rx_count >= asoc->base.sk->sk_rcvbuf)
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 9e289c770574..589868d96e3f 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -445,7 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey(
}
/*
- * Initialize all the possible digest transforms that we can use. Right now
+ * Initialize all the possible digest transforms that we can use. Right
* now, the supported digests are SHA1 and SHA256. We do this here once
* because of the restrictiong that transforms may only be allocated in
* user context. This forces us to pre-allocated all possible transforms
@@ -810,7 +810,7 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
}
/* Set a new shared key on either endpoint or association. If the
- * the key with a same ID already exists, replace the key (remove the
+ * key with a same ID already exists, replace the key (remove the
* old key and add a new one).
*/
int sctp_auth_set_key(struct sctp_endpoint *ep,
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 701c5a4e441d..53e5ed79f63f 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -505,7 +505,7 @@ int sctp_in_scope(struct net *net, const union sctp_addr *addr,
return 0;
/*
* For INIT and INIT-ACK address list, let L be the level of
- * of requested destination address, sender and receiver
+ * requested destination address, sender and receiver
* SHOULD include all of its addresses with level greater
* than or equal to L.
*
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ab6a997e222f..fd4f8243cc35 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -179,7 +179,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
__func__, asoc, max_data);
}
- /* If the the peer requested that we authenticate DATA chunks
+ /* If the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
* DATA.
*/
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d19db22262fd..25833238fe93 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -372,7 +372,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
* Level 3 - private addresses.
* Level 4 - global addresses
* For INIT and INIT-ACK address list, let L be the level of
- * of requested destination address, sender and receiver
+ * requested destination address, sender and receiver
* SHOULD include all of its addresses with level greater
* than or equal to L.
*
@@ -1483,10 +1483,10 @@ static __init int sctp_init(void)
num_entries = (1UL << order) * PAGE_SIZE /
sizeof(struct sctp_bind_hashbucket);
- /* And finish by rounding it down to the nearest power of two
- * this wastes some memory of course, but its needed because
+ /* And finish by rounding it down to the nearest power of two.
+ * This wastes some memory of course, but it's needed because
* the hash function operates based on the assumption that
- * that the number of entries is a power of two
+ * the number of entries is a power of two.
*/
sctp_port_hashsize = rounddown_pow_of_two(num_entries);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c11c24524652..9a56ae2f3651 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1235,7 +1235,7 @@ nodata:
/* Create an Operation Error chunk of a fixed size, specifically,
* min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
- * This is a helper function to allocate an error chunk for for those
+ * This is a helper function to allocate an error chunk for those
* invalid parameter codes in which we may not want to report all the
* errors, if the incoming chunk is large. If it can't fit in a single
* packet, we ignore it.
@@ -1780,7 +1780,7 @@ no_hmac:
* for init collision case of lost COOKIE ACK.
* If skb has been timestamped, then use the stamp, otherwise
* use current time. This introduces a small possibility that
- * that a cookie may be considered expired, but his would only slow
+ * a cookie may be considered expired, but this would only slow
* down the new association establishment instead of every packet.
*/
if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
@@ -2319,7 +2319,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
/* This implementation defaults to making the first transport
* added as the primary transport. The source address seems to
- * be a a better choice than any of the embedded addresses.
+ * be a better choice than any of the embedded addresses.
*/
if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
goto nomem;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 1c6c640607c5..407fed46931b 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -740,7 +740,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq)
/* Helper function to gather skbs that have possibly become
- * ordered by an an incoming chunk.
+ * ordered by an incoming chunk.
*/
static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e7649bbc2b87..ed8f97166be9 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -55,6 +55,9 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
+struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+struct workqueue_struct *smc_close_wq; /* wq for close work */
+
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
@@ -436,10 +439,10 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc)
static void smcr_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
- smc->conn.peer_rmbe_idx = clc->rmbe_idx;
- smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
@@ -448,10 +451,10 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc,
static void smcd_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
- smc->conn.peer_rmbe_idx = clc->dmbe_idx;
- smc->conn.peer_token = clc->token;
+ smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
+ smc->conn.peer_token = clc->d0.token;
/* msg header takes up space in the buffer */
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
@@ -470,11 +473,11 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc)
{
- link->peer_qpn = ntoh24(clc->qpn);
- memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
- memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
- link->peer_psn = ntoh24(clc->psn);
- link->peer_mtu = clc->qp_mtu;
+ link->peer_qpn = ntoh24(clc->r0.qpn);
+ memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->r0.psn);
+ link->peer_mtu = clc->r0.qp_mtu;
}
static void smc_switch_to_fallback(struct smc_sock *smc)
@@ -523,11 +526,11 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
- int local_contact)
+ int local_first)
{
bool is_smcd = smc->conn.lgr->is_smcd;
- if (local_contact == SMC_FIRST_CONTACT)
+ if (local_first)
smc_lgr_cleanup_early(&smc->conn);
else
smc_conn_free(&smc->conn);
@@ -613,9 +616,9 @@ static int smc_connect_rdma(struct smc_sock *smc,
struct smc_link *link;
ini->is_smcd = false;
- ini->ib_lcl = &aclc->lcl;
- ini->ib_clcqpn = ntoh24(aclc->qpn);
- ini->srv_first_contact = aclc->hdr.flag;
+ ini->ib_lcl = &aclc->r0.lcl;
+ ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+ ini->first_contact_peer = aclc->hdr.flag;
mutex_lock(&smc_client_lgr_pending);
reason_code = smc_conn_create(smc, ini);
@@ -626,7 +629,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc_conn_save_peer_info(smc, aclc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
link = smc->conn.lnk;
} else {
/* set link that was assigned by server */
@@ -634,60 +637,62 @@ static int smc_connect_rdma(struct smc_sock *smc,
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
struct smc_link *l = &smc->conn.lgr->lnk[i];
- if (l->peer_qpn == ntoh24(aclc->qpn) &&
- !memcmp(l->peer_gid, &aclc->lcl.gid, SMC_GID_SIZE) &&
- !memcmp(l->peer_mac, &aclc->lcl.mac, sizeof(l->peer_mac))) {
+ if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+ !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+ SMC_GID_SIZE) &&
+ !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+ sizeof(l->peer_mac))) {
link = l;
break;
}
}
if (!link)
return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK,
- ini->cln_first_contact);
+ ini->first_contact_local);
smc->conn.lnk = link;
}
/* create send buffer and rmb */
if (smc_buf_create(smc, false))
return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
- ini->cln_first_contact);
+ ini->first_contact_local);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
+ if (ini->first_contact_local)
smc_link_save_peer_info(link, aclc);
if (smc_rmb_rtoken_handling(&smc->conn, link, aclc))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
- ini->cln_first_contact);
+ ini->first_contact_local);
smc_close_init(smc);
smc_rx_init(smc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
if (smc_ib_ready_link(link))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
- ini->cln_first_contact);
+ ini->first_contact_local);
} else {
if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc))
return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
- ini->cln_first_contact);
+ ini->first_contact_local);
}
smc_rmb_sync_sg_for_device(&smc->conn);
reason_code = smc_clc_send_confirm(smc);
if (reason_code)
return smc_connect_abort(smc, reason_code,
- ini->cln_first_contact);
+ ini->first_contact_local);
smc_tx_init(smc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
/* QP confirmation over RoCE fabric */
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
reason_code = smcr_clnt_conf_first_link(smc);
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
if (reason_code)
return smc_connect_abort(smc, reason_code,
- ini->cln_first_contact);
+ ini->first_contact_local);
}
mutex_unlock(&smc_client_lgr_pending);
@@ -707,8 +712,8 @@ static int smc_connect_ism(struct smc_sock *smc,
int rc = 0;
ini->is_smcd = true;
- ini->ism_gid = aclc->gid;
- ini->srv_first_contact = aclc->hdr.flag;
+ ini->ism_peer_gid = aclc->d0.gid;
+ ini->first_contact_peer = aclc->hdr.flag;
/* there is only one lgr role for SMC-D; use server lock */
mutex_lock(&smc_server_lgr_pending);
@@ -724,7 +729,7 @@ static int smc_connect_ism(struct smc_sock *smc,
return smc_connect_abort(smc, (rc == -ENOSPC) ?
SMC_CLC_DECL_MAX_DMB :
SMC_CLC_DECL_MEM,
- ini->cln_first_contact);
+ ini->first_contact_local);
smc_conn_save_peer_info(smc, aclc);
smc_close_init(smc);
@@ -733,7 +738,7 @@ static int smc_connect_ism(struct smc_sock *smc,
rc = smc_clc_send_confirm(smc);
if (rc)
- return smc_connect_abort(smc, rc, ini->cln_first_contact);
+ return smc_connect_abort(smc, rc, ini->first_contact_local);
mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
@@ -903,7 +908,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
if (smc->use_fallback)
goto out;
if (flags & O_NONBLOCK) {
- if (schedule_work(&smc->connect_work))
+ if (queue_work(smc_hs_wq, &smc->connect_work))
smc->connect_nonblock = 1;
rc = -EINPROGRESS;
} else {
@@ -940,10 +945,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
- rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
- if (rc < 0)
+ if (rc < 0 && rc != -EAGAIN)
lsk->sk_err = -rc;
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
@@ -956,6 +961,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
goto out;
}
+ /* new clcsock has inherited the smc listen-specific sk_data_ready
+ * function; switch it back to the original sk_data_ready function
+ */
+ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
(*new_smc)->clcsock = new_clcsock;
out:
return rc;
@@ -1123,10 +1132,10 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
- int local_contact)
+ bool local_first)
{
/* RDMA setup failed, switch back to TCP */
- if (local_contact == SMC_FIRST_CONTACT)
+ if (local_first)
smc_lgr_cleanup_early(&new_smc->conn);
else
smc_conn_free(&new_smc->conn);
@@ -1182,30 +1191,16 @@ static int smc_listen_ism_init(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
- struct smc_clc_msg_smcd *pclc_smcd;
int rc;
- pclc_smcd = smc_get_clc_msg_smcd(pclc);
- ini->ism_gid = pclc_smcd->gid;
rc = smc_conn_create(new_smc, ini);
if (rc)
return rc;
- /* Check if peer can be reached via ISM device */
- if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
- new_smc->conn.lgr->vlan_id,
- new_smc->conn.lgr->smcd)) {
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_SMCDNOTALK;
- }
-
/* Create send and receive buffers */
rc = smc_buf_create(new_smc, true);
if (rc) {
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
+ if (ini->first_contact_local)
smc_lgr_cleanup_early(&new_smc->conn);
else
smc_conn_free(&new_smc->conn);
@@ -1217,11 +1212,11 @@ static int smc_listen_ism_init(struct smc_sock *new_smc,
}
/* listen worker: register buffers */
-static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
{
struct smc_connection *conn = &new_smc->conn;
- if (local_contact != SMC_FIRST_CONTACT) {
+ if (!local_first) {
if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
return SMC_CLC_DECL_ERR_REGRMB;
}
@@ -1233,35 +1228,25 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
struct smc_clc_msg_accept_confirm *cclc,
- int local_contact)
+ bool local_first)
{
struct smc_link *link = new_smc->conn.lnk;
int reason_code = 0;
- if (local_contact == SMC_FIRST_CONTACT)
+ if (local_first)
smc_link_save_peer_info(link, cclc);
- if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) {
- reason_code = SMC_CLC_DECL_ERR_RTOK;
- goto decline;
- }
+ if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+ return SMC_CLC_DECL_ERR_RTOK;
- if (local_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link)) {
- reason_code = SMC_CLC_DECL_ERR_RDYLNK;
- goto decline;
- }
+ if (local_first) {
+ if (smc_ib_ready_link(link))
+ return SMC_CLC_DECL_ERR_RDYLNK;
/* QP confirmation over RoCE fabric */
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
reason_code = smcr_serv_conf_first_link(new_smc);
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
- if (reason_code)
- goto decline;
}
- return 0;
-
-decline:
- smc_listen_decline(new_smc, reason_code, local_contact);
return reason_code;
}
@@ -1272,10 +1257,10 @@ static void smc_listen_work(struct work_struct *work)
smc_listen_work);
struct socket *newclcsock = new_smc->clcsock;
struct smc_clc_msg_accept_confirm cclc;
+ struct smc_clc_msg_proposal_area *buf;
struct smc_clc_msg_proposal *pclc;
struct smc_init_info ini = {0};
bool ism_supported = false;
- u8 buf[SMC_CLC_MAX_LEN];
int rc = 0;
if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
@@ -1297,8 +1282,13 @@ static void smc_listen_work(struct work_struct *work)
/* do inband token exchange -
* wait for and receive SMC Proposal CLC message
*/
- pclc = (struct smc_clc_msg_proposal *)&buf;
- rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+ pclc = (struct smc_clc_msg_proposal *)buf;
+ rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
if (rc)
goto out_decl;
@@ -1327,7 +1317,10 @@ static void smc_listen_work(struct work_struct *work)
/* check if ISM is available */
if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
+ struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+
ini.is_smcd = true; /* prepare ISM check */
+ ini.ism_peer_gid = pclc_smcd->gid;
rc = smc_find_ism_device(new_smc, &ini);
if (!rc)
rc = smc_listen_ism_init(new_smc, pclc, &ini);
@@ -1354,13 +1347,13 @@ static void smc_listen_work(struct work_struct *work)
rc = smc_listen_rdma_init(new_smc, &ini);
if (rc)
goto out_unlock;
- rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
+ rc = smc_listen_rdma_reg(new_smc, ini.first_contact_local);
if (rc)
goto out_unlock;
}
/* send SMC Accept CLC message */
- rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
+ rc = smc_clc_send_accept(new_smc, ini.first_contact_local);
if (rc)
goto out_unlock;
@@ -1380,19 +1373,21 @@ static void smc_listen_work(struct work_struct *work)
/* finish worker */
if (!ism_supported) {
rc = smc_listen_rdma_finish(new_smc, &cclc,
- ini.cln_first_contact);
- mutex_unlock(&smc_server_lgr_pending);
+ ini.first_contact_local);
if (rc)
- return;
+ goto out_unlock;
+ mutex_unlock(&smc_server_lgr_pending);
}
smc_conn_save_peer_info(new_smc, &cclc);
smc_listen_out_connected(new_smc);
- return;
+ goto out_free;
out_unlock:
mutex_unlock(&smc_server_lgr_pending);
out_decl:
- smc_listen_decline(new_smc, rc, ini.cln_first_contact);
+ smc_listen_decline(new_smc, rc, ini.first_contact_local);
+out_free:
+ kfree(buf);
}
static void smc_tcp_listen_work(struct work_struct *work)
@@ -1406,7 +1401,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
lock_sock(lsk);
while (lsk->sk_state == SMC_LISTEN) {
rc = smc_clcsock_accept(lsmc, &new_smc);
- if (rc)
+ if (rc) /* clcsock accept queue empty or error */
goto out;
if (!new_smc)
continue;
@@ -1420,13 +1415,29 @@ static void smc_tcp_listen_work(struct work_struct *work)
new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */
- if (!schedule_work(&new_smc->smc_listen_work))
+ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
}
out:
release_sock(lsk);
- sock_put(&lsmc->sk); /* sock_hold in smc_listen */
+ sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{
+ struct smc_sock *lsmc;
+
+ lsmc = (struct smc_sock *)
+ ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
+ if (!lsmc)
+ return;
+ lsmc->clcsk_data_ready(listen_clcsock);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+ if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ sock_put(&lsmc->sk);
+ }
}
static int smc_listen(struct socket *sock, int backlog)
@@ -1455,15 +1466,19 @@ static int smc_listen(struct socket *sock, int backlog)
if (!smc->use_fallback)
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ /* save original sk_data_ready function and establish
+ * smc-specific sk_data_ready function
+ */
+ smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
+ smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
+ smc->clcsock->sk->sk_user_data =
+ (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
goto out;
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN;
- sock_hold(sk); /* sock_hold in tcp_listen_worker */
- if (!schedule_work(&smc->tcp_listen_work))
- sock_put(sk);
out:
release_sock(sk);
@@ -1788,8 +1803,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
if (val)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
}
break;
case TCP_CORK:
@@ -1797,8 +1812,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
if (!val)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
}
break;
case TCP_DEFER_ACCEPT:
@@ -2081,10 +2096,19 @@ static int __init smc_init(void)
if (rc)
goto out_pernet_subsys;
+ rc = -ENOMEM;
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ if (!smc_hs_wq)
+ goto out_pnet;
+
+ smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ if (!smc_close_wq)
+ goto out_alloc_hs_wq;
+
rc = smc_core_init();
if (rc) {
pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_alloc_wqs;
}
rc = smc_llc_init();
@@ -2136,6 +2160,10 @@ out_proto:
proto_unregister(&smc_proto);
out_core:
smc_core_exit();
+out_alloc_wqs:
+ destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+ destroy_workqueue(smc_hs_wq);
out_pnet:
smc_pnet_exit();
out_pernet_subsys:
@@ -2150,6 +2178,8 @@ static void __exit smc_exit(void)
sock_unregister(PF_SMC);
smc_core_exit();
smc_ib_unregister_client();
+ destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 6f1c42da7a4c..2bd57e57b7e7 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -18,6 +18,8 @@
#include "smc_ib.h"
+#define SMC_V1 1 /* SMC version V1 */
+
#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
@@ -201,6 +203,8 @@ struct smc_connection {
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
+ void (*clcsk_data_ready)(struct sock *sk);
+ /* original data_ready fct. **/
struct smc_connection conn; /* smc connection */
struct smc_sock *listen_smc; /* listen parent */
struct work_struct connect_work; /* handle non-blocking connect*/
@@ -235,6 +239,9 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
return (struct smc_sock *)sk;
}
+extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+extern struct workqueue_struct *smc_close_wq; /* wq for close work */
+
#define SMC_SYSTEMID_LEN 8
extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index ce468ff62a19..b1ce6ccbfaec 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -299,7 +299,7 @@ static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc,
conn->lnk = link;
spin_unlock_bh(&conn->send_lock);
sock_hold(&smc->sk); /* sock_put in abort_work */
- if (!schedule_work(&conn->abort_work))
+ if (!queue_work(smc_close_wq, &conn->abort_work))
sock_put(&smc->sk);
}
}
@@ -368,7 +368,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(&smc->sk, SOCK_DONE);
sock_hold(&smc->sk); /* sock_put in close_work */
- if (!schedule_work(&conn->close_work))
+ if (!queue_work(smc_close_wq, &conn->close_work))
sock_put(&smc->sk);
}
}
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 779f4142a11d..bd116e1949b9 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -153,7 +153,6 @@ static int smc_clc_prfx_set(struct socket *clcsock,
struct sockaddr_in *addr;
int rc = -ENOENT;
- memset(prop, 0, sizeof(*prop));
if (!dst) {
rc = -ENOTCONN;
goto out;
@@ -320,7 +319,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
- (clcm->version < SMC_CLC_V1) ||
+ (clcm->version < SMC_V1) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
@@ -389,7 +388,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
dclc.hdr.type = SMC_CLC_DECLINE;
dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
- dclc.hdr.version = SMC_CLC_V1;
+ dclc.hdr.version = SMC_V1;
dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) &&
smc_ib_is_valid_local_systemid())
@@ -412,138 +411,171 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
struct smc_init_info *ini)
{
- struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
- struct smc_clc_msg_proposal_prefix pclc_prfx;
- struct smc_clc_msg_smcd pclc_smcd;
- struct smc_clc_msg_proposal pclc;
- struct smc_clc_msg_trail trl;
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_msg_proposal *pclc_base;
+ struct smc_clc_msg_proposal_area *pclc;
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ struct smc_clc_msg_trail *trl;
int len, i, plen, rc;
int reason_code = 0;
struct kvec vec[5];
struct msghdr msg;
+ pclc = kzalloc(sizeof(*pclc), GFP_KERNEL);
+ if (!pclc)
+ return -ENOMEM;
+
+ pclc_base = &pclc->pclc_base;
+ pclc_smcd = &pclc->pclc_smcd;
+ pclc_prfx = &pclc->pclc_prfx;
+ ipv6_prfx = pclc->pclc_prfx_ipv6;
+ trl = &pclc->pclc_trl;
+
/* retrieve ip prefixes for CLC proposal msg */
- rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx);
- if (rc)
+ rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx);
+ if (rc) {
+ kfree(pclc);
return SMC_CLC_DECL_CNFERR; /* configuration error */
+ }
/* send SMC Proposal CLC message */
- plen = sizeof(pclc) + sizeof(pclc_prfx) +
- (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
- sizeof(trl);
- memset(&pclc, 0, sizeof(pclc));
- memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
- pclc.hdr.type = SMC_CLC_PROPOSAL;
- pclc.hdr.version = SMC_CLC_V1; /* SMC version */
- pclc.hdr.path = smc_type;
+ plen = sizeof(*pclc_base) + sizeof(*pclc_prfx) +
+ (pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
+ sizeof(*trl);
+ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ pclc_base->hdr.type = SMC_CLC_PROPOSAL;
+ pclc_base->hdr.version = SMC_V1; /* SMC version */
+ pclc_base->hdr.path = smc_type;
if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) {
/* add SMC-R specifics */
- memcpy(pclc.lcl.id_for_peer, local_systemid,
+ memcpy(pclc_base->lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE);
- memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
+ memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE);
+ memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
ETH_ALEN);
- pclc.iparea_offset = htons(0);
+ pclc_base->iparea_offset = htons(0);
}
if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
/* add SMC-D specifics */
- memset(&pclc_smcd, 0, sizeof(pclc_smcd));
- plen += sizeof(pclc_smcd);
- pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
- pclc_smcd.gid = ini->ism_dev->local_gid;
+ plen += sizeof(*pclc_smcd);
+ pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
+ pclc_smcd->gid = ini->ism_dev->local_gid;
}
- pclc.hdr.length = htons(plen);
+ pclc_base->hdr.length = htons(plen);
- memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
i = 0;
- vec[i].iov_base = &pclc;
- vec[i++].iov_len = sizeof(pclc);
+ vec[i].iov_base = pclc_base;
+ vec[i++].iov_len = sizeof(*pclc_base);
if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
- vec[i].iov_base = &pclc_smcd;
- vec[i++].iov_len = sizeof(pclc_smcd);
+ vec[i].iov_base = pclc_smcd;
+ vec[i++].iov_len = sizeof(*pclc_smcd);
}
- vec[i].iov_base = &pclc_prfx;
- vec[i++].iov_len = sizeof(pclc_prfx);
- if (pclc_prfx.ipv6_prefixes_cnt > 0) {
- vec[i].iov_base = &ipv6_prfx[0];
- vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt *
+ vec[i].iov_base = pclc_prfx;
+ vec[i++].iov_len = sizeof(*pclc_prfx);
+ if (pclc_prfx->ipv6_prefixes_cnt > 0) {
+ vec[i].iov_base = ipv6_prfx;
+ vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt *
sizeof(ipv6_prfx[0]);
}
- vec[i].iov_base = &trl;
- vec[i++].iov_len = sizeof(trl);
+ vec[i].iov_base = trl;
+ vec[i++].iov_len = sizeof(*trl);
/* due to the few bytes needed for clc-handshake this cannot block */
len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
if (len < 0) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
reason_code = -smc->sk.sk_err;
- } else if (len < (int)sizeof(pclc)) {
+ } else if (len < ntohs(pclc_base->hdr.length)) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
}
+ kfree(pclc);
return reason_code;
}
-/* send CLC CONFIRM message across internal TCP socket */
-int smc_clc_send_confirm(struct smc_sock *smc)
+/* build and send CLC CONFIRM / ACCEPT message */
+static int smc_clc_send_confirm_accept(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc,
+ int first_contact)
{
struct smc_connection *conn = &smc->conn;
- struct smc_clc_msg_accept_confirm cclc;
- struct smc_link *link;
- int reason_code = 0;
struct msghdr msg;
struct kvec vec;
- int len;
/* send SMC Confirm CLC msg */
- memset(&cclc, 0, sizeof(cclc));
- cclc.hdr.type = SMC_CLC_CONFIRM;
- cclc.hdr.version = SMC_CLC_V1; /* SMC version */
- if (smc->conn.lgr->is_smcd) {
+ clc->hdr.version = SMC_V1; /* SMC version */
+ if (first_contact)
+ clc->hdr.flag = 1;
+ if (conn->lgr->is_smcd) {
/* SMC-D specific settings */
- memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
- cclc.hdr.path = SMC_TYPE_D;
- cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
- cclc.gid = conn->lgr->smcd->local_gid;
- cclc.token = conn->rmb_desc->token;
- cclc.dmbe_size = conn->rmbe_size_short;
- cclc.dmbe_idx = 0;
- memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
- memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ clc->hdr.path = SMC_TYPE_D;
+ clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ clc->d0.gid = conn->lgr->smcd->local_gid;
+ clc->d0.token = conn->rmb_desc->token;
+ clc->d0.dmbe_size = conn->rmbe_size_short;
+ clc->d0.dmbe_idx = 0;
+ memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ memcpy(clc->d0.smcd_trl.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
} else {
+ struct smc_link *link = conn->lnk;
+
/* SMC-R specific settings */
link = conn->lnk;
- memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER,
+ memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER,
sizeof(SMC_EYECATCHER));
- cclc.hdr.path = SMC_TYPE_R;
- cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(cclc.lcl.id_for_peer, local_systemid,
+ clc->hdr.path = SMC_TYPE_R;
+ clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(clc->r0.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE);
- memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE);
+ memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
ETH_ALEN);
- hton24(cclc.qpn, link->roce_qp->qp_num);
- cclc.rmb_rkey =
+ hton24(clc->r0.qpn, link->roce_qp->qp_num);
+ clc->r0.rmb_rkey =
htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
- cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
- cclc.rmbe_alert_token = htonl(conn->alert_token_local);
- cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
- cclc.rmbe_size = conn->rmbe_size_short;
- cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
+ switch (clc->hdr.type) {
+ case SMC_CLC_ACCEPT:
+ clc->r0.qp_mtu = link->path_mtu;
+ break;
+ case SMC_CLC_CONFIRM:
+ clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ break;
+ }
+ clc->r0.rmbe_size = conn->rmbe_size_short;
+ clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
(conn->rmb_desc->sgt[link->link_idx].sgl));
- hton24(cclc.psn, link->psn_initial);
- memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+ hton24(clc->r0.psn, link->psn_initial);
+ memcpy(clc->r0.smcr_trl.eyecatcher, SMC_EYECATCHER,
sizeof(SMC_EYECATCHER));
}
memset(&msg, 0, sizeof(msg));
- vec.iov_base = &cclc;
- vec.iov_len = ntohs(cclc.hdr.length);
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
- ntohs(cclc.hdr.length));
+ vec.iov_base = clc;
+ vec.iov_len = ntohs(clc->hdr.length);
+ return kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ ntohs(clc->hdr.length));
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc)
+{
+ struct smc_clc_msg_accept_confirm cclc;
+ int reason_code = 0;
+ int len;
+
+ /* send SMC Confirm CLC msg */
+ memset(&cclc, 0, sizeof(cclc));
+ cclc.hdr.type = SMC_CLC_CONFIRM;
+ len = smc_clc_send_confirm_accept(smc, &cclc, 0);
if (len < ntohs(cclc.hdr.length)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
@@ -557,65 +589,14 @@ int smc_clc_send_confirm(struct smc_sock *smc)
}
/* send CLC ACCEPT message across internal TCP socket */
-int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact)
{
- struct smc_connection *conn = &new_smc->conn;
struct smc_clc_msg_accept_confirm aclc;
- struct smc_link *link;
- struct msghdr msg;
- struct kvec vec;
int len;
memset(&aclc, 0, sizeof(aclc));
aclc.hdr.type = SMC_CLC_ACCEPT;
- aclc.hdr.version = SMC_CLC_V1; /* SMC version */
- if (srv_first_contact)
- aclc.hdr.flag = 1;
-
- if (new_smc->conn.lgr->is_smcd) {
- /* SMC-D specific settings */
- aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER,
- sizeof(SMCD_EYECATCHER));
- aclc.hdr.path = SMC_TYPE_D;
- aclc.gid = conn->lgr->smcd->local_gid;
- aclc.token = conn->rmb_desc->token;
- aclc.dmbe_size = conn->rmbe_size_short;
- aclc.dmbe_idx = 0;
- memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
- memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
- sizeof(SMCD_EYECATCHER));
- } else {
- /* SMC-R specific settings */
- aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
- aclc.hdr.path = SMC_TYPE_R;
- link = conn->lnk;
- memcpy(aclc.lcl.id_for_peer, local_systemid,
- sizeof(local_systemid));
- memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE);
- memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
- ETH_ALEN);
- hton24(aclc.qpn, link->roce_qp->qp_num);
- aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
- aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
- aclc.rmbe_alert_token = htonl(conn->alert_token_local);
- aclc.qp_mtu = link->path_mtu;
- aclc.rmbe_size = conn->rmbe_size_short,
- aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
- (conn->rmb_desc->sgt[link->link_idx].sgl));
- hton24(aclc.psn, link->psn_initial);
- memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
- }
-
- memset(&msg, 0, sizeof(msg));
- vec.iov_base = &aclc;
- vec.iov_len = ntohs(aclc.hdr.length);
- len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
- ntohs(aclc.hdr.length));
+ len = smc_clc_send_confirm_accept(new_smc, &aclc, srv_first_contact);
if (len < ntohs(aclc.hdr.length))
len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index cf7b45306f4e..fcd8521c7737 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -22,7 +22,6 @@
#define SMC_CLC_CONFIRM 0x03
#define SMC_CLC_DECLINE 0x04
-#define SMC_CLC_V1 0x1 /* SMC version */
#define SMC_TYPE_R 0 /* SMC-R only */
#define SMC_TYPE_D 1 /* SMC-D only */
#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
@@ -38,7 +37,6 @@
#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
-#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */
#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
@@ -111,55 +109,58 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
__be16 iparea_offset; /* offset to IP address information area */
} __aligned(4);
-#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28
-#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \
- sizeof(struct smc_clc_ipv6_prefix))
-#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \
- SMC_CLC_PROPOSAL_MAX_OFFSET + \
- sizeof(struct smc_clc_msg_proposal_prefix) + \
- SMC_CLC_PROPOSAL_MAX_PREFIX + \
- sizeof(struct smc_clc_msg_trail))
+struct smc_clc_msg_proposal_area {
+ struct smc_clc_msg_proposal pclc_base;
+ struct smc_clc_msg_smcd pclc_smcd;
+ struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
+ struct smc_clc_msg_trail pclc_trl;
+};
-struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
- struct smc_clc_msg_hdr hdr;
- union {
- struct { /* SMC-R */
- struct smc_clc_msg_local lcl;
- u8 qpn[3]; /* QP number */
- __be32 rmb_rkey; /* RMB rkey */
- u8 rmbe_idx; /* Index of RMBE in RMB */
- __be32 rmbe_alert_token;/* unique connection id */
-#if defined(__BIG_ENDIAN_BITFIELD)
- u8 rmbe_size : 4, /* buf size (compressed) */
- qp_mtu : 4; /* QP mtu */
+struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token; /* unique connection id */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 qp_mtu : 4,
- rmbe_size : 4;
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
#endif
- u8 reserved;
- __be64 rmb_dma_addr; /* RMB virtual address */
- u8 reserved2;
- u8 psn[3]; /* packet sequence number */
- struct smc_clc_msg_trail smcr_trl;
- /* eye catcher "SMCR" EBCDIC */
- } __packed;
- struct { /* SMC-D */
- u64 gid; /* Sender GID */
- u64 token; /* DMB token */
- u8 dmbe_idx; /* DMBE index */
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+ struct smc_clc_msg_trail smcr_trl;
+ /* eye catcher "SMCR" EBCDIC */
+} __packed;
+
+struct smcd_clc_msg_accept_confirm { /* SMCD accept/confirm */
+ u64 gid; /* Sender GID */
+ u64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 dmbe_size : 4, /* buf size (compressed) */
- reserved3 : 4;
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 reserved3 : 4,
- dmbe_size : 4;
+ u8 reserved3 : 4,
+ dmbe_size : 4;
#endif
- u16 reserved4;
- u32 linkid; /* Link identifier */
- u32 reserved5[3];
- struct smc_clc_msg_trail smcd_trl;
- /* eye catcher "SMCD" EBCDIC */
- } __packed;
+ u16 reserved4;
+ u32 linkid; /* Link identifier */
+ u32 reserved5[3];
+ struct smc_clc_msg_trail smcd_trl;
+ /* eye catcher "SMCD" EBCDIC */
+} __packed;
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct smcd_clc_msg_accept_confirm d0; /* SMC-D */
};
} __packed; /* format defined in RFC7609 */
@@ -200,6 +201,6 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
struct smc_init_info *ini);
int smc_clc_send_confirm(struct smc_sock *smc);
-int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
+int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact);
#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 0e7409e469c0..0f9ffba07d26 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -210,9 +210,9 @@ again:
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk); /* wake up accept */
if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
+ smc->clcsock->sk->sk_user_data = NULL;
rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
- /* wake up kernel_accept of smc_tcp_listen_worker */
- smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
}
smc_close_cleanup_listen(sk);
release_sock(sk);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a406627b1d55..c811ae1a8add 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -34,7 +34,6 @@
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
-#define SMC_LGR_FREE_DELAY_FAST (8 * HZ)
static struct smc_lgr_list smc_lgr_list = { /* established link groups */
.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
@@ -70,7 +69,7 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
* creation. For client use a somewhat higher removal delay time,
* otherwise there is a risk of out-of-sync link groups.
*/
- if (!lgr->freeing && !lgr->freefast) {
+ if (!lgr->freeing) {
mod_delayed_work(system_wq, &lgr->free_work,
(!lgr->is_smcd && lgr->role == SMC_CLNT) ?
SMC_LGR_FREE_DELAY_CLNT :
@@ -78,15 +77,6 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
}
}
-void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
-{
- if (!lgr->freeing && !lgr->freefast) {
- lgr->freefast = 1;
- mod_delayed_work(system_wq, &lgr->free_work,
- SMC_LGR_FREE_DELAY_FAST);
- }
-}
-
/* Register connection's alert token in our lookup structure.
* To use rbtrees we have to implement our own insert core.
* Requires @conns_lock
@@ -227,7 +217,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn)
if (!list_empty(lgr_list))
list_del_init(lgr_list);
spin_unlock_bh(lgr_lock);
- smc_lgr_schedule_free_work_fast(lgr);
+ __smc_lgr_terminate(lgr, true);
}
static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
@@ -396,10 +386,15 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
rc = SMC_CLC_DECL_MEM;
goto ism_put_vlan;
}
+ lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
+ SMC_LGR_ID_SIZE, &lgr->id);
+ if (!lgr->tx_wq) {
+ rc = -ENOMEM;
+ goto free_lgr;
+ }
lgr->is_smcd = ini->is_smcd;
lgr->sync_err = 0;
lgr->terminating = 0;
- lgr->freefast = 0;
lgr->freeing = 0;
lgr->vlan_id = ini->vlan_id;
mutex_init(&lgr->sndbufs_lock);
@@ -418,7 +413,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
if (ini->is_smcd) {
/* SMC-D specific settings */
get_device(&ini->ism_dev->dev);
- lgr->peer_gid = ini->ism_gid;
+ lgr->peer_gid = ini->ism_peer_gid;
lgr->smcd = ini->ism_dev;
lgr_list = &ini->ism_dev->lgr_list;
lgr_lock = &lgr->smcd->lgr_lock;
@@ -437,7 +432,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lnk = &lgr->lnk[link_idx];
rc = smcr_link_init(lgr, lnk, link_idx, ini);
if (rc)
- goto free_lgr;
+ goto free_wq;
lgr_list = &smc_lgr_list.list;
lgr_lock = &smc_lgr_list.lock;
atomic_inc(&lgr_cnt);
@@ -448,6 +443,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
spin_unlock_bh(lgr_lock);
return 0;
+free_wq:
+ destroy_workqueue(lgr->tx_wq);
free_lgr:
kfree(lgr);
ism_put_vlan:
@@ -517,7 +514,7 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
smc->sk.sk_state != SMC_CLOSED) {
rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf);
if (!rc) {
- schedule_delayed_work(&conn->tx_work, 0);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0);
smc->sk.sk_data_ready(&smc->sk);
}
} else {
@@ -824,11 +821,10 @@ static void smc_lgr_free(struct smc_link_group *lgr)
}
smc_lgr_free_bufs(lgr);
+ destroy_workqueue(lgr->tx_wq);
if (lgr->is_smcd) {
- if (!lgr->terminating) {
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
- }
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
wake_up(&lgr->smcd->lgrs_deleted);
} else {
@@ -889,8 +885,6 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr)
if (lgr->is_smcd) {
smc_ism_signal_shutdown(lgr);
smcd_unregister_all_dmbs(lgr);
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
} else {
u32 rsn = lgr->llc_termination_rsn;
@@ -1296,9 +1290,9 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
- ini->cln_first_contact = SMC_FIRST_CONTACT;
+ ini->first_contact_local = 1;
role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- if (role == SMC_CLNT && ini->srv_first_contact)
+ if (role == SMC_CLNT && ini->first_contact_peer)
/* create new link group as well */
goto create;
@@ -1307,14 +1301,14 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
list_for_each_entry(lgr, lgr_list, list) {
write_lock_bh(&lgr->conns_lock);
if ((ini->is_smcd ?
- smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
+ smcd_lgr_match(lgr, ini->ism_dev, ini->ism_peer_gid) :
smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
!lgr->sync_err &&
lgr->vlan_id == ini->vlan_id &&
(role == SMC_CLNT || ini->is_smcd ||
lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
/* link group found */
- ini->cln_first_contact = SMC_REUSE_CONTACT;
+ ini->first_contact_local = 0;
conn->lgr = lgr;
rc = smc_lgr_register_conn(conn, false);
write_unlock_bh(&lgr->conns_lock);
@@ -1328,8 +1322,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
if (rc)
return rc;
- if (role == SMC_CLNT && !ini->srv_first_contact &&
- ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (role == SMC_CLNT && !ini->first_contact_peer &&
+ ini->first_contact_local) {
/* Server reuses a link group, but Client wants to start
* a new one
* send out_of_sync decline, reason synchr. error
@@ -1338,7 +1332,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
}
create:
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
rc = smc_lgr_create(smc, ini);
if (rc)
goto out;
@@ -1892,8 +1886,8 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
struct smc_link *lnk,
struct smc_clc_msg_accept_confirm *clc)
{
- conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr,
- clc->rmb_rkey);
+ conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr,
+ clc->r0.rmb_rkey);
if (conn->rtoken_idx < 0)
return conn->rtoken_idx;
return 0;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 1c4d5439d0ff..37a5903789b0 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -137,9 +137,6 @@ struct smc_link {
#define SMC_LINKS_PER_LGR_MAX 3
#define SMC_SINGLE_LINK 0
-#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
-#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
-
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
@@ -228,9 +225,9 @@ struct smc_link_group {
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
struct work_struct terminate_work; /* abnormal lgr termination */
+ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
- u8 freefast : 1; /* free worker scheduled fast */
u8 freeing : 1; /* lgr is being freed */
bool is_smcd; /* SMC-R or SMC-D */
@@ -294,9 +291,9 @@ struct smc_clc_msg_local;
struct smc_init_info {
u8 is_smcd;
+ u8 first_contact_peer;
+ u8 first_contact_local;
unsigned short vlan_id;
- int srv_first_contact;
- int cln_first_contact;
/* SMC-R */
struct smc_clc_msg_local *ib_lcl;
struct smc_ib_device *ib_dev;
@@ -304,7 +301,7 @@ struct smc_init_info {
u8 ib_port;
u32 ib_clcqpn;
/* SMC-D */
- u64 ism_gid;
+ u64 ism_peer_gid;
struct smcd_dev *ism_dev;
};
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index da9ba6d1679b..f15fca59b4b2 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -22,6 +22,15 @@
#include "smc.h"
#include "smc_core.h"
+struct smc_diag_dump_ctx {
+ int pos[2];
+};
+
+static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb)
+{
+ return (struct smc_diag_dump_ctx *)cb->ctx;
+}
+
static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
{
sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
@@ -193,13 +202,15 @@ errout:
}
static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb, int p_type)
{
+ struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb);
struct net *net = sock_net(skb->sk);
+ int snum = cb_ctx->pos[p_type];
struct nlattr *bc = NULL;
struct hlist_head *head;
+ int rc = 0, num = 0;
struct sock *sk;
- int rc = 0;
read_lock(&prot->h.smc_hash->lock);
head = &prot->h.smc_hash->ht;
@@ -209,13 +220,18 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
sk_for_each(sk, head) {
if (!net_eq(sock_net(sk), net))
continue;
+ if (num < snum)
+ goto next;
rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
- if (rc)
- break;
+ if (rc < 0)
+ goto out;
+next:
+ num++;
}
out:
read_unlock(&prot->h.smc_hash->lock);
+ cb_ctx->pos[p_type] = num;
return rc;
}
@@ -223,10 +239,10 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int rc = 0;
- rc = smc_diag_dump_proto(&smc_proto, skb, cb);
+ rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC);
if (!rc)
- rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
- return rc;
+ smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6);
+ return skb->len;
}
static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 3ea33466ebe9..2db967f2fb50 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1691,7 +1691,7 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
spin_lock_irqsave(&lgr->llc_event_q_lock, flags);
list_add_tail(&qentry->list, &lgr->llc_event_q);
spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags);
- schedule_work(&lgr->llc_event_work);
+ queue_work(system_highpri_wq, &lgr->llc_event_work);
}
/* copy received msg and add it to the event queue */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 30e5fac7034e..70684c49510e 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -928,7 +928,10 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
- !ismdev->going_away) {
+ !ismdev->going_away &&
+ (!ini->ism_peer_gid ||
+ !smc_ism_cantalk(ini->ism_peer_gid, ini->vlan_id,
+ ismdev))) {
ini->ism_dev = ismdev;
break;
}
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 54ba0443847e..4532c16bf85e 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -228,8 +228,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
/* for a corked socket defer the RDMA writes if there
* is still sufficient sndbuf_space available
*/
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_CORK_DELAY);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_CORK_DELAY);
else
smc_tx_sndbuf_nonempty(conn);
} /* while (msg_data_left(msg)) */
@@ -499,7 +499,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
if (conn->killed)
return -EPIPE;
rc = 0;
- mod_delayed_work(system_wq, &conn->tx_work,
+ mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
SMC_TX_WORK_DELAY);
}
return rc;
@@ -623,8 +623,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
return;
if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
!conn->killed) {
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_WORK_DELAY);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
return;
}
}
diff --git a/net/socket.c b/net/socket.c
index 0c0144604f81..82262e1922f9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2628,9 +2628,11 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
struct user_msghdr __user *umsg,
struct sockaddr __user *uaddr, unsigned int flags)
{
- /* disallow ancillary data requests from this path */
- if (msg->msg_control || msg->msg_controllen)
- return -EINVAL;
+ if (msg->msg_control || msg->msg_controllen) {
+ /* disallow ancillary data reqs unless cmsg is plain data */
+ if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
+ return -EINVAL;
+ }
return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 999eee1ed61c..6c86e2a7d942 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -108,8 +108,10 @@ proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp,
left -= (s - tmpbuf);
if (left && !isspace(*s))
return -EINVAL;
- while (left && isspace(*s))
- left--, s++;
+ while (left && isspace(*s)) {
+ left--;
+ s++;
+ }
} else
left = 0;
*(unsigned int *) table->data = value;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 4f6dc74adf45..c2ff42900b53 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -60,6 +60,7 @@ static int __net_init tipc_init_net(struct net *net)
tn->trial_addr = 0;
tn->addr_trial_end = 0;
tn->capabilities = TIPC_NODE_CAPABILITIES;
+ INIT_WORK(&tn->final_work.work, tipc_net_finalize_work);
memset(tn->node_id, 0, sizeof(tn->node_id));
memset(tn->node_id_string, 0, sizeof(tn->node_id_string));
tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
@@ -107,8 +108,13 @@ out_crypto:
static void __net_exit tipc_exit_net(struct net *net)
{
+ struct tipc_net *tn = tipc_net(net);
+
tipc_detach_loopback(net);
+ /* Make sure the tipc_net_finalize_work() finished */
+ cancel_work_sync(&tn->final_work.work);
tipc_net_stop(net);
+
tipc_bcast_stop(net);
tipc_nametbl_stop(net);
tipc_sk_rht_destroy(net);
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 631d83c9705f..1d57a4d3b05e 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -90,6 +90,12 @@ extern unsigned int tipc_net_id __read_mostly;
extern int sysctl_tipc_rmem[3] __read_mostly;
extern int sysctl_tipc_named_timeout __read_mostly;
+struct tipc_net_work {
+ struct work_struct work;
+ struct net *net;
+ u32 addr;
+};
+
struct tipc_net {
u8 node_id[NODE_ID_LEN];
u32 node_addr;
@@ -143,6 +149,8 @@ struct tipc_net {
/* TX crypto handler */
struct tipc_crypto *crypto_tx;
#endif
+ /* Work item for net finalize */
+ struct tipc_net_work final_work;
};
static inline struct tipc_net *tipc_net(struct net *net)
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 7c523dc81575..40c44101fe8e 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -36,22 +36,28 @@
#include <crypto/aead.h>
#include <crypto/aes.h>
+#include <crypto/rng.h>
#include "crypto.h"
+#include "msg.h"
+#include "bcast.h"
-#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */
-#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */
+#define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */
+#define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */
#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */
-#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */
+#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */
+
#define TIPC_MAX_TFMS_DEF 10
#define TIPC_MAX_TFMS_LIM 1000
+#define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */
+
/**
* TIPC Key ids
*/
enum {
- KEY_UNUSED = 0,
- KEY_MIN,
- KEY_1 = KEY_MIN,
+ KEY_MASTER = 0,
+ KEY_MIN = KEY_MASTER,
+ KEY_1 = 1,
KEY_2,
KEY_3,
KEY_MAX = KEY_3,
@@ -81,6 +87,8 @@ static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok",
/* Max TFMs number per key */
int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF;
+/* Key exchange switch, default: on */
+int sysctl_tipc_key_exchange_enabled __read_mostly = 1;
/**
* struct tipc_key - TIPC keys' status indicator
@@ -132,6 +140,8 @@ struct tipc_tfm {
* @mode: crypto mode is applied to the key
* @hint[]: a hint for user key
* @rcu: struct rcu_head
+ * @key: the aead key
+ * @gen: the key's generation
* @seqno: the key seqno (cluster scope)
* @refcnt: the key reference counter
*/
@@ -144,8 +154,10 @@ struct tipc_aead {
u32 salt;
u8 authsize;
u8 mode;
- char hint[TIPC_AEAD_HINT_LEN + 1];
+ char hint[2 * TIPC_AEAD_HINT_LEN + 1];
struct rcu_head rcu;
+ struct tipc_aead_key *key;
+ u16 gen;
atomic64_t seqno ____cacheline_aligned;
refcount_t refcnt ____cacheline_aligned;
@@ -165,26 +177,56 @@ struct tipc_crypto_stats {
* @node: TIPC node (RX)
* @aead: array of pointers to AEAD keys for encryption/decryption
* @peer_rx_active: replicated peer RX active key index
+ * @key_gen: TX/RX key generation
* @key: the key states
- * @working: the crypto is working or not
+ * @skey_mode: session key's mode
+ * @skey: received session key
+ * @wq: common workqueue on TX crypto
+ * @work: delayed work sched for TX/RX
+ * @key_distr: key distributing state
+ * @rekeying_intv: rekeying interval (in minutes)
* @stats: the crypto statistics
+ * @name: the crypto name
* @sndnxt: the per-peer sndnxt (TX)
* @timer1: general timer 1 (jiffies)
- * @timer2: general timer 1 (jiffies)
+ * @timer2: general timer 2 (jiffies)
+ * @working: the crypto is working or not
+ * @key_master: flag indicates if master key exists
+ * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.)
+ * @nokey: no key indication
* @lock: tipc_key lock
*/
struct tipc_crypto {
struct net *net;
struct tipc_node *node;
- struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */
+ struct tipc_aead __rcu *aead[KEY_MAX + 1];
atomic_t peer_rx_active;
+ u16 key_gen;
struct tipc_key key;
- u8 working:1;
+ u8 skey_mode;
+ struct tipc_aead_key *skey;
+ struct workqueue_struct *wq;
+ struct delayed_work work;
+#define KEY_DISTR_SCHED 1
+#define KEY_DISTR_COMPL 2
+ atomic_t key_distr;
+ u32 rekeying_intv;
+
struct tipc_crypto_stats __percpu *stats;
+ char name[48];
atomic64_t sndnxt ____cacheline_aligned;
unsigned long timer1;
unsigned long timer2;
+ union {
+ struct {
+ u8 working:1;
+ u8 key_master:1;
+ u8 legacy_user:1;
+ u8 nokey: 1;
+ };
+ u8 flags;
+ };
spinlock_t lock; /* crypto lock */
} ____cacheline_aligned;
@@ -234,23 +276,35 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
u8 new_active,
u8 new_pending);
static int tipc_crypto_key_attach(struct tipc_crypto *c,
- struct tipc_aead *aead, u8 pos);
+ struct tipc_aead *aead, u8 pos,
+ bool master_key);
static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending);
static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
struct tipc_crypto *rx,
- struct sk_buff *skb);
-static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active,
- struct tipc_msg *hdr);
+ struct sk_buff *skb,
+ u8 tx_key);
+static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb);
static int tipc_crypto_key_revoke(struct net *net, u8 tx_key);
+static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode, u8 type);
static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
struct tipc_bearer *b,
struct sk_buff **skb, int err);
static void tipc_crypto_do_cmd(struct net *net, int cmd);
static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf);
-#ifdef TIPC_CRYPTO_DEBUG
static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
char *buf);
-#endif
+static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey,
+ u16 gen, u8 mode, u32 dnode);
+static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr);
+static void tipc_crypto_work_tx(struct work_struct *work);
+static void tipc_crypto_work_rx(struct work_struct *work);
+static int tipc_aead_key_generate(struct tipc_aead_key *skey);
+
+#define is_tx(crypto) (!(crypto)->node)
+#define is_rx(crypto) (!is_tx(crypto))
#define key_next(cur) ((cur) % KEY_MAX + 1)
@@ -271,30 +325,55 @@ do { \
/**
* tipc_aead_key_validate - Validate a AEAD user key
*/
-int tipc_aead_key_validate(struct tipc_aead_key *ukey)
+int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info)
{
int keylen;
/* Check if algorithm exists */
if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) {
- pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name);
+ GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)");
return -ENODEV;
}
/* Currently, we only support the "gcm(aes)" cipher algorithm */
- if (strcmp(ukey->alg_name, "gcm(aes)"))
+ if (strcmp(ukey->alg_name, "gcm(aes)")) {
+ GENL_SET_ERR_MSG(info, "not supported yet the algorithm");
return -ENOTSUPP;
+ }
/* Check if key size is correct */
keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE;
if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 &&
keylen != TIPC_AES_GCM_KEY_SIZE_192 &&
- keylen != TIPC_AES_GCM_KEY_SIZE_256))
- return -EINVAL;
+ keylen != TIPC_AES_GCM_KEY_SIZE_256)) {
+ GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)");
+ return -EKEYREJECTED;
+ }
return 0;
}
+/**
+ * tipc_aead_key_generate - Generate new session key
+ * @skey: input/output key with new content
+ *
+ * Return: 0 in case of success, otherwise < 0
+ */
+static int tipc_aead_key_generate(struct tipc_aead_key *skey)
+{
+ int rc = 0;
+
+ /* Fill the key's content with a random value via RNG cipher */
+ rc = crypto_get_default_rng();
+ if (likely(!rc)) {
+ rc = crypto_rng_get_bytes(crypto_default_rng, skey->key,
+ skey->keylen);
+ crypto_put_default_rng();
+ }
+
+ return rc;
+}
+
static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead)
{
struct tipc_aead *tmp;
@@ -339,6 +418,7 @@ static void tipc_aead_free(struct rcu_head *rp)
kfree(head);
}
free_percpu(aead->tfm_entry);
+ kzfree(aead->key);
kfree(aead);
}
@@ -501,14 +581,15 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
return err;
}
- /* Copy some chars from the user key as a hint */
- memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN);
- tmp->hint[TIPC_AEAD_HINT_LEN] = '\0';
+ /* Form a hex string of some last bytes as the key's hint */
+ bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN,
+ TIPC_AEAD_HINT_LEN);
/* Initialize the other data */
tmp->mode = mode;
tmp->cloned = NULL;
tmp->authsize = TIPC_AES_GCM_TAG_SIZE;
+ tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL);
memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE);
atomic_set(&tmp->users, 0);
atomic64_set(&tmp->seqno, 0);
@@ -663,13 +744,11 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
* but there is no frag_list, it should be still fine!
* Otherwise, we must cow it to be a writable buffer with the tailroom.
*/
-#ifdef TIPC_CRYPTO_DEBUG
SKB_LINEAR_ASSERT(skb);
if (tailen > skb_tailroom(skb)) {
- pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n",
- skb_tailroom(skb), tailen);
+ pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n",
+ skb_tailroom(skb), tailen);
}
-#endif
if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) {
nsg = 1;
@@ -940,8 +1019,6 @@ bool tipc_ehdr_validate(struct sk_buff *skb)
return false;
if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE))
return false;
- if (unlikely(!ehdr->tx_key))
- return false;
return true;
}
@@ -994,6 +1071,8 @@ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead,
ehdr->tx_key = tx_key;
ehdr->destined = (__rx) ? 1 : 0;
ehdr->rx_key_active = (__rx) ? __rx->key.active : 0;
+ ehdr->rx_nokey = (__rx) ? __rx->nokey : 0;
+ ehdr->master_key = aead->crypto->key_master;
ehdr->reserved_1 = 0;
ehdr->reserved_2 = 0;
@@ -1019,23 +1098,16 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
u8 new_active,
u8 new_pending)
{
-#ifdef TIPC_CRYPTO_DEBUG
struct tipc_key old = c->key;
char buf[32];
-#endif
c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) |
((new_active & KEY_MASK) << (KEY_BITS)) |
((new_pending & KEY_MASK));
-#ifdef TIPC_CRYPTO_DEBUG
- pr_info("%s(%s): key changing %s ::%pS\n",
- (c->node) ? "RX" : "TX",
- (c->node) ? tipc_node_get_id_str(c->node) :
- tipc_own_id_string(c->net),
- tipc_key_change_dump(old, c->key, buf),
- __builtin_return_address(0));
-#endif
+ pr_debug("%s: key changing %s ::%pS\n", c->name,
+ tipc_key_change_dump(old, c->key, buf),
+ __builtin_return_address(0));
}
/**
@@ -1043,6 +1115,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
* @c: TIPC crypto to which new key is attached
* @ukey: the user key
* @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY)
+ * @master_key: specify this is a cluster master key
*
* A new TIPC AEAD key will be allocated and initiated with the specified user
* key, then attached to the TIPC crypto.
@@ -1050,7 +1123,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
* Return: new key id in case of success, otherwise: < 0
*/
int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
- u8 mode)
+ u8 mode, bool master_key)
{
struct tipc_aead *aead = NULL;
int rc = 0;
@@ -1060,17 +1133,11 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
/* Attach it to the crypto */
if (likely(!rc)) {
- rc = tipc_crypto_key_attach(c, aead, 0);
+ rc = tipc_crypto_key_attach(c, aead, 0, master_key);
if (rc < 0)
tipc_aead_free(&aead->rcu);
}
- pr_info("%s(%s): key initiating, rc %d!\n",
- (c->node) ? "RX" : "TX",
- (c->node) ? tipc_node_get_id_str(c->node) :
- tipc_own_id_string(c->net),
- rc);
-
return rc;
}
@@ -1079,58 +1146,58 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
* @c: TIPC crypto to which the new AEAD key is attached
* @aead: the new AEAD key pointer
* @pos: desired slot in the crypto key array, = 0 if any!
+ * @master_key: specify this is a cluster master key
*
* Return: new key id in case of success, otherwise: -EBUSY
*/
static int tipc_crypto_key_attach(struct tipc_crypto *c,
- struct tipc_aead *aead, u8 pos)
+ struct tipc_aead *aead, u8 pos,
+ bool master_key)
{
- u8 new_pending, new_passive, new_key;
struct tipc_key key;
int rc = -EBUSY;
+ u8 new_key;
spin_lock_bh(&c->lock);
key = c->key;
+ if (master_key) {
+ new_key = KEY_MASTER;
+ goto attach;
+ }
if (key.active && key.passive)
goto exit;
- if (key.passive && !tipc_aead_users(c->aead[key.passive]))
- goto exit;
if (key.pending) {
- if (pos)
- goto exit;
if (tipc_aead_users(c->aead[key.pending]) > 0)
goto exit;
+ /* if (pos): ok with replacing, will be aligned when needed */
/* Replace it */
- new_pending = key.pending;
- new_passive = key.passive;
- new_key = new_pending;
+ new_key = key.pending;
} else {
if (pos) {
if (key.active && pos != key_next(key.active)) {
- new_pending = key.pending;
- new_passive = pos;
- new_key = new_passive;
+ key.passive = pos;
+ new_key = pos;
goto attach;
} else if (!key.active && !key.passive) {
- new_pending = pos;
- new_passive = key.passive;
- new_key = new_pending;
+ key.pending = pos;
+ new_key = pos;
goto attach;
}
}
- new_pending = key_next(key.active ?: key.passive);
- new_passive = key.passive;
- new_key = new_pending;
+ key.pending = key_next(key.active ?: key.passive);
+ new_key = key.pending;
}
attach:
aead->crypto = c;
- tipc_crypto_key_set_state(c, new_passive, key.active, new_pending);
+ aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen;
tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock);
-
+ if (likely(c->key.keys != key.keys))
+ tipc_crypto_key_set_state(c, key.passive, key.active,
+ key.pending);
c->working = 1;
- c->timer1 = jiffies;
- c->timer2 = jiffies;
+ c->nokey = 0;
+ c->key_master |= master_key;
rc = new_key;
exit:
@@ -1140,14 +1207,33 @@ exit:
void tipc_crypto_key_flush(struct tipc_crypto *c)
{
+ struct tipc_crypto *tx, *rx;
int k;
spin_lock_bh(&c->lock);
- c->working = 0;
+ if (is_rx(c)) {
+ /* Try to cancel pending work */
+ rx = c;
+ tx = tipc_net(rx->net)->crypto_tx;
+ if (cancel_delayed_work(&rx->work)) {
+ kfree(rx->skey);
+ rx->skey = NULL;
+ atomic_xchg(&rx->key_distr, 0);
+ tipc_node_put(rx->node);
+ }
+ /* RX stopping => decrease TX key users if any */
+ k = atomic_xchg(&rx->peer_rx_active, 0);
+ if (k) {
+ tipc_aead_users_dec(tx->aead[k], 0);
+ /* Mark the point TX key users changed */
+ tx->timer1 = jiffies;
+ }
+ }
+
+ c->flags = 0;
tipc_crypto_key_set_state(c, 0, 0, 0);
for (k = KEY_MIN; k <= KEY_MAX; k++)
tipc_crypto_key_detach(c->aead[k], &c->lock);
- atomic_set(&c->peer_rx_active, 0);
atomic64_set(&c->sndnxt, 0);
spin_unlock_bh(&c->lock);
}
@@ -1206,7 +1292,8 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending)
rcu_assign_pointer(rx->aead[new_passive], tmp2);
refcount_set(&tmp1->refcnt, 1);
aligned = true;
- pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node));
+ pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending,
+ new_pending);
exit:
spin_unlock(&rx->lock);
@@ -1218,6 +1305,7 @@ exit:
* @tx: TX crypto handle
* @rx: RX crypto handle (can be NULL)
* @skb: the message skb which will be decrypted later
+ * @tx_key: peer TX key id
*
* This function looks up the existing TX keys and pick one which is suitable
* for the message decryption, that must be a cluster key and not used before
@@ -1227,7 +1315,8 @@ exit:
*/
static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
struct tipc_crypto *rx,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u8 tx_key)
{
struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb);
struct tipc_aead *aead = NULL;
@@ -1246,6 +1335,10 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
/* Pick one TX key */
spin_lock(&tx->lock);
+ if (tx_key == KEY_MASTER) {
+ aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock);
+ goto done;
+ }
do {
k = (i == 0) ? key.pending :
((i == 1) ? key.active : key.passive);
@@ -1265,9 +1358,12 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
skb->next = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!skb->next))
pr_warn("Failed to clone skb for next round if any\n");
- WARN_ON(!refcount_inc_not_zero(&aead->refcnt));
break;
} while (++i < 3);
+
+done:
+ if (likely(aead))
+ WARN_ON(!refcount_inc_not_zero(&aead->refcnt));
spin_unlock(&tx->lock);
return aead;
@@ -1276,53 +1372,73 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
/**
* tipc_crypto_key_synch: Synch own key data according to peer key status
* @rx: RX crypto handle
- * @new_rx_active: latest RX active key from peer
- * @hdr: TIPCv2 message
+ * @skb: TIPCv2 message buffer (incl. the ehdr from peer)
*
* This function updates the peer node related data as the peer RX active key
* has changed, so the number of TX keys' users on this node are increased and
* decreased correspondingly.
*
+ * It also considers if peer has no key, then we need to make own master key
+ * (if any) taking over i.e. starting grace period and also trigger key
+ * distributing process.
+ *
* The "per-peer" sndnxt is also reset when the peer key has switched.
*/
-static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active,
- struct tipc_msg *hdr)
+static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb)
{
- struct net *net = rx->net;
- struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
- u8 cur_rx_active;
+ struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb);
+ struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
+ struct tipc_msg *hdr = buf_msg(skb);
+ u32 self = tipc_own_addr(rx->net);
+ u8 cur, new;
+ unsigned long delay;
- /* TX might be even not ready yet */
- if (unlikely(!tx->key.active && !tx->key.pending))
- return;
+ /* Update RX 'key_master' flag according to peer, also mark "legacy" if
+ * a peer has no master key.
+ */
+ rx->key_master = ehdr->master_key;
+ if (!rx->key_master)
+ tx->legacy_user = 1;
- cur_rx_active = atomic_read(&rx->peer_rx_active);
- if (likely(cur_rx_active == new_rx_active))
+ /* For later cases, apply only if message is destined to this node */
+ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self)
return;
- /* Make sure this message destined for this node */
- if (unlikely(msg_short(hdr) ||
- msg_destnode(hdr) != tipc_own_addr(net)))
- return;
+ /* Case 1: Peer has no keys, let's make master key take over */
+ if (ehdr->rx_nokey) {
+ /* Set or extend grace period */
+ tx->timer2 = jiffies;
+ /* Schedule key distributing for the peer if not yet */
+ if (tx->key.keys &&
+ !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) {
+ get_random_bytes(&delay, 2);
+ delay %= 5;
+ delay = msecs_to_jiffies(500 * ++delay);
+ if (queue_delayed_work(tx->wq, &rx->work, delay))
+ tipc_node_get(rx->node);
+ }
+ } else {
+ /* Cancel a pending key distributing if any */
+ atomic_xchg(&rx->key_distr, 0);
+ }
- /* Peer RX active key has changed, try to update owns' & TX users */
- if (atomic_cmpxchg(&rx->peer_rx_active,
- cur_rx_active,
- new_rx_active) == cur_rx_active) {
- if (new_rx_active)
- tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX);
- if (cur_rx_active)
- tipc_aead_users_dec(tx->aead[cur_rx_active], 0);
+ /* Case 2: Peer RX active key has changed, let's update own TX users */
+ cur = atomic_read(&rx->peer_rx_active);
+ new = ehdr->rx_key_active;
+ if (tx->key.keys &&
+ cur != new &&
+ atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) {
+ if (new)
+ tipc_aead_users_inc(tx->aead[new], INT_MAX);
+ if (cur)
+ tipc_aead_users_dec(tx->aead[cur], 0);
atomic64_set(&rx->sndnxt, 0);
/* Mark the point TX key users changed */
tx->timer1 = jiffies;
-#ifdef TIPC_CRYPTO_DEBUG
- pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n",
- tipc_own_id_string(net), cur_rx_active,
- new_rx_active, tipc_node_get_id_str(rx->node));
-#endif
+ pr_debug("%s: key users changed %d-- %d++, peer %s\n",
+ tx->name, cur, new, rx->name);
}
}
@@ -1340,7 +1456,7 @@ static int tipc_crypto_key_revoke(struct net *net, u8 tx_key)
tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
spin_unlock(&tx->lock);
- pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net));
+ pr_warn("%s: key is revoked\n", tx->name);
return -EKEYREVOKED;
}
@@ -1357,6 +1473,15 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
if (!c)
return -ENOMEM;
+ /* Allocate workqueue on TX */
+ if (!node) {
+ c->wq = alloc_ordered_workqueue("tipc_crypto", 0);
+ if (!c->wq) {
+ kfree(c);
+ return -ENOMEM;
+ }
+ }
+
/* Allocate statistic structure */
c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
if (!c->stats) {
@@ -1364,53 +1489,52 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
return -ENOMEM;
}
- c->working = 0;
+ c->flags = 0;
c->net = net;
c->node = node;
+ get_random_bytes(&c->key_gen, 2);
tipc_crypto_key_set_state(c, 0, 0, 0);
+ atomic_set(&c->key_distr, 0);
atomic_set(&c->peer_rx_active, 0);
atomic64_set(&c->sndnxt, 0);
c->timer1 = jiffies;
c->timer2 = jiffies;
+ c->rekeying_intv = TIPC_REKEYING_INTV_DEF;
spin_lock_init(&c->lock);
- *crypto = c;
+ scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX",
+ (is_rx(c)) ? tipc_node_get_id_str(c->node) :
+ tipc_own_id_string(c->net));
+
+ if (is_rx(c))
+ INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx);
+ else
+ INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx);
+ *crypto = c;
return 0;
}
void tipc_crypto_stop(struct tipc_crypto **crypto)
{
- struct tipc_crypto *c, *tx, *rx;
- bool is_rx;
+ struct tipc_crypto *c = *crypto;
u8 k;
- if (!*crypto)
+ if (!c)
return;
- rcu_read_lock();
- /* RX stopping? => decrease TX key users if any */
- is_rx = !!((*crypto)->node);
- if (is_rx) {
- rx = *crypto;
- tx = tipc_net(rx->net)->crypto_tx;
- k = atomic_read(&rx->peer_rx_active);
- if (k) {
- tipc_aead_users_dec(tx->aead[k], 0);
- /* Mark the point TX key users changed */
- tx->timer1 = jiffies;
- }
+ /* Flush any queued works & destroy wq */
+ if (is_tx(c)) {
+ c->rekeying_intv = 0;
+ cancel_delayed_work_sync(&c->work);
+ destroy_workqueue(c->wq);
}
/* Release AEAD keys */
- c = *crypto;
+ rcu_read_lock();
for (k = KEY_MIN; k <= KEY_MAX; k++)
tipc_aead_put(rcu_dereference(c->aead[k]));
rcu_read_unlock();
-
- pr_warn("%s(%s) has been purged, node left!\n",
- (is_rx) ? "RX" : "TX",
- (is_rx) ? tipc_node_get_id_str((*crypto)->node) :
- tipc_own_id_string((*crypto)->net));
+ pr_debug("%s: has been stopped\n", c->name);
/* Free this crypto statistics */
free_percpu(c->stats);
@@ -1424,106 +1548,91 @@ void tipc_crypto_timeout(struct tipc_crypto *rx)
struct tipc_net *tn = tipc_net(rx->net);
struct tipc_crypto *tx = tn->crypto_tx;
struct tipc_key key;
- u8 new_pending, new_passive;
int cmd;
- /* TX key activating:
- * The pending key (users > 0) -> active
- * The active key if any (users == 0) -> free
- */
+ /* TX pending: taking all users & stable -> active */
spin_lock(&tx->lock);
key = tx->key;
if (key.active && tipc_aead_users(tx->aead[key.active]) > 0)
goto s1;
if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0)
goto s1;
- if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM))
+ if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME))
goto s1;
tipc_crypto_key_set_state(tx, key.passive, key.pending, 0);
if (key.active)
tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
this_cpu_inc(tx->stats->stat[STAT_SWITCHES]);
- pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net),
- key.pending);
+ pr_info("%s: key[%d] is activated\n", tx->name, key.pending);
s1:
spin_unlock(&tx->lock);
- /* RX key activating:
- * The pending key (users > 0) -> active
- * The active key if any -> passive, freed later
- */
+ /* RX pending: having user -> active */
spin_lock(&rx->lock);
key = rx->key;
if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0)
goto s2;
- new_pending = (key.passive &&
- !tipc_aead_users(rx->aead[key.passive])) ?
- key.passive : 0;
- new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive);
- tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending);
+ if (key.active)
+ key.passive = key.active;
+ key.active = key.pending;
+ rx->timer2 = jiffies;
+ tipc_crypto_key_set_state(rx, key.passive, key.active, 0);
this_cpu_inc(rx->stats->stat[STAT_SWITCHES]);
- pr_info("RX(%s): key %d is activated!\n",
- tipc_node_get_id_str(rx->node), key.pending);
+ pr_info("%s: key[%d] is activated\n", rx->name, key.pending);
goto s5;
s2:
- /* RX key "faulty" switching:
- * The faulty pending key (users < -30) -> passive
- * The passive key (users = 0) -> pending
- * Note: This only happens after RX deactivated - s3!
- */
- key = rx->key;
- if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30)
- goto s3;
- if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0)
+ /* RX pending: not working -> remove */
+ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10)
goto s3;
- new_pending = key.passive;
- new_passive = key.pending;
- tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending);
+ tipc_crypto_key_set_state(rx, key.passive, key.active, 0);
+ tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock);
+ pr_debug("%s: key[%d] is removed\n", rx->name, key.pending);
goto s5;
s3:
- /* RX key deactivating:
- * The passive key if any -> pending
- * The active key -> passive (users = 0) / pending
- * The pending key if any -> passive (users = 0)
- */
- key = rx->key;
+ /* RX active: timed out or no user -> pending */
if (!key.active)
goto s4;
- if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM))
+ if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) &&
+ tipc_aead_users(rx->aead[key.active]) > 0)
goto s4;
- new_pending = (key.passive) ?: key.active;
- new_passive = (key.passive) ? key.active : key.pending;
- tipc_aead_users_set(rx->aead[new_pending], 0);
- if (new_passive)
- tipc_aead_users_set(rx->aead[new_passive], 0);
- tipc_crypto_key_set_state(rx, new_passive, 0, new_pending);
- pr_info("RX(%s): key %d is deactivated!\n",
- tipc_node_get_id_str(rx->node), key.active);
+ if (key.pending)
+ key.passive = key.active;
+ else
+ key.pending = key.active;
+ rx->timer2 = jiffies;
+ tipc_crypto_key_set_state(rx, key.passive, 0, key.pending);
+ tipc_aead_users_set(rx->aead[key.pending], 0);
+ pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active);
goto s5;
s4:
- /* RX key passive -> freed: */
- key = rx->key;
- if (!key.passive || !tipc_aead_users(rx->aead[key.passive]))
+ /* RX passive: outdated or not working -> free */
+ if (!key.passive)
goto s5;
- if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM))
+ if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) &&
+ tipc_aead_users(rx->aead[key.passive]) > -10)
goto s5;
tipc_crypto_key_set_state(rx, 0, key.active, key.pending);
tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock);
- pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node),
- key.passive);
+ pr_debug("%s: key[%d] is freed\n", rx->name, key.passive);
s5:
spin_unlock(&rx->lock);
+ /* Relax it here, the flag will be set again if it really is, but only
+ * when we are not in grace period for safety!
+ */
+ if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD))
+ tx->legacy_user = 0;
+
/* Limit max_tfms & do debug commands if needed */
if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM))
return;
@@ -1533,6 +1642,22 @@ s5:
tipc_crypto_do_cmd(rx->net, cmd);
}
+static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode, u8 type)
+{
+ struct sk_buff *skb;
+
+ skb = skb_clone(_skb, GFP_ATOMIC);
+ if (skb) {
+ TIPC_SKB_CB(skb)->xmit_type = type;
+ tipc_crypto_xmit(net, &skb, b, dst, __dnode);
+ if (skb)
+ b->media->send_msg(net, skb, b, dst);
+ }
+}
+
/**
* tipc_crypto_xmit - Build & encrypt TIPC message for xmit
* @net: struct net
@@ -1542,7 +1667,8 @@ s5:
* @__dnode: destination node for reference if any
*
* First, build an encryption message header on the top of the message, then
- * encrypt the original TIPC message by using the active or pending TX key.
+ * encrypt the original TIPC message by using the pending, master or active
+ * key with this preference order.
* If the encryption is successful, the encrypted skb is returned directly or
* via the callback.
* Otherwise, the skb is freed!
@@ -1562,46 +1688,67 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode);
struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
struct tipc_crypto_stats __percpu *stats = tx->stats;
+ struct tipc_msg *hdr = buf_msg(*skb);
struct tipc_key key = tx->key;
struct tipc_aead *aead = NULL;
- struct sk_buff *probe;
+ u32 user = msg_user(hdr);
+ u32 type = msg_type(hdr);
int rc = -ENOKEY;
- u8 tx_key;
+ u8 tx_key = 0;
/* No encryption? */
if (!tx->working)
return 0;
- /* Try with the pending key if available and:
- * 1) This is the only choice (i.e. no active key) or;
- * 2) Peer has switched to this key (unicast only) or;
- * 3) It is time to do a pending key probe;
- */
+ /* Pending key if peer has active on it or probing time */
if (unlikely(key.pending)) {
tx_key = key.pending;
- if (!key.active)
+ if (!tx->key_master && !key.active)
goto encrypt;
if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key)
goto encrypt;
- if (TIPC_SKB_CB(*skb)->probe)
+ if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) {
+ pr_debug("%s: probing for key[%d]\n", tx->name,
+ key.pending);
+ goto encrypt;
+ }
+ if (user == LINK_CONFIG || user == LINK_PROTOCOL)
+ tipc_crypto_clone_msg(net, *skb, b, dst, __dnode,
+ SKB_PROBING);
+ }
+
+ /* Master key if this is a *vital* message or in grace period */
+ if (tx->key_master) {
+ tx_key = KEY_MASTER;
+ if (!key.active)
+ goto encrypt;
+ if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) {
+ pr_debug("%s: gracing for msg (%d %d)\n", tx->name,
+ user, type);
goto encrypt;
- if (!__rx &&
- time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) {
- tx->timer2 = jiffies;
- probe = skb_clone(*skb, GFP_ATOMIC);
- if (probe) {
- TIPC_SKB_CB(probe)->probe = 1;
- tipc_crypto_xmit(net, &probe, b, dst, __dnode);
- if (probe)
- b->media->send_msg(net, probe, b, dst);
+ }
+ if (user == LINK_CONFIG ||
+ (user == LINK_PROTOCOL && type == RESET_MSG) ||
+ (user == MSG_CRYPTO && type == KEY_DISTR_MSG) ||
+ time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) {
+ if (__rx && __rx->key_master &&
+ !atomic_read(&__rx->peer_rx_active))
+ goto encrypt;
+ if (!__rx) {
+ if (likely(!tx->legacy_user))
+ goto encrypt;
+ tipc_crypto_clone_msg(net, *skb, b, dst,
+ __dnode, SKB_GRACING);
}
}
}
+
/* Else, use the active key if any */
if (likely(key.active)) {
tx_key = key.active;
goto encrypt;
}
+
goto exit;
encrypt:
@@ -1667,30 +1814,21 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
struct tipc_aead *aead = NULL;
struct tipc_key key;
int rc = -ENOKEY;
- u8 tx_key = 0;
+ u8 tx_key, n;
+
+ tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;
/* New peer?
* Let's try with TX key (i.e. cluster mode) & verify the skb first!
*/
- if (unlikely(!rx))
+ if (unlikely(!rx || tx_key == KEY_MASTER))
goto pick_tx;
- /* Pick RX key according to TX key, three cases are possible:
- * 1) The current active key (likely) or;
- * 2) The pending (new or deactivated) key (if any) or;
- * 3) The passive or old active key (i.e. users > 0);
- */
- tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;
+ /* Pick RX key according to TX key if any */
key = rx->key;
- if (likely(tx_key == key.active))
+ if (tx_key == key.active || tx_key == key.pending ||
+ tx_key == key.passive)
goto decrypt;
- if (tx_key == key.pending)
- goto decrypt;
- if (tx_key == key.passive) {
- rx->timer2 = jiffies;
- if (tipc_aead_users(rx->aead[key.passive]) > 0)
- goto decrypt;
- }
/* Unknown key, let's try to align RX key(s) */
if (tipc_crypto_key_try_align(rx, tx_key))
@@ -1698,7 +1836,7 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
pick_tx:
/* No key suitable? Try to pick one from TX... */
- aead = tipc_crypto_key_pick_tx(tx, rx, *skb);
+ aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key);
if (aead)
goto decrypt;
goto exit;
@@ -1726,8 +1864,19 @@ exit:
if (rc == -ENOKEY) {
kfree_skb(*skb);
*skb = NULL;
- if (rx)
+ if (rx) {
+ /* Mark rx->nokey only if we dont have a
+ * pending received session key, nor a newer
+ * one i.e. in the next slot.
+ */
+ n = key_next(tx_key);
+ rx->nokey = !(rx->skey ||
+ rcu_access_pointer(rx->aead[n]));
+ pr_debug_ratelimited("%s: nokey %d, key %d/%x\n",
+ rx->name, rx->nokey,
+ tx_key, rx->key.keys);
tipc_node_put(rx->node);
+ }
this_cpu_inc(stats->stat[STAT_NOKEYS]);
return rc;
} else if (rc == -EBADMSG) {
@@ -1749,21 +1898,17 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
struct tipc_aead *tmp = NULL;
struct tipc_ehdr *ehdr;
struct tipc_node *n;
- u8 rx_key_active;
- bool destined;
/* Is this completed by TX? */
- if (unlikely(!rx->node)) {
+ if (unlikely(is_tx(aead->crypto))) {
rx = skb_cb->tx_clone_ctx.rx;
-#ifdef TIPC_CRYPTO_DEBUG
- pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
- (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
- (*skb)->next, skb_cb->flags);
- pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
- skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
- aead->crypto->aead[1], aead->crypto->aead[2],
- aead->crypto->aead[3]);
-#endif
+ pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
+ (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
+ (*skb)->next, skb_cb->flags);
+ pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
+ skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
+ aead->crypto->aead[1], aead->crypto->aead[2],
+ aead->crypto->aead[3]);
if (unlikely(err)) {
if (err == -EBADMSG && (*skb)->next)
tipc_rcv(net, (*skb)->next, b);
@@ -1784,12 +1929,12 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
goto free_skb;
}
- /* Skip cloning this time as we had a RX pending key */
- if (rx->key.pending)
+ /* Ignore cloning if it was TX master key */
+ if (ehdr->tx_key == KEY_MASTER)
goto rcv;
if (tipc_aead_clone(&tmp, aead) < 0)
goto rcv;
- if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) {
+ if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) {
tipc_aead_free(&tmp->rcu);
goto rcv;
}
@@ -1805,14 +1950,18 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
/* Set the RX key's user */
tipc_aead_users_set(aead, 1);
-rcv:
/* Mark this point, RX works */
rx->timer1 = jiffies;
+rcv:
/* Remove ehdr & auth. tag prior to tipc_rcv() */
ehdr = (struct tipc_ehdr *)(*skb)->data;
- destined = ehdr->destined;
- rx_key_active = ehdr->rx_key_active;
+
+ /* Mark this point, RX passive still works */
+ if (rx->key.passive && ehdr->tx_key == rx->key.passive)
+ rx->timer2 = jiffies;
+
+ skb_reset_network_header(*skb);
skb_pull(*skb, tipc_ehdr_size(ehdr));
pskb_trim(*skb, (*skb)->len - aead->authsize);
@@ -1822,9 +1971,8 @@ rcv:
goto free_skb;
}
- /* Update peer RX active key & TX users */
- if (destined)
- tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb));
+ /* Ok, everything's fine, try to synch own keys according to peers' */
+ tipc_crypto_key_synch(rx, *skb);
/* Mark skb decrypted */
skb_cb->decrypted = 1;
@@ -1883,7 +2031,7 @@ print_stats:
/* Print crypto statistics */
for (i = 0, j = 0; i < MAX_STATS; i++)
j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]);
- pr_info("\nCounter %s", buf);
+ pr_info("Counter %s", buf);
memset(buf, '-', 115);
buf[115] = '\0';
@@ -1927,21 +2075,31 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf)
char *s;
for (k = KEY_MIN; k <= KEY_MAX; k++) {
- if (k == key.passive)
- s = "PAS";
- else if (k == key.active)
- s = "ACT";
- else if (k == key.pending)
- s = "PEN";
- else
- s = "-";
+ if (k == KEY_MASTER) {
+ if (is_rx(c))
+ continue;
+ if (time_before(jiffies,
+ c->timer2 + TIPC_TX_GRACE_PERIOD))
+ s = "ACT";
+ else
+ s = "PAS";
+ } else {
+ if (k == key.passive)
+ s = "PAS";
+ else if (k == key.active)
+ s = "ACT";
+ else if (k == key.pending)
+ s = "PEN";
+ else
+ s = "-";
+ }
i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s);
rcu_read_lock();
aead = rcu_dereference(c->aead[k]);
if (aead)
i += scnprintf(buf + i, 200 - i,
- "{\"%s...\", \"%s\"}/%d:%d",
+ "{\"0x...%s\", \"%s\"}/%d:%d",
aead->hint,
(aead->mode == CLUSTER_KEY) ? "c" : "p",
atomic_read(&aead->users),
@@ -1950,14 +2108,13 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf)
i += scnprintf(buf + i, 200 - i, "\n");
}
- if (c->node)
+ if (is_rx(c))
i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n",
atomic_read(&c->peer_rx_active));
return buf;
}
-#ifdef TIPC_CRYPTO_DEBUG
static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
char *buf)
{
@@ -1968,7 +2125,7 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
/* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */
again:
i += scnprintf(buf + i, 32 - i, "[");
- for (k = KEY_MIN; k <= KEY_MAX; k++) {
+ for (k = KEY_1; k <= KEY_3; k++) {
if (k == key->passive)
s = "pas";
else if (k == key->active)
@@ -1978,7 +2135,7 @@ again:
else
s = "-";
i += scnprintf(buf + i, 32 - i,
- (k != KEY_MAX) ? "%s " : "%s", s);
+ (k != KEY_3) ? "%s " : "%s", s);
}
if (key != &new) {
i += scnprintf(buf + i, 32 - i, "] -> ");
@@ -1988,4 +2145,320 @@ again:
i += scnprintf(buf + i, 32 - i, "]");
return buf;
}
-#endif
+
+/**
+ * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point
+ * @net: the struct net
+ * @skb: the receiving message buffer
+ */
+void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb)
+{
+ struct tipc_crypto *rx;
+ struct tipc_msg *hdr;
+
+ if (unlikely(skb_linearize(skb)))
+ goto exit;
+
+ hdr = buf_msg(skb);
+ rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr));
+ if (unlikely(!rx))
+ goto exit;
+
+ switch (msg_type(hdr)) {
+ case KEY_DISTR_MSG:
+ if (tipc_crypto_key_rcv(rx, hdr))
+ goto exit;
+ break;
+ default:
+ break;
+ }
+
+ tipc_node_put(rx->node);
+
+exit:
+ kfree_skb(skb);
+}
+
+/**
+ * tipc_crypto_key_distr - Distribute a TX key
+ * @tx: the TX crypto
+ * @key: the key's index
+ * @dest: the destination tipc node, = NULL if distributing to all nodes
+ *
+ * Return: 0 in case of success, otherwise < 0
+ */
+int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key,
+ struct tipc_node *dest)
+{
+ struct tipc_aead *aead;
+ u32 dnode = tipc_node_get_addr(dest);
+ int rc = -ENOKEY;
+
+ if (!sysctl_tipc_key_exchange_enabled)
+ return 0;
+
+ if (key) {
+ rcu_read_lock();
+ aead = tipc_aead_get(tx->aead[key]);
+ if (likely(aead)) {
+ rc = tipc_crypto_key_xmit(tx->net, aead->key,
+ aead->gen, aead->mode,
+ dnode);
+ tipc_aead_put(aead);
+ }
+ rcu_read_unlock();
+ }
+
+ return rc;
+}
+
+/**
+ * tipc_crypto_key_xmit - Send a session key
+ * @net: the struct net
+ * @skey: the session key to be sent
+ * @gen: the key's generation
+ * @mode: the key's mode
+ * @dnode: the destination node address, = 0 if broadcasting to all nodes
+ *
+ * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG'
+ * as its data section, then xmit-ed through the uc/bc link.
+ *
+ * Return: 0 in case of success, otherwise < 0
+ */
+static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey,
+ u16 gen, u8 mode, u32 dnode)
+{
+ struct sk_buff_head pkts;
+ struct tipc_msg *hdr;
+ struct sk_buff *skb;
+ u16 size, cong_link_cnt;
+ u8 *data;
+ int rc;
+
+ size = tipc_aead_key_size(skey);
+ skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = buf_msg(skb);
+ tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG,
+ INT_H_SIZE, dnode);
+ msg_set_size(hdr, INT_H_SIZE + size);
+ msg_set_key_gen(hdr, gen);
+ msg_set_key_mode(hdr, mode);
+
+ data = msg_data(hdr);
+ *((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen);
+ memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME);
+ memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key,
+ skey->keylen);
+
+ __skb_queue_head_init(&pkts);
+ __skb_queue_tail(&pkts, skb);
+ if (dnode)
+ rc = tipc_node_xmit(net, &pkts, dnode, 0);
+ else
+ rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt);
+
+ return rc;
+}
+
+/**
+ * tipc_crypto_key_rcv - Receive a session key
+ * @rx: the RX crypto
+ * @hdr: the TIPC v2 message incl. the receiving session key in its data
+ *
+ * This function retrieves the session key in the message from peer, then
+ * schedules a RX work to attach the key to the corresponding RX crypto.
+ *
+ * Return: "true" if the key has been scheduled for attaching, otherwise
+ * "false".
+ */
+static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr)
+{
+ struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
+ struct tipc_aead_key *skey = NULL;
+ u16 key_gen = msg_key_gen(hdr);
+ u16 size = msg_data_sz(hdr);
+ u8 *data = msg_data(hdr);
+
+ spin_lock(&rx->lock);
+ if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) {
+ pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name,
+ rx->skey, key_gen, rx->key_gen);
+ goto exit;
+ }
+
+ /* Allocate memory for the key */
+ skey = kmalloc(size, GFP_ATOMIC);
+ if (unlikely(!skey)) {
+ pr_err("%s: unable to allocate memory for skey\n", rx->name);
+ goto exit;
+ }
+
+ /* Copy key from msg data */
+ skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));
+ memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME);
+ memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32),
+ skey->keylen);
+
+ /* Sanity check */
+ if (unlikely(size != tipc_aead_key_size(skey))) {
+ kfree(skey);
+ skey = NULL;
+ goto exit;
+ }
+
+ rx->key_gen = key_gen;
+ rx->skey_mode = msg_key_mode(hdr);
+ rx->skey = skey;
+ rx->nokey = 0;
+ mb(); /* for nokey flag */
+
+exit:
+ spin_unlock(&rx->lock);
+
+ /* Schedule the key attaching on this crypto */
+ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0)))
+ return true;
+
+ return false;
+}
+
+/**
+ * tipc_crypto_work_rx - Scheduled RX works handler
+ * @work: the struct RX work
+ *
+ * The function processes the previous scheduled works i.e. distributing TX key
+ * or attaching a received session key on RX crypto.
+ */
+static void tipc_crypto_work_rx(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work);
+ struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
+ unsigned long delay = msecs_to_jiffies(5000);
+ bool resched = false;
+ u8 key;
+ int rc;
+
+ /* Case 1: Distribute TX key to peer if scheduled */
+ if (atomic_cmpxchg(&rx->key_distr,
+ KEY_DISTR_SCHED,
+ KEY_DISTR_COMPL) == KEY_DISTR_SCHED) {
+ /* Always pick the newest one for distributing */
+ key = tx->key.pending ?: tx->key.active;
+ rc = tipc_crypto_key_distr(tx, key, rx->node);
+ if (unlikely(rc))
+ pr_warn("%s: unable to distr key[%d] to %s, err %d\n",
+ tx->name, key, tipc_node_get_id_str(rx->node),
+ rc);
+
+ /* Sched for key_distr releasing */
+ resched = true;
+ } else {
+ atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0);
+ }
+
+ /* Case 2: Attach a pending received session key from peer if any */
+ if (rx->skey) {
+ rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false);
+ if (unlikely(rc < 0))
+ pr_warn("%s: unable to attach received skey, err %d\n",
+ rx->name, rc);
+ switch (rc) {
+ case -EBUSY:
+ case -ENOMEM:
+ /* Resched the key attaching */
+ resched = true;
+ break;
+ default:
+ synchronize_rcu();
+ kfree(rx->skey);
+ rx->skey = NULL;
+ break;
+ }
+ }
+
+ if (resched && queue_delayed_work(tx->wq, &rx->work, delay))
+ return;
+
+ tipc_node_put(rx->node);
+}
+
+/**
+ * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval
+ * @tx: TX crypto
+ * @changed: if the rekeying needs to be rescheduled with new interval
+ * @new_intv: new rekeying interval (when "changed" = true)
+ */
+void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed,
+ u32 new_intv)
+{
+ unsigned long delay;
+ bool now = false;
+
+ if (changed) {
+ if (new_intv == TIPC_REKEYING_NOW)
+ now = true;
+ else
+ tx->rekeying_intv = new_intv;
+ cancel_delayed_work_sync(&tx->work);
+ }
+
+ if (tx->rekeying_intv || now) {
+ delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000;
+ queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay));
+ }
+}
+
+/**
+ * tipc_crypto_work_tx - Scheduled TX works handler
+ * @work: the struct TX work
+ *
+ * The function processes the previous scheduled work, i.e. key rekeying, by
+ * generating a new session key based on current one, then attaching it to the
+ * TX crypto and finally distributing it to peers. It also re-schedules the
+ * rekeying if needed.
+ */
+static void tipc_crypto_work_tx(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work);
+ struct tipc_aead_key *skey = NULL;
+ struct tipc_key key = tx->key;
+ struct tipc_aead *aead;
+ int rc = -ENOMEM;
+
+ if (unlikely(key.pending))
+ goto resched;
+
+ /* Take current key as a template */
+ rcu_read_lock();
+ aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]);
+ if (unlikely(!aead)) {
+ rcu_read_unlock();
+ /* At least one key should exist for securing */
+ return;
+ }
+
+ /* Lets duplicate it first */
+ skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC);
+ rcu_read_unlock();
+
+ /* Now, generate new key, initiate & distribute it */
+ if (likely(skey)) {
+ rc = tipc_aead_key_generate(skey) ?:
+ tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false);
+ if (likely(rc > 0))
+ rc = tipc_crypto_key_distr(tx, rc, NULL);
+ kzfree(skey);
+ }
+
+ if (unlikely(rc))
+ pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc);
+
+resched:
+ /* Re-schedule rekeying if any */
+ tipc_crypto_rekeying_sched(tx, false, 0);
+}
diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h
index c3de769f49e8..e71193bd5e36 100644
--- a/net/tipc/crypto.h
+++ b/net/tipc/crypto.h
@@ -67,6 +67,7 @@ enum {
};
extern int sysctl_tipc_max_tfms __read_mostly;
+extern int sysctl_tipc_key_exchange_enabled __read_mostly;
/**
* TIPC encryption message format:
@@ -74,7 +75,7 @@ extern int sysctl_tipc_max_tfms __read_mostly;
* 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
* 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * w0:|Ver=7| User |D|TX |RX |K| Rsvd |
+ * w0:|Ver=7| User |D|TX |RX |K|M|N| Rsvd |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* w1:| Seqno |
* w2:| (8 octets) |
@@ -101,6 +102,9 @@ extern int sysctl_tipc_max_tfms __read_mostly;
* RX : Currently RX active key corresponding to the destination
* node's TX key (when the "D" bit is set)
* K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only)
+ * M : Bit indicates if sender has master key
+ * N : Bit indicates if sender has no RX keys corresponding to the
+ * receiver's TX (when the "D" bit is set)
* Rsvd : Reserved bit, field
* Word1-2:
* Seqno : The 64-bit sequence number of the encrypted message, also
@@ -117,7 +121,9 @@ struct tipc_ehdr {
__u8 destined:1,
user:4,
version:3;
- __u8 reserved_1:3,
+ __u8 reserved_1:1,
+ rx_nokey:1,
+ master_key:1,
keepalive:1,
rx_key_active:2,
tx_key:2;
@@ -128,7 +134,9 @@ struct tipc_ehdr {
__u8 tx_key:2,
rx_key_active:2,
keepalive:1,
- reserved_1:3;
+ master_key:1,
+ rx_nokey:1,
+ reserved_1:1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
@@ -158,10 +166,35 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
struct sk_buff **skb, struct tipc_bearer *b);
int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
- u8 mode);
+ u8 mode, bool master_key);
void tipc_crypto_key_flush(struct tipc_crypto *c);
-int tipc_aead_key_validate(struct tipc_aead_key *ukey);
+int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key,
+ struct tipc_node *dest);
+void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb);
+void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed,
+ u32 new_intv);
+int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info);
bool tipc_ehdr_validate(struct sk_buff *skb);
+static inline u32 msg_key_gen(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_key_gen(struct tipc_msg *m, u32 gen)
+{
+ msg_set_bits(m, 4, 16, 0xffff, gen);
+}
+
+static inline u32 msg_key_mode(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 0, 0xf);
+}
+
+static inline void msg_set_key_mode(struct tipc_msg *m, u32 mode)
+{
+ msg_set_bits(m, 4, 0, 0xf, mode);
+}
+
#endif /* _TIPC_CRYPTO_H */
#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index cef38a910107..06b880da2a8e 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -216,11 +216,6 @@ enum {
#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10))
#define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1))
-/*
- * Interval between NACKs when packets arrive out of order
- */
-#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-
/* Link FSM states:
*/
enum {
@@ -1256,6 +1251,11 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
case MSG_FRAGMENTER:
case BCAST_PROTOCOL:
return false;
+#ifdef CONFIG_TIPC_CRYPTO
+ case MSG_CRYPTO:
+ tipc_crypto_msg_rcv(l->net, skb);
+ return true;
+#endif
default:
pr_warn("Dropping received illegal msg type\n");
kfree_skb(skb);
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 52e93ba4d8e2..11a429f4c6cd 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -582,7 +582,7 @@ bundle:
* @pos: position in outer message of msg to be extracted.
* Returns position of next msg
* Consumes outer buffer when last packet extracted
- * Returns true when when there is an extracted buffer, otherwise false
+ * Returns true when there is an extracted buffer, otherwise false
*/
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
{
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 1016e96db5c4..5d64596ba987 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -82,6 +82,7 @@ struct plist;
#define NAME_DISTRIBUTOR 11
#define MSG_FRAGMENTER 12
#define LINK_CONFIG 13
+#define MSG_CRYPTO 14
#define SOCK_WAKEUP 14 /* pseudo user */
#define TOP_SRV 15 /* pseudo user */
@@ -127,7 +128,9 @@ struct tipc_skb_cb {
#ifdef CONFIG_TIPC_CRYPTO
u8 encrypted:1;
u8 decrypted:1;
- u8 probe:1;
+#define SKB_PROBING 1
+#define SKB_GRACING 2
+ u8 xmit_type:2;
u8 tx_clone_deferred:1;
#endif
};
@@ -747,6 +750,9 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
#define GRP_RECLAIM_MSG 4
#define GRP_REMIT_MSG 5
+/* Crypto message types */
+#define KEY_DISTR_MSG 0
+
/*
* Word 1
*/
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 85400e4242de..0bb2323201da 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -105,12 +105,6 @@
* - A local spin_lock protecting the queue of subscriber events.
*/
-struct tipc_net_work {
- struct work_struct work;
- struct net *net;
- u32 addr;
-};
-
static void tipc_net_finalize(struct net *net, u32 addr);
int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
@@ -142,25 +136,21 @@ static void tipc_net_finalize(struct net *net, u32 addr)
TIPC_CLUSTER_SCOPE, 0, addr);
}
-static void tipc_net_finalize_work(struct work_struct *work)
+void tipc_net_finalize_work(struct work_struct *work)
{
struct tipc_net_work *fwork;
fwork = container_of(work, struct tipc_net_work, work);
tipc_net_finalize(fwork->net, fwork->addr);
- kfree(fwork);
}
void tipc_sched_net_finalize(struct net *net, u32 addr)
{
- struct tipc_net_work *fwork = kzalloc(sizeof(*fwork), GFP_ATOMIC);
+ struct tipc_net *tn = tipc_net(net);
- if (!fwork)
- return;
- INIT_WORK(&fwork->work, tipc_net_finalize_work);
- fwork->net = net;
- fwork->addr = addr;
- schedule_work(&fwork->work);
+ tn->final_work.net = net;
+ tn->final_work.addr = addr;
+ schedule_work(&tn->final_work.work);
}
void tipc_net_stop(struct net *net)
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 6740d97c706e..d0c91d2df20a 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -42,6 +42,7 @@
extern const struct nla_policy tipc_nl_net_policy[];
int tipc_net_init(struct net *net, u8 *node_id, u32 addr);
+void tipc_net_finalize_work(struct work_struct *work);
void tipc_sched_net_finalize(struct net *net, u32 addr);
void tipc_net_stop(struct net *net);
int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index c4aee6247d55..c447cb5f879e 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -108,6 +108,8 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
.len = TIPC_NODEID_LEN},
[TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY,
.len = TIPC_AEAD_KEY_SIZE_MAX},
+ [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG },
+ [TIPC_NLA_NODE_REKEYING] = { .type = NLA_U32 },
};
/* Properties valid for media, bearer and link */
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 4edcee3088da..cf4b239fc569 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -278,6 +278,14 @@ struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos)
{
return container_of(pos, struct tipc_node, list)->crypto_rx;
}
+
+struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr)
+{
+ struct tipc_node *n;
+
+ n = tipc_node_find(net, addr);
+ return (n) ? n->crypto_rx : NULL;
+}
#endif
static void tipc_node_free(struct rcu_head *rp)
@@ -303,7 +311,7 @@ void tipc_node_put(struct tipc_node *node)
kref_put(&node->kref, tipc_node_kref_release);
}
-static void tipc_node_get(struct tipc_node *node)
+void tipc_node_get(struct tipc_node *node)
{
kref_get(&node->kref);
}
@@ -584,6 +592,9 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
static void tipc_node_delete_from_list(struct tipc_node *node)
{
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_key_flush(node->crypto_rx);
+#endif
list_del_rcu(&node->list);
hlist_del_rcu(&node->hash);
tipc_node_put(node);
@@ -2868,15 +2879,27 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id)
return 0;
}
+static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv)
+{
+ struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING];
+
+ if (!attr)
+ return -ENODATA;
+
+ *intv = nla_get_u32(attr);
+ return 0;
+}
+
static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1];
struct net *net = sock_net(skb->sk);
- struct tipc_net *tn = tipc_net(net);
+ struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx;
struct tipc_node *n = NULL;
struct tipc_aead_key *ukey;
- struct tipc_crypto *c;
- u8 *id, *own_id;
+ bool rekeying = true, master_key = false;
+ u8 *id, *own_id, mode;
+ u32 intv = 0;
int rc = 0;
if (!info->attrs[TIPC_NLA_NODE])
@@ -2886,52 +2909,66 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
info->attrs[TIPC_NLA_NODE],
tipc_nl_node_policy, info->extack);
if (rc)
- goto exit;
+ return rc;
own_id = tipc_own_id(net);
if (!own_id) {
- rc = -EPERM;
- goto exit;
+ GENL_SET_ERR_MSG(info, "not found own node identity (set id?)");
+ return -EPERM;
}
+ rc = tipc_nl_retrieve_rekeying(attrs, &intv);
+ if (rc == -ENODATA)
+ rekeying = false;
+
rc = tipc_nl_retrieve_key(attrs, &ukey);
- if (rc)
- goto exit;
+ if (rc == -ENODATA && rekeying)
+ goto rekeying;
+ else if (rc)
+ return rc;
- rc = tipc_aead_key_validate(ukey);
+ rc = tipc_aead_key_validate(ukey, info);
if (rc)
- goto exit;
+ return rc;
rc = tipc_nl_retrieve_nodeid(attrs, &id);
switch (rc) {
case -ENODATA:
- /* Cluster key mode */
- rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY);
+ mode = CLUSTER_KEY;
+ master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]);
break;
case 0:
- /* Per-node key mode */
- if (!memcmp(id, own_id, NODE_ID_LEN)) {
- c = tn->crypto_tx;
- } else {
+ mode = PER_NODE_KEY;
+ if (memcmp(id, own_id, NODE_ID_LEN)) {
n = tipc_node_find_by_id(net, id) ?:
tipc_node_create(net, 0, id, 0xffffu, 0, true);
- if (unlikely(!n)) {
- rc = -ENOMEM;
- break;
- }
+ if (unlikely(!n))
+ return -ENOMEM;
c = n->crypto_rx;
}
-
- rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY);
- if (n)
- tipc_node_put(n);
break;
default:
- break;
+ return rc;
}
-exit:
- return (rc < 0) ? rc : 0;
+ /* Initiate the TX/RX key */
+ rc = tipc_crypto_key_init(c, ukey, mode, master_key);
+ if (n)
+ tipc_node_put(n);
+
+ if (unlikely(rc < 0)) {
+ GENL_SET_ERR_MSG(info, "unable to initiate or attach new key");
+ return rc;
+ } else if (c == tx) {
+ /* Distribute TX key but not master one */
+ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL))
+ GENL_SET_ERR_MSG(info, "failed to replicate new key");
+rekeying:
+ /* Schedule TX rekeying if needed */
+ tipc_crypto_rekeying_sched(tx, rekeying, intv);
+ }
+
+ return 0;
}
int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
@@ -2958,7 +2995,6 @@ static int __tipc_nl_node_flush_key(struct sk_buff *skb,
tipc_crypto_key_flush(n->crypto_rx);
rcu_read_unlock();
- pr_info("All keys are flushed!\n");
return 0;
}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 9f6f13f1604f..154a5bbb0d29 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -79,12 +79,14 @@ bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
u32 tipc_node_get_addr(struct tipc_node *node);
char *tipc_node_get_id_str(struct tipc_node *node);
void tipc_node_put(struct tipc_node *node);
+void tipc_node_get(struct tipc_node *node);
struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
u16 capabilities, u32 hash_mixes,
bool preliminary);
#ifdef CONFIG_TIPC_CRYPTO
struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n);
struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos);
+struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr);
#endif
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 11b27ddc75ba..e795a8a2955b 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -52,10 +52,9 @@
#define NAGLE_START_MAX 1024
#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */
-#define TIPC_FWD_MSG 1
#define TIPC_MAX_PORT 0xffffffff
#define TIPC_MIN_PORT 1
-#define TIPC_ACK_RATE 4 /* ACK at 1/4 of of rcv window size */
+#define TIPC_ACK_RATE 4 /* ACK at 1/4 of rcv window size */
enum {
TIPC_LISTEN = TCP_LISTEN,
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 97a6264a2993..9fb65c988f7f 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -74,6 +74,15 @@ static struct ctl_table tipc_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "key_exchange_enabled",
+ .data = &sysctl_tipc_key_exchange_enabled,
+ .maxlen = sizeof(sysctl_tipc_key_exchange_enabled),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
#endif
{
.procname = "bc_retruni",
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 1489cfb941d8..5f6f86051c83 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -48,7 +48,6 @@
#define MAX_SEND_MSG_COUNT 25
#define MAX_RECV_MSG_COUNT 25
#define CF_CONNECTED 1
-#define CF_SERVER 2
#define TIPC_SERVER_NAME_LEN 32
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 911d13cd2e67..1d17f4470ee2 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -52,6 +52,7 @@
#include "bearer.h"
#include "netlink.h"
#include "msg.h"
+#include "udp_media.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index bbc52b088d29..002b0859fed5 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -330,12 +330,13 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
tls_ctx_free(sk, ctx);
}
-static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
- int __user *optlen)
+static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval,
+ int __user *optlen, int tx)
{
int rc = 0;
struct tls_context *ctx = tls_get_ctx(sk);
struct tls_crypto_info *crypto_info;
+ struct cipher_context *cctx;
int len;
if (get_user(len, optlen))
@@ -352,7 +353,13 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
/* get user crypto info */
- crypto_info = &ctx->crypto_send.info;
+ if (tx) {
+ crypto_info = &ctx->crypto_send.info;
+ cctx = &ctx->tx;
+ } else {
+ crypto_info = &ctx->crypto_recv.info;
+ cctx = &ctx->rx;
+ }
if (!TLS_CRYPTO_INFO_READY(crypto_info)) {
rc = -EBUSY;
@@ -379,9 +386,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_128->iv,
- ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+ cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
TLS_CIPHER_AES_GCM_128_IV_SIZE);
- memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
+ memcpy(crypto_info_aes_gcm_128->rec_seq, cctx->rec_seq,
TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
@@ -403,9 +410,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_256->iv,
- ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE,
+ cctx->iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE,
TLS_CIPHER_AES_GCM_256_IV_SIZE);
- memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq,
+ memcpy(crypto_info_aes_gcm_256->rec_seq, cctx->rec_seq,
TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
@@ -429,7 +436,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
switch (optname) {
case TLS_TX:
- rc = do_tls_getsockopt_tx(sk, optval, optlen);
+ case TLS_RX:
+ rc = do_tls_getsockopt_conf(sk, optval, optlen,
+ optname == TLS_TX);
break;
default:
rc = -ENOPROTOOPT;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 92784e51ee7d..eb82bdc6cf7c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -878,7 +878,6 @@ static int unix_autobind(struct socket *sock)
if (err)
return err;
- err = 0;
if (u->addr)
goto out;
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 6a6f2f214c10..96e24ee4c7e8 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -141,9 +141,62 @@ static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef)
return true;
}
+static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width)
+{
+ int mhz;
+
+ switch (chan_width) {
+ case NL80211_CHAN_WIDTH_1:
+ mhz = 1;
+ break;
+ case NL80211_CHAN_WIDTH_2:
+ mhz = 2;
+ break;
+ case NL80211_CHAN_WIDTH_4:
+ mhz = 4;
+ break;
+ case NL80211_CHAN_WIDTH_8:
+ mhz = 8;
+ break;
+ case NL80211_CHAN_WIDTH_16:
+ mhz = 16;
+ break;
+ case NL80211_CHAN_WIDTH_5:
+ mhz = 5;
+ break;
+ case NL80211_CHAN_WIDTH_10:
+ mhz = 10;
+ break;
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ mhz = 20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ mhz = 40;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_80:
+ mhz = 80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ mhz = 160;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ return mhz;
+}
+
+static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
+{
+ return nl80211_chan_width_to_mhz(c->width);
+}
+
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
{
- u32 control_freq;
+ u32 control_freq, oper_freq;
+ int oper_width, control_width;
if (!chandef->chan)
return false;
@@ -155,10 +208,6 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
switch (chandef->width) {
case NL80211_CHAN_WIDTH_1:
- case NL80211_CHAN_WIDTH_2:
- case NL80211_CHAN_WIDTH_4:
- case NL80211_CHAN_WIDTH_8:
- case NL80211_CHAN_WIDTH_16:
case NL80211_CHAN_WIDTH_5:
case NL80211_CHAN_WIDTH_10:
case NL80211_CHAN_WIDTH_20:
@@ -169,6 +218,30 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
if (chandef->center_freq2)
return false;
break;
+ case NL80211_CHAN_WIDTH_2:
+ case NL80211_CHAN_WIDTH_4:
+ case NL80211_CHAN_WIDTH_8:
+ case NL80211_CHAN_WIDTH_16:
+ control_freq = ieee80211_channel_to_khz(chandef->chan);
+ oper_freq = ieee80211_chandef_to_khz(chandef);
+ control_width = nl80211_chan_width_to_mhz(
+ ieee80211_s1g_channel_width(
+ chandef->chan));
+ oper_width = cfg80211_chandef_get_width(chandef);
+
+ if (oper_width < 0 || control_width < 0)
+ return false;
+ if (chandef->center_freq2)
+ return false;
+
+ if (control_freq + MHZ_TO_KHZ(control_width) / 2 >
+ oper_freq + MHZ_TO_KHZ(oper_width) / 2)
+ return false;
+
+ if (control_freq - MHZ_TO_KHZ(control_width) / 2 <
+ oper_freq - MHZ_TO_KHZ(oper_width) / 2)
+ return false;
+ break;
case NL80211_CHAN_WIDTH_40:
if (chandef->center_freq1 != control_freq + 10 &&
chandef->center_freq1 != control_freq - 10)
@@ -264,53 +337,6 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c,
}
}
-static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
-{
- int width;
-
- switch (c->width) {
- case NL80211_CHAN_WIDTH_1:
- width = 1;
- break;
- case NL80211_CHAN_WIDTH_2:
- width = 2;
- break;
- case NL80211_CHAN_WIDTH_4:
- width = 4;
- break;
- case NL80211_CHAN_WIDTH_8:
- width = 8;
- break;
- case NL80211_CHAN_WIDTH_16:
- width = 16;
- break;
- case NL80211_CHAN_WIDTH_5:
- width = 5;
- break;
- case NL80211_CHAN_WIDTH_10:
- width = 10;
- break;
- case NL80211_CHAN_WIDTH_20:
- case NL80211_CHAN_WIDTH_20_NOHT:
- width = 20;
- break;
- case NL80211_CHAN_WIDTH_40:
- width = 40;
- break;
- case NL80211_CHAN_WIDTH_80P80:
- case NL80211_CHAN_WIDTH_80:
- width = 80;
- break;
- case NL80211_CHAN_WIDTH_160:
- width = 160;
- break;
- default:
- WARN_ON_ONCE(1);
- return -1;
- }
- return width;
-}
-
const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
const struct cfg80211_chan_def *c2)
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 67b0389fca4d..2ebc2a66680d 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -466,8 +466,8 @@ extern struct work_struct cfg80211_disconnect_work;
*
* Checks if chandef is usable and we can/need start CAC on such channel.
*
- * Return: Return true if all channels available and at least
- * one channel require CAC (NL80211_DFS_USABLE)
+ * Return: true if all channels available and at least
+ * one channel requires CAC (NL80211_DFS_USABLE)
*/
bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
const struct cfg80211_chan_def *chandef);
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
index cc7b9fd5c166..d66a913027e0 100644
--- a/net/wireless/lib80211.c
+++ b/net/wireless/lib80211.c
@@ -26,8 +26,6 @@
#include <net/lib80211.h>
-#define DRV_NAME "lib80211"
-
#define DRV_DESCRIPTION "common routines for IEEE802.11 drivers"
MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2c9e9a2d1688..1a212db7a300 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -209,14 +209,23 @@ static int validate_beacon_head(const struct nlattr *attr,
unsigned int len = nla_len(attr);
const struct element *elem;
const struct ieee80211_mgmt *mgmt = (void *)data;
- unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
- u.beacon.variable);
+ bool s1g_bcn = ieee80211_is_s1g_beacon(mgmt->frame_control);
+ unsigned int fixedlen, hdrlen;
+
+ if (s1g_bcn) {
+ fixedlen = offsetof(struct ieee80211_ext,
+ u.s1g_beacon.variable);
+ hdrlen = offsetof(struct ieee80211_ext, u.s1g_beacon);
+ } else {
+ fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+ hdrlen = offsetof(struct ieee80211_mgmt, u.beacon);
+ }
if (len < fixedlen)
goto err;
- if (ieee80211_hdrlen(mgmt->frame_control) !=
- offsetof(struct ieee80211_mgmt, u.beacon))
+ if (ieee80211_hdrlen(mgmt->frame_control) != hdrlen)
goto err;
data += fixedlen;
@@ -336,6 +345,13 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
.len = NL80211_MAX_SUPP_HT_RATES },
[NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)),
[NL80211_TXRATE_GI] = { .type = NLA_U8 },
+ [NL80211_TXRATE_HE] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_he)),
+ [NL80211_TXRATE_HE_GI] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_RATE_INFO_HE_GI_0_8,
+ NL80211_RATE_INFO_HE_GI_3_2),
+ [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_RATE_INFO_HE_1XLTF,
+ NL80211_RATE_INFO_HE_4XLTF),
};
static const struct nla_policy
@@ -360,6 +376,22 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = {
NLA_POLICY_NESTED(nl80211_txattr_policy),
};
+static const struct nla_policy
+nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = {
+ [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000),
+ [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000),
+ NLA_POLICY_RANGE(NLA_BINARY,
+ NL80211_FILS_DISCOVERY_TMPL_MIN_LEN,
+ IEEE80211_MAX_DATA_LEN),
+};
+
+static const struct nla_policy
+nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1] = {
+ [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] = NLA_POLICY_MAX(NLA_U32, 20),
+ [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN }
+};
+
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD },
[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
@@ -539,7 +571,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
[NL80211_ATTR_WDEV] = { .type = NLA_U64 },
[NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },
- [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
+
+ /* need to include at least Auth Transaction and Status Code */
+ [NL80211_ATTR_AUTH_DATA] = NLA_POLICY_MIN_LEN(4),
+
[NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN),
[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
[NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127),
@@ -561,23 +596,30 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_DATA_LEN },
[NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 },
- [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 },
+ [NL80211_ATTR_MAX_CRIT_PROT_DURATION] =
+ NLA_POLICY_MAX(NLA_U16, NL80211_CRIT_PROTO_MAX_DURATION),
[NL80211_ATTR_PEER_AID] =
NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
[NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
[NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
- [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY },
- [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY },
- [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY },
- [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY },
+ [NL80211_ATTR_CNTDWN_OFFS_BEACON] = { .type = NLA_BINARY },
+ [NL80211_ATTR_CNTDWN_OFFS_PRESP] = { .type = NLA_BINARY },
+ [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = NLA_POLICY_MIN_LEN(2),
+ /*
+ * The value of the Length field of the Supported Operating
+ * Classes element is between 2 and 253.
+ */
+ [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] =
+ NLA_POLICY_RANGE(NLA_BINARY, 2, 253),
[NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG },
[NL80211_ATTR_OPMODE_NOTIF] = { .type = NLA_U8 },
[NL80211_ATTR_VENDOR_ID] = { .type = NLA_U32 },
[NL80211_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 },
[NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
- [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY,
- .len = IEEE80211_QOS_MAP_LEN_MAX },
+ [NL80211_ATTR_QOS_MAP] = NLA_POLICY_RANGE(NLA_BINARY,
+ IEEE80211_QOS_MAP_LEN_MIN,
+ IEEE80211_QOS_MAP_LEN_MAX),
[NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
[NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 },
[NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 },
@@ -625,15 +667,17 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
.len = FILS_ERP_MAX_RRK_LEN },
[NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2),
[NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
+ [NL80211_ATTR_PMKR0_NAME] = NLA_POLICY_EXACT_LEN(WLAN_PMK_NAME_LEN),
[NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
- [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
- .len = NL80211_HE_MAX_CAPABILITY_LEN },
-
+ [NL80211_ATTR_HE_CAPABILITY] =
+ NLA_POLICY_RANGE(NLA_BINARY,
+ NL80211_HE_MIN_CAPABILITY_LEN,
+ NL80211_HE_MAX_CAPABILITY_LEN),
[NL80211_ATTR_FTM_RESPONDER] =
NLA_POLICY_NESTED(nl80211_ftm_responder_policy),
[NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1),
@@ -654,10 +698,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG },
[NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999),
[NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED },
- [NL80211_ATTR_HE_6GHZ_CAPABILITY] = {
- .type = NLA_EXACT_LEN,
- .len = sizeof(struct ieee80211_he_6ghz_capa),
- },
+ [NL80211_ATTR_HE_6GHZ_CAPABILITY] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)),
+ [NL80211_ATTR_FILS_DISCOVERY] =
+ NLA_POLICY_NESTED(nl80211_fils_discovery_policy),
+ [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] =
+ NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy),
};
/* policy for the key attributes */
@@ -703,7 +749,7 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
[NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
[NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 },
[NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 },
- [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 },
+ [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = NLA_POLICY_MIN_LEN(1),
[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ] = {
.len = sizeof(struct nl80211_wowlan_tcp_data_seq)
},
@@ -711,8 +757,8 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
.len = sizeof(struct nl80211_wowlan_tcp_data_token)
},
[NL80211_WOWLAN_TCP_DATA_INTERVAL] = { .type = NLA_U32 },
- [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 },
- [NL80211_WOWLAN_TCP_WAKE_MASK] = { .type = NLA_MIN_LEN, .len = 1 },
+ [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = NLA_POLICY_MIN_LEN(1),
+ [NL80211_WOWLAN_TCP_WAKE_MASK] = NLA_POLICY_MIN_LEN(1),
};
#endif /* CONFIG_PM */
@@ -738,7 +784,7 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
.type = NLA_BINARY,
.len = NL80211_KCK_EXT_LEN
},
- [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN),
+ [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN(NL80211_REPLAY_CTR_LEN),
[NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 },
};
@@ -778,7 +824,8 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
/* policy for NAN function attributes */
static const struct nla_policy
nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
- [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 },
+ [NL80211_NAN_FUNC_TYPE] =
+ NLA_POLICY_MAX(NLA_U8, NL80211_NAN_FUNC_MAX_TYPE),
[NL80211_NAN_FUNC_SERVICE_ID] = {
.len = NL80211_NAN_FUNC_SERVICE_ID_LEN },
[NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 },
@@ -992,6 +1039,21 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if ((chan->flags & IEEE80211_CHAN_NO_HE) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE))
goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_1MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_2MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_4MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_8MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_16MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ))
+ goto nla_put_failure;
}
if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -4419,21 +4481,106 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
return true;
}
+static u16 he_mcs_map_to_mcs_mask(u8 he_mcs_map)
+{
+ switch (he_mcs_map) {
+ case IEEE80211_HE_MCS_NOT_SUPPORTED:
+ return 0;
+ case IEEE80211_HE_MCS_SUPPORT_0_7:
+ return 0x00FF;
+ case IEEE80211_HE_MCS_SUPPORT_0_9:
+ return 0x03FF;
+ case IEEE80211_HE_MCS_SUPPORT_0_11:
+ return 0xFFF;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void he_build_mcs_mask(u16 he_mcs_map,
+ u16 he_mcs_mask[NL80211_HE_NSS_MAX])
+{
+ u8 nss;
+
+ for (nss = 0; nss < NL80211_HE_NSS_MAX; nss++) {
+ he_mcs_mask[nss] = he_mcs_map_to_mcs_mask(he_mcs_map & 0x03);
+ he_mcs_map >>= 2;
+ }
+}
+
+static u16 he_get_txmcsmap(struct genl_info *info,
+ const struct ieee80211_sta_he_cap *he_cap)
+{
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ __le16 tx_mcs;
+
+ switch (wdev->chandef.width) {
+ case NL80211_CHAN_WIDTH_80P80:
+ tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_160;
+ break;
+ default:
+ tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80;
+ break;
+ }
+ return le16_to_cpu(tx_mcs);
+}
+
+static bool he_set_mcs_mask(struct genl_info *info,
+ struct wireless_dev *wdev,
+ struct ieee80211_supported_band *sband,
+ struct nl80211_txrate_he *txrate,
+ u16 mcs[NL80211_HE_NSS_MAX])
+{
+ const struct ieee80211_sta_he_cap *he_cap;
+ u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {};
+ u16 tx_mcs_map = 0;
+ u8 i;
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype);
+ if (!he_cap)
+ return false;
+
+ memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX);
+
+ tx_mcs_map = he_get_txmcsmap(info, he_cap);
+
+ /* Build he_mcs_mask from HE capabilities */
+ he_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
+
+ for (i = 0; i < NL80211_HE_NSS_MAX; i++) {
+ if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
+ mcs[i] = txrate->mcs[i];
+ else
+ return false;
+ }
+
+ return true;
+}
+
static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
struct nlattr *attrs[],
enum nl80211_attrs attr,
- struct cfg80211_bitrate_mask *mask)
+ struct cfg80211_bitrate_mask *mask,
+ struct net_device *dev)
{
struct nlattr *tb[NL80211_TXRATE_MAX + 1];
struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
int rem, i;
struct nlattr *tx_rates;
struct ieee80211_supported_band *sband;
- u16 vht_tx_mcs_map;
+ u16 vht_tx_mcs_map, he_tx_mcs_map;
memset(mask, 0, sizeof(*mask));
/* Default to all rates enabled */
for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ const struct ieee80211_sta_he_cap *he_cap;
+
sband = rdev->wiphy.bands[i];
if (!sband)
@@ -4449,6 +4596,16 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs);
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype);
+ if (!he_cap)
+ continue;
+
+ he_tx_mcs_map = he_get_txmcsmap(info, he_cap);
+ he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs);
+
+ mask->control[i].he_gi = 0xFF;
+ mask->control[i].he_ltf = 0xFF;
}
/* if no rates are given set it back to the defaults */
@@ -4504,13 +4661,25 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI)
return -EINVAL;
}
+ if (tb[NL80211_TXRATE_HE] &&
+ !he_set_mcs_mask(info, wdev, sband,
+ nla_data(tb[NL80211_TXRATE_HE]),
+ mask->control[band].he_mcs))
+ return -EINVAL;
+ if (tb[NL80211_TXRATE_HE_GI])
+ mask->control[band].he_gi =
+ nla_get_u8(tb[NL80211_TXRATE_HE_GI]);
+ if (tb[NL80211_TXRATE_HE_LTF])
+ mask->control[band].he_ltf =
+ nla_get_u8(tb[NL80211_TXRATE_HE_LTF]);
if (mask->control[band].legacy == 0) {
- /* don't allow empty legacy rates if HT or VHT
+ /* don't allow empty legacy rates if HT, VHT or HE
* are not even supported.
*/
if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
- rdev->wiphy.bands[band]->vht_cap.vht_supported))
+ rdev->wiphy.bands[band]->vht_cap.vht_supported ||
+ ieee80211_get_he_iftype_cap(sband, wdev->iftype)))
return -EINVAL;
for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
@@ -4521,6 +4690,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
if (mask->control[band].vht_mcs[i])
goto out;
+ for (i = 0; i < NL80211_HE_NSS_MAX; i++)
+ if (mask->control[band].he_mcs[i])
+ goto out;
+
/* legacy and mcs rates may not be both empty */
return -EINVAL;
}
@@ -4721,6 +4894,65 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs,
return 0;
}
+static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
+ struct nlattr *attrs,
+ struct cfg80211_ap_settings *params)
+{
+ struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1];
+ int ret;
+ struct cfg80211_fils_discovery *fd = &params->fils_discovery;
+
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_FILS_DISCOVERY))
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, NL80211_FILS_DISCOVERY_ATTR_MAX, attrs,
+ NULL, NULL);
+ if (ret)
+ return ret;
+
+ if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] ||
+ !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] ||
+ !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL])
+ return -EINVAL;
+
+ fd->tmpl_len = nla_len(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]);
+ fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]);
+ fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]);
+ fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]);
+
+ return 0;
+}
+
+static int
+nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev,
+ struct nlattr *attrs,
+ struct cfg80211_ap_settings *params)
+{
+ struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1];
+ int ret;
+ struct cfg80211_unsol_bcast_probe_resp *presp =
+ &params->unsol_bcast_probe_resp;
+
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP))
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX,
+ attrs, NULL, NULL);
+ if (ret)
+ return ret;
+
+ if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] ||
+ !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL])
+ return -EINVAL;
+
+ presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]);
+ presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]);
+ presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]);
+ return 0;
+}
+
static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
const u8 *rates)
{
@@ -4831,8 +5063,9 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
return false;
return true;
case NL80211_CMD_START_AP:
- /* SAE not supported yet */
- if (auth_type == NL80211_AUTHTYPE_SAE)
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD_AP) &&
+ auth_type == NL80211_AUTHTYPE_SAE)
return false;
/* FILS not supported yet */
if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
@@ -4896,8 +5129,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
params.ssid_len =
nla_len(info->attrs[NL80211_ATTR_SSID]);
- if (params.ssid_len == 0 ||
- params.ssid_len > IEEE80211_MAX_SSID_LEN)
+ if (params.ssid_len == 0)
return -EINVAL;
}
@@ -4966,7 +5198,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_TX_RATES]) {
err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
NL80211_ATTR_TX_RATES,
- &params.beacon_rate);
+ &params.beacon_rate,
+ dev);
if (err)
return err;
@@ -5028,6 +5261,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
goto out;
}
+ if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) {
+ err = nl80211_parse_fils_discovery(rdev,
+ info->attrs[NL80211_ATTR_FILS_DISCOVERY],
+ &params);
+ if (err)
+ goto out;
+ }
+
+ if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
+ err = nl80211_parse_unsol_bcast_probe_resp(
+ rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
+ &params);
+ if (err)
+ return err;
+ }
+
nl80211_calculate_ap_params(&params);
if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
@@ -5837,11 +6086,9 @@ static int nl80211_parse_sta_channel_info(struct genl_info *info,
nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]);
/*
* Need to include at least one (first channel, number of
- * channels) tuple for each subband, and must have proper
- * tuples for the rest of the data as well.
+ * channels) tuple for each subband (checked in policy),
+ * and must have proper tuples for the rest of the data as well.
*/
- if (params->supported_channels_len < 2)
- return -EINVAL;
if (params->supported_channels_len % 2)
return -EINVAL;
}
@@ -5851,13 +6098,6 @@ static int nl80211_parse_sta_channel_info(struct genl_info *info,
nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
params->supported_oper_classes_len =
nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
- /*
- * The value of the Length field of the Supported Operating
- * Classes element is between 2 and 253.
- */
- if (params->supported_oper_classes_len < 2 ||
- params->supported_oper_classes_len > 253)
- return -EINVAL;
}
return 0;
}
@@ -5880,9 +6120,6 @@ static int nl80211_set_station_tdls(struct genl_info *info,
nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
params->he_capa_len =
nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
-
- if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
- return -EINVAL;
}
err = nl80211_parse_sta_channel_info(info, params);
@@ -6141,10 +6378,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
params.he_capa_len =
nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
-
- /* max len is validated in nla policy */
- if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
- return -EINVAL;
}
if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
@@ -8416,23 +8649,14 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
}
if (ssid) {
- if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) {
- err = -EINVAL;
- goto out_free;
- }
memcpy(request->match_sets[i].ssid.ssid,
nla_data(ssid), nla_len(ssid));
request->match_sets[i].ssid.ssid_len =
nla_len(ssid);
}
- if (bssid) {
- if (nla_len(bssid) != ETH_ALEN) {
- err = -EINVAL;
- goto out_free;
- }
+ if (bssid)
memcpy(request->match_sets[i].bssid,
nla_data(bssid), ETH_ALEN);
- }
/* special attribute - old implementation w/a */
request->match_sets[i].rssi_thold = default_match_rssi;
@@ -8787,10 +9011,10 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON])
+ if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON])
return -EINVAL;
- len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+ len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]);
if (!len || (len % sizeof(u16)))
return -EINVAL;
@@ -8801,7 +9025,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
params.counter_offsets_beacon =
- nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+ nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]);
/* sanity checks - counters should fit and be the same */
for (i = 0; i < params.n_counter_offsets_beacon; i++) {
@@ -8814,8 +9038,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) {
- len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+ if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) {
+ len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]);
if (!len || (len % sizeof(u16)))
return -EINVAL;
@@ -8826,7 +9050,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
params.counter_offsets_presp =
- nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+ nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]);
/* sanity checks - counters should fit and be the same */
for (i = 0; i < params.n_counter_offsets_presp; i++) {
@@ -9309,9 +9533,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
- /* need to include at least Auth Transaction and Status Code */
- if (auth_data_len < 4)
- return -EINVAL;
}
local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
@@ -9451,7 +9672,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
if (!wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_SAE_OFFLOAD))
+ NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_SAE_OFFLOAD_AP))
return -EINVAL;
settings->sae_pwd =
nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
@@ -10798,7 +11021,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
return -EOPNOTSUPP;
err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
- NL80211_ATTR_TX_RATES, &mask);
+ NL80211_ATTR_TX_RATES, &mask,
+ dev);
if (err)
return err;
@@ -11406,7 +11630,8 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_TX_RATES]) {
err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
NL80211_ATTR_TX_RATES,
- &setup.beacon_rate);
+ &setup.beacon_rate,
+ dev);
if (err)
return err;
@@ -12358,8 +12583,6 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] ||
!tb[NL80211_REKEY_DATA_KCK])
return -EINVAL;
- if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN)
- return -ERANGE;
if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN &&
!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK &&
nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN))
@@ -12684,8 +12907,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
func->cookie = cfg80211_assign_cookie(rdev);
- if (!tb[NL80211_NAN_FUNC_TYPE] ||
- nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
+ if (!tb[NL80211_NAN_FUNC_TYPE]) {
err = -EINVAL;
goto out;
}
@@ -13175,9 +13397,6 @@ static int nl80211_crit_protocol_start(struct sk_buff *skb,
duration =
nla_get_u16(info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION]);
- if (duration > NL80211_CRIT_PROTO_MAX_DURATION)
- return -ERANGE;
-
ret = rdev_crit_proto_start(rdev, wdev, proto, duration);
if (!ret)
rdev->crit_proto_nlportid = info->snd_portid;
@@ -13562,8 +13781,7 @@ static int nl80211_set_qos_map(struct sk_buff *skb,
pos = nla_data(info->attrs[NL80211_ATTR_QOS_MAP]);
len = nla_len(info->attrs[NL80211_ATTR_QOS_MAP]);
- if (len % 2 || len < IEEE80211_QOS_MAP_LEN_MIN ||
- len > IEEE80211_QOS_MAP_LEN_MAX)
+ if (len % 2)
return -EINVAL;
qos_map = kzalloc(sizeof(struct cfg80211_qos_map), GFP_KERNEL);
@@ -13831,17 +14049,9 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info)
goto out;
}
- if (info->attrs[NL80211_ATTR_PMKR0_NAME]) {
- int r0_name_len = nla_len(info->attrs[NL80211_ATTR_PMKR0_NAME]);
-
- if (r0_name_len != WLAN_PMK_NAME_LEN) {
- ret = -EINVAL;
- goto out;
- }
-
+ if (info->attrs[NL80211_ATTR_PMKR0_NAME])
pmk_conf.pmk_r0_name =
nla_data(info->attrs[NL80211_ATTR_PMKR0_NAME]);
- }
ret = rdev_set_pmk(rdev, dev, &pmk_conf);
out:
@@ -13900,8 +14110,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_SSID]) {
params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
- if (params.ssid.ssid_len == 0 ||
- params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN)
+ if (params.ssid.ssid_len == 0)
return -EINVAL;
memcpy(params.ssid.ssid,
nla_data(info->attrs[NL80211_ATTR_SSID]),
@@ -14202,7 +14411,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev,
if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) {
attr = NL80211_TID_CONFIG_ATTR_TX_RATE;
err = nl80211_parse_tx_bitrate_mask(info, attrs, attr,
- &tid_conf->txrate_mask);
+ &tid_conf->txrate_mask, dev);
if (err)
return err;
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index d8a90d397423..6043a9d33d61 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1594,7 +1594,7 @@ freq_reg_info_regd(u32 center_freq,
/*
* We only need to know if one frequency rule was
- * was in center_freq's band, that's enough, so lets
+ * in center_freq's band, that's enough, so let's
* not overwrite it once found
*/
if (!band_rule_found)
@@ -1617,9 +1617,11 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
{
const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy);
const struct ieee80211_reg_rule *reg_rule = NULL;
+ const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20};
+ int i = ARRAY_SIZE(bws) - 1;
u32 bw;
- for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) {
+ for (bw = MHZ_TO_KHZ(bws[i]); bw >= min_bw; bw = MHZ_TO_KHZ(bws[i--])) {
reg_rule = freq_reg_info_regd(center_freq, regd, bw);
if (!IS_ERR(reg_rule))
return reg_rule;
@@ -1631,7 +1633,9 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
u32 center_freq)
{
- return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(20));
+ u32 min_bw = center_freq < MHZ_TO_KHZ(1000) ? 1 : 20;
+
+ return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(min_bw));
}
EXPORT_SYMBOL(freq_reg_info);
@@ -1659,6 +1663,7 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
{
const struct ieee80211_freq_range *freq_range = NULL;
u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0;
+ bool is_s1g = chan->band == NL80211_BAND_S1GHZ;
freq_range = &reg_rule->freq_range;
@@ -1678,70 +1683,72 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
MHZ_TO_KHZ(20)))
bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(10))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(20))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags |= IEEE80211_CHAN_NO_HT40;
- if (max_bandwidth_khz < MHZ_TO_KHZ(80))
- bw_flags |= IEEE80211_CHAN_NO_80MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(160))
- bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ if (is_s1g) {
+ /* S1G is strict about non overlapping channels. We can
+ * calculate which bandwidth is allowed per channel by finding
+ * the largest bandwidth which cleanly divides the freq_range.
+ */
+ int edge_offset;
+ int ch_bw = max_bandwidth_khz;
+
+ while (ch_bw) {
+ edge_offset = (center_freq_khz - ch_bw / 2) -
+ freq_range->start_freq_khz;
+ if (edge_offset % ch_bw == 0) {
+ switch (KHZ_TO_MHZ(ch_bw)) {
+ case 1:
+ bw_flags |= IEEE80211_CHAN_1MHZ;
+ break;
+ case 2:
+ bw_flags |= IEEE80211_CHAN_2MHZ;
+ break;
+ case 4:
+ bw_flags |= IEEE80211_CHAN_4MHZ;
+ break;
+ case 8:
+ bw_flags |= IEEE80211_CHAN_8MHZ;
+ break;
+ case 16:
+ bw_flags |= IEEE80211_CHAN_16MHZ;
+ break;
+ default:
+ /* If we got here, no bandwidths fit on
+ * this frequency, ie. band edge.
+ */
+ bw_flags |= IEEE80211_CHAN_DISABLED;
+ break;
+ }
+ break;
+ }
+ ch_bw /= 2;
+ }
+ } else {
+ if (max_bandwidth_khz < MHZ_TO_KHZ(10))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(20))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(40))
+ bw_flags |= IEEE80211_CHAN_NO_HT40;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(80))
+ bw_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(160))
+ bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ }
return bw_flags;
}
-/*
- * Note that right now we assume the desired channel bandwidth
- * is always 20 MHz for each individual channel (HT40 uses 20 MHz
- * per channel, the primary and the extension channel).
- */
-static void handle_channel(struct wiphy *wiphy,
- enum nl80211_reg_initiator initiator,
- struct ieee80211_channel *chan)
+static void handle_channel_single_rule(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan,
+ u32 flags,
+ struct regulatory_request *lr,
+ struct wiphy *request_wiphy,
+ const struct ieee80211_reg_rule *reg_rule)
{
- u32 flags, bw_flags = 0;
- const struct ieee80211_reg_rule *reg_rule = NULL;
+ u32 bw_flags = 0;
const struct ieee80211_power_rule *power_rule = NULL;
- struct wiphy *request_wiphy = NULL;
- struct regulatory_request *lr = get_last_request();
const struct ieee80211_regdomain *regd;
- request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
-
- flags = chan->orig_flags;
-
- reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan));
- if (IS_ERR(reg_rule)) {
- /*
- * We will disable all channels that do not match our
- * received regulatory rule unless the hint is coming
- * from a Country IE and the Country IE had no information
- * about a band. The IEEE 802.11 spec allows for an AP
- * to send only a subset of the regulatory rules allowed,
- * so an AP in the US that only supports 2.4 GHz may only send
- * a country IE with information for the 2.4 GHz band
- * while 5 GHz is still supported.
- */
- if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
- PTR_ERR(reg_rule) == -ERANGE)
- return;
-
- if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
- request_wiphy && request_wiphy == wiphy &&
- request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
- pr_debug("Disabling freq %d.%03d MHz for good\n",
- chan->center_freq, chan->freq_offset);
- chan->orig_flags |= IEEE80211_CHAN_DISABLED;
- chan->flags = chan->orig_flags;
- } else {
- pr_debug("Disabling freq %d.%03d MHz\n",
- chan->center_freq, chan->freq_offset);
- chan->flags |= IEEE80211_CHAN_DISABLED;
- }
- return;
- }
-
regd = reg_get_regdomain(wiphy);
power_rule = &reg_rule->power_rule;
@@ -1803,6 +1810,204 @@ static void handle_channel(struct wiphy *wiphy,
chan->max_power = chan->max_reg_power;
}
+static void handle_channel_adjacent_rules(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan,
+ u32 flags,
+ struct regulatory_request *lr,
+ struct wiphy *request_wiphy,
+ const struct ieee80211_reg_rule *rrule1,
+ const struct ieee80211_reg_rule *rrule2,
+ struct ieee80211_freq_range *comb_range)
+{
+ u32 bw_flags1 = 0;
+ u32 bw_flags2 = 0;
+ const struct ieee80211_power_rule *power_rule1 = NULL;
+ const struct ieee80211_power_rule *power_rule2 = NULL;
+ const struct ieee80211_regdomain *regd;
+
+ regd = reg_get_regdomain(wiphy);
+
+ power_rule1 = &rrule1->power_rule;
+ power_rule2 = &rrule2->power_rule;
+ bw_flags1 = reg_rule_to_chan_bw_flags(regd, rrule1, chan);
+ bw_flags2 = reg_rule_to_chan_bw_flags(regd, rrule2, chan);
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ /* This guarantees the driver's requested regulatory domain
+ * will always be used as a base for further regulatory
+ * settings
+ */
+ chan->flags =
+ map_regdom_flags(rrule1->flags) |
+ map_regdom_flags(rrule2->flags) |
+ bw_flags1 |
+ bw_flags2;
+ chan->orig_flags = chan->flags;
+ chan->max_antenna_gain =
+ min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain),
+ MBI_TO_DBI(power_rule2->max_antenna_gain));
+ chan->orig_mag = chan->max_antenna_gain;
+ chan->max_reg_power =
+ min_t(int, MBM_TO_DBM(power_rule1->max_eirp),
+ MBM_TO_DBM(power_rule2->max_eirp));
+ chan->max_power = chan->max_reg_power;
+ chan->orig_mpwr = chan->max_reg_power;
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms)
+ chan->dfs_cac_ms = max_t(unsigned int,
+ rrule1->dfs_cac_ms,
+ rrule2->dfs_cac_ms);
+ }
+
+ return;
+ }
+
+ chan->dfs_state = NL80211_DFS_USABLE;
+ chan->dfs_state_entered = jiffies;
+
+ chan->beacon_found = false;
+ chan->flags = flags | bw_flags1 | bw_flags2 |
+ map_regdom_flags(rrule1->flags) |
+ map_regdom_flags(rrule2->flags);
+
+ /* reg_rule_to_chan_bw_flags may forbids 10 and forbids 20 MHz
+ * (otherwise no adj. rule case), recheck therefore
+ */
+ if (cfg80211_does_bw_fit_range(comb_range,
+ ieee80211_channel_to_khz(chan),
+ MHZ_TO_KHZ(10)))
+ chan->flags &= ~IEEE80211_CHAN_NO_10MHZ;
+ if (cfg80211_does_bw_fit_range(comb_range,
+ ieee80211_channel_to_khz(chan),
+ MHZ_TO_KHZ(20)))
+ chan->flags &= ~IEEE80211_CHAN_NO_20MHZ;
+
+ chan->max_antenna_gain =
+ min_t(int, chan->orig_mag,
+ min_t(int,
+ MBI_TO_DBI(power_rule1->max_antenna_gain),
+ MBI_TO_DBI(power_rule2->max_antenna_gain)));
+ chan->max_reg_power = min_t(int,
+ MBM_TO_DBM(power_rule1->max_eirp),
+ MBM_TO_DBM(power_rule2->max_eirp));
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms)
+ chan->dfs_cac_ms = max_t(unsigned int,
+ rrule1->dfs_cac_ms,
+ rrule2->dfs_cac_ms);
+ else
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ }
+
+ if (chan->orig_mpwr) {
+ /* Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER
+ * will always follow the passed country IE power settings.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER)
+ chan->max_power = chan->max_reg_power;
+ else
+ chan->max_power = min(chan->orig_mpwr,
+ chan->max_reg_power);
+ } else {
+ chan->max_power = chan->max_reg_power;
+ }
+}
+
+/* Note that right now we assume the desired channel bandwidth
+ * is always 20 MHz for each individual channel (HT40 uses 20 MHz
+ * per channel, the primary and the extension channel).
+ */
+static void handle_channel(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan)
+{
+ const u32 orig_chan_freq = ieee80211_channel_to_khz(chan);
+ struct regulatory_request *lr = get_last_request();
+ struct wiphy *request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
+ const struct ieee80211_reg_rule *rrule = NULL;
+ const struct ieee80211_reg_rule *rrule1 = NULL;
+ const struct ieee80211_reg_rule *rrule2 = NULL;
+
+ u32 flags = chan->orig_flags;
+
+ rrule = freq_reg_info(wiphy, orig_chan_freq);
+ if (IS_ERR(rrule)) {
+ /* check for adjacent match, therefore get rules for
+ * chan - 20 MHz and chan + 20 MHz and test
+ * if reg rules are adjacent
+ */
+ rrule1 = freq_reg_info(wiphy,
+ orig_chan_freq - MHZ_TO_KHZ(20));
+ rrule2 = freq_reg_info(wiphy,
+ orig_chan_freq + MHZ_TO_KHZ(20));
+ if (!IS_ERR(rrule1) && !IS_ERR(rrule2)) {
+ struct ieee80211_freq_range comb_range;
+
+ if (rrule1->freq_range.end_freq_khz !=
+ rrule2->freq_range.start_freq_khz)
+ goto disable_chan;
+
+ comb_range.start_freq_khz =
+ rrule1->freq_range.start_freq_khz;
+ comb_range.end_freq_khz =
+ rrule2->freq_range.end_freq_khz;
+ comb_range.max_bandwidth_khz =
+ min_t(u32,
+ rrule1->freq_range.max_bandwidth_khz,
+ rrule2->freq_range.max_bandwidth_khz);
+
+ if (!cfg80211_does_bw_fit_range(&comb_range,
+ orig_chan_freq,
+ MHZ_TO_KHZ(20)))
+ goto disable_chan;
+
+ handle_channel_adjacent_rules(wiphy, initiator, chan,
+ flags, lr, request_wiphy,
+ rrule1, rrule2,
+ &comb_range);
+ return;
+ }
+
+disable_chan:
+ /* We will disable all channels that do not match our
+ * received regulatory rule unless the hint is coming
+ * from a Country IE and the Country IE had no information
+ * about a band. The IEEE 802.11 spec allows for an AP
+ * to send only a subset of the regulatory rules allowed,
+ * so an AP in the US that only supports 2.4 GHz may only send
+ * a country IE with information for the 2.4 GHz band
+ * while 5 GHz is still supported.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ PTR_ERR(rrule) == -ERANGE)
+ return;
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ pr_debug("Disabling freq %d.%03d MHz for good\n",
+ chan->center_freq, chan->freq_offset);
+ chan->orig_flags |= IEEE80211_CHAN_DISABLED;
+ chan->flags = chan->orig_flags;
+ } else {
+ pr_debug("Disabling freq %d.%03d MHz\n",
+ chan->center_freq, chan->freq_offset);
+ chan->flags |= IEEE80211_CHAN_DISABLED;
+ }
+ return;
+ }
+
+ handle_channel_single_rule(wiphy, initiator, chan, flags, lr,
+ request_wiphy, rrule);
+}
+
static void handle_band(struct wiphy *wiphy,
enum nl80211_reg_initiator initiator,
struct ieee80211_supported_band *sband)
@@ -3170,7 +3375,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy)
* - send a user regulatory hint if applicable
*
* Device drivers that send a regulatory hint for a specific country
- * keep their own regulatory domain on wiphy->regd so that does does
+ * keep their own regulatory domain on wiphy->regd so that does
* not need to be remembered.
*/
static void restore_regulatory_settings(bool reset_user, bool cached)
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 04f2d198c215..84fc8ab16dd2 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -55,7 +55,7 @@
*
* Also note that the hidden_beacon_bss pointer is only relevant
* if the driver uses something other than the IEs, e.g. private
- * data stored stored in the BSS struct, since the beacon IEs are
+ * data stored in the BSS struct, since the beacon IEs are
* also linked into the probe response struct.
*/
@@ -1488,7 +1488,7 @@ static const struct element
ielen - (mbssid_end - ie));
/*
- * If is is not the last subelement in current MBSSID IE or there isn't
+ * If it is not the last subelement in current MBSSID IE or there isn't
* a next MBSSID IE - profile is complete.
*/
if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) ||
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 079ce320dc1e..38df713f2e2e 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -24,7 +24,7 @@
/*
* Software SME in cfg80211, using auth/assoc/deauth calls to the
- * driver. This is is for implementing nl80211's connect/disconnect
+ * driver. This is for implementing nl80211's connect/disconnect
* and wireless extensions (if configured.)
*/
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 6fa99df52f86..f01746894a4e 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -111,6 +111,33 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band)
}
EXPORT_SYMBOL(ieee80211_channel_to_freq_khz);
+enum nl80211_chan_width
+ieee80211_s1g_channel_width(const struct ieee80211_channel *chan)
+{
+ if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ))
+ return NL80211_CHAN_WIDTH_20_NOHT;
+
+ /*S1G defines a single allowed channel width per channel.
+ * Extract that width here.
+ */
+ if (chan->flags & IEEE80211_CHAN_1MHZ)
+ return NL80211_CHAN_WIDTH_1;
+ else if (chan->flags & IEEE80211_CHAN_2MHZ)
+ return NL80211_CHAN_WIDTH_2;
+ else if (chan->flags & IEEE80211_CHAN_4MHZ)
+ return NL80211_CHAN_WIDTH_4;
+ else if (chan->flags & IEEE80211_CHAN_8MHZ)
+ return NL80211_CHAN_WIDTH_8;
+ else if (chan->flags & IEEE80211_CHAN_16MHZ)
+ return NL80211_CHAN_WIDTH_16;
+
+ pr_err("unknown channel width for channel at %dKHz?\n",
+ ieee80211_channel_to_khz(chan));
+
+ return NL80211_CHAN_WIDTH_1;
+}
+EXPORT_SYMBOL(ieee80211_s1g_channel_width);
+
int ieee80211_freq_khz_to_channel(u32 freq)
{
/* TODO: just handle MHz for now */
@@ -399,6 +426,11 @@ unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
{
unsigned int hdrlen = 24;
+ if (ieee80211_is_ext(fc)) {
+ hdrlen = 4;
+ goto out;
+ }
+
if (ieee80211_is_data(fc)) {
if (ieee80211_has_a4(fc))
hdrlen = 30;
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 4d2160c989a3..78f2927ead7f 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -497,7 +497,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
/*
* We only need to store WEP keys, since they're the only keys that
- * can be be set before a connection is established and persist after
+ * can be set before a connection is established and persist after
* disconnecting.
*/
if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index b010bfde0149..56d052bc65cb 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -23,162 +23,6 @@
static DEFINE_IDA(umem_ida);
-void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
-{
- unsigned long flags;
-
- if (!xs->tx)
- return;
-
- spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
- list_add_rcu(&xs->list, &umem->xsk_tx_list);
- spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
-}
-
-void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
-{
- unsigned long flags;
-
- if (!xs->tx)
- return;
-
- spin_lock_irqsave(&umem->xsk_tx_list_lock, flags);
- list_del_rcu(&xs->list);
- spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags);
-}
-
-/* The umem is stored both in the _rx struct and the _tx struct as we do
- * not know if the device has more tx queues than rx, or the opposite.
- * This might also change during run time.
- */
-static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
- u16 queue_id)
-{
- if (queue_id >= max_t(unsigned int,
- dev->real_num_rx_queues,
- dev->real_num_tx_queues))
- return -EINVAL;
-
- if (queue_id < dev->real_num_rx_queues)
- dev->_rx[queue_id].umem = umem;
- if (queue_id < dev->real_num_tx_queues)
- dev->_tx[queue_id].umem = umem;
-
- return 0;
-}
-
-struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
- u16 queue_id)
-{
- if (queue_id < dev->real_num_rx_queues)
- return dev->_rx[queue_id].umem;
- if (queue_id < dev->real_num_tx_queues)
- return dev->_tx[queue_id].umem;
-
- return NULL;
-}
-EXPORT_SYMBOL(xdp_get_umem_from_qid);
-
-static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
-{
- if (queue_id < dev->real_num_rx_queues)
- dev->_rx[queue_id].umem = NULL;
- if (queue_id < dev->real_num_tx_queues)
- dev->_tx[queue_id].umem = NULL;
-}
-
-int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
- u16 queue_id, u16 flags)
-{
- bool force_zc, force_copy;
- struct netdev_bpf bpf;
- int err = 0;
-
- ASSERT_RTNL();
-
- force_zc = flags & XDP_ZEROCOPY;
- force_copy = flags & XDP_COPY;
-
- if (force_zc && force_copy)
- return -EINVAL;
-
- if (xdp_get_umem_from_qid(dev, queue_id))
- return -EBUSY;
-
- err = xdp_reg_umem_at_qid(dev, umem, queue_id);
- if (err)
- return err;
-
- umem->dev = dev;
- umem->queue_id = queue_id;
-
- if (flags & XDP_USE_NEED_WAKEUP) {
- umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
- /* Tx needs to be explicitly woken up the first time.
- * Also for supporting drivers that do not implement this
- * feature. They will always have to call sendto().
- */
- xsk_set_tx_need_wakeup(umem);
- }
-
- dev_hold(dev);
-
- if (force_copy)
- /* For copy-mode, we are done. */
- return 0;
-
- if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
- err = -EOPNOTSUPP;
- goto err_unreg_umem;
- }
-
- bpf.command = XDP_SETUP_XSK_UMEM;
- bpf.xsk.umem = umem;
- bpf.xsk.queue_id = queue_id;
-
- err = dev->netdev_ops->ndo_bpf(dev, &bpf);
- if (err)
- goto err_unreg_umem;
-
- umem->zc = true;
- return 0;
-
-err_unreg_umem:
- if (!force_zc)
- err = 0; /* fallback to copy mode */
- if (err)
- xdp_clear_umem_at_qid(dev, queue_id);
- return err;
-}
-
-void xdp_umem_clear_dev(struct xdp_umem *umem)
-{
- struct netdev_bpf bpf;
- int err;
-
- ASSERT_RTNL();
-
- if (!umem->dev)
- return;
-
- if (umem->zc) {
- bpf.command = XDP_SETUP_XSK_UMEM;
- bpf.xsk.umem = NULL;
- bpf.xsk.queue_id = umem->queue_id;
-
- err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
-
- if (err)
- WARN(1, "failed to disable umem!\n");
- }
-
- xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
-
- dev_put(umem->dev);
- umem->dev = NULL;
- umem->zc = false;
-}
-
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
@@ -195,38 +39,33 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
}
}
-static void xdp_umem_release(struct xdp_umem *umem)
+static void xdp_umem_addr_unmap(struct xdp_umem *umem)
{
- rtnl_lock();
- xdp_umem_clear_dev(umem);
- rtnl_unlock();
-
- ida_simple_remove(&umem_ida, umem->id);
+ vunmap(umem->addrs);
+ umem->addrs = NULL;
+}
- if (umem->fq) {
- xskq_destroy(umem->fq);
- umem->fq = NULL;
- }
+static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages,
+ u32 nr_pages)
+{
+ umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!umem->addrs)
+ return -ENOMEM;
+ return 0;
+}
- if (umem->cq) {
- xskq_destroy(umem->cq);
- umem->cq = NULL;
- }
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+ umem->zc = false;
+ ida_simple_remove(&umem_ida, umem->id);
- xp_destroy(umem->pool);
+ xdp_umem_addr_unmap(umem);
xdp_umem_unpin_pages(umem);
xdp_umem_unaccount_pages(umem);
kfree(umem);
}
-static void xdp_umem_release_deferred(struct work_struct *work)
-{
- struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
-
- xdp_umem_release(umem);
-}
-
void xdp_get_umem(struct xdp_umem *umem)
{
refcount_inc(&umem->users);
@@ -237,10 +76,8 @@ void xdp_put_umem(struct xdp_umem *umem)
if (!umem)
return;
- if (refcount_dec_and_test(&umem->users)) {
- INIT_WORK(&umem->work, xdp_umem_release_deferred);
- schedule_work(&umem->work);
- }
+ if (refcount_dec_and_test(&umem->users))
+ xdp_umem_release(umem);
}
static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
@@ -319,8 +156,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
- if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
- XDP_UMEM_USES_NEED_WAKEUP))
+ if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return -EINVAL;
if (!unaligned_chunks && !is_power_of_2(chunk_size))
@@ -355,13 +191,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->size = size;
umem->headroom = headroom;
umem->chunk_size = chunk_size;
+ umem->chunks = chunks;
umem->npgs = (u32)npgs;
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
- INIT_LIST_HEAD(&umem->xsk_tx_list);
- spin_lock_init(&umem->xsk_tx_list_lock);
+ INIT_LIST_HEAD(&umem->xsk_dma_list);
refcount_set(&umem->users, 1);
err = xdp_umem_account_pages(umem);
@@ -372,15 +208,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
goto out_account;
- umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
- headroom, size, unaligned_chunks);
- if (!umem->pool) {
- err = -ENOMEM;
- goto out_pin;
- }
+ err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs);
+ if (err)
+ goto out_unpin;
+
return 0;
-out_pin:
+out_unpin:
xdp_umem_unpin_pages(umem);
out_account:
xdp_umem_unaccount_pages(umem);
@@ -412,8 +246,3 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
return umem;
}
-
-bool xdp_umem_validate_queues(struct xdp_umem *umem)
-{
- return umem->fq && umem->cq;
-}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 32067fe98f65..181fdda2f2a8 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -8,14 +8,8 @@
#include <net/xdp_sock_drv.h>
-int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
- u16 queue_id, u16 flags);
-void xdp_umem_clear_dev(struct xdp_umem *umem);
-bool xdp_umem_validate_queues(struct xdp_umem *umem);
void xdp_get_umem(struct xdp_umem *umem);
void xdp_put_umem(struct xdp_umem *umem);
-void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
-void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c3231620d210..5eb6662f562a 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,68 +36,108 @@ static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
- READ_ONCE(xs->umem->fq);
+ (xs->pool->fq || READ_ONCE(xs->fq_tmp));
}
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
- if (umem->need_wakeup & XDP_WAKEUP_RX)
+ if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
return;
- umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
- umem->need_wakeup |= XDP_WAKEUP_RX;
+ pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+ pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
- if (umem->need_wakeup & XDP_WAKEUP_TX)
+ if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
return;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
- umem->need_wakeup |= XDP_WAKEUP_TX;
+ pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
- if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+ if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
return;
- umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
- umem->need_wakeup &= ~XDP_WAKEUP_RX;
+ pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
- if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+ if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
return;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
- umem->need_wakeup &= ~XDP_WAKEUP_TX;
+ pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
- return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+ return pool->uses_need_wakeup;
+}
+EXPORT_SYMBOL(xsk_uses_need_wakeup);
+
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+ u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ return dev->_rx[queue_id].pool;
+ if (queue_id < dev->real_num_tx_queues)
+ return dev->_tx[queue_id].pool;
+
+ return NULL;
+}
+EXPORT_SYMBOL(xsk_get_pool_from_qid);
+
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].pool = NULL;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].pool = NULL;
+}
+
+/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 queue_id)
+{
+ if (queue_id >= max_t(unsigned int,
+ dev->real_num_rx_queues,
+ dev->real_num_tx_queues))
+ return -EINVAL;
+
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].pool = pool;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].pool = pool;
+
+ return 0;
}
-EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
void xp_release(struct xdp_buff_xsk *xskb)
{
@@ -155,12 +195,12 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
struct xdp_buff *xsk_xdp;
int err;
- if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
+ if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
xs->rx_dropped++;
return -ENOSPC;
}
- xsk_xdp = xsk_buff_alloc(xs->umem);
+ xsk_xdp = xsk_buff_alloc(xs->pool);
if (!xsk_xdp) {
xs->rx_dropped++;
return -ENOSPC;
@@ -208,7 +248,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
static void xsk_flush(struct xdp_sock *xs)
{
xskq_prod_submit(xs->rx);
- __xskq_cons_release(xs->umem->fq);
+ __xskq_cons_release(xs->pool->fq);
sock_def_readable(&xs->sk);
}
@@ -249,32 +289,32 @@ void __xsk_map_flush(void)
}
}
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
- xskq_prod_submit_n(umem->cq, nb_entries);
+ xskq_prod_submit_n(pool->cq, nb_entries);
}
-EXPORT_SYMBOL(xsk_umem_complete_tx);
+EXPORT_SYMBOL(xsk_tx_completed);
-void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+void xsk_tx_release(struct xsk_buff_pool *pool)
{
struct xdp_sock *xs;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
__xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
}
-EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+EXPORT_SYMBOL(xsk_tx_release);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
+bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
struct xdp_sock *xs;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
- if (!xskq_cons_peek_desc(xs->tx, desc, umem)) {
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+ if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
xs->tx->queue_empty_descs++;
continue;
}
@@ -284,7 +324,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (xskq_prod_reserve_addr(umem->cq, desc->addr))
+ if (xskq_prod_reserve_addr(pool->cq, desc->addr))
goto out;
xskq_cons_release(xs->tx);
@@ -296,7 +336,7 @@ out:
rcu_read_unlock();
return false;
}
-EXPORT_SYMBOL(xsk_umem_consume_tx);
+EXPORT_SYMBOL(xsk_tx_peek_desc);
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
@@ -322,7 +362,7 @@ static void xsk_destruct_skb(struct sk_buff *skb)
unsigned long flags;
spin_lock_irqsave(&xs->tx_completion_lock, flags);
- xskq_prod_submit_addr(xs->umem->cq, addr);
+ xskq_prod_submit_addr(xs->pool->cq, addr);
spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
sock_wfree(skb);
@@ -342,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk)
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
+ while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
char *buffer;
u64 addr;
u32 len;
@@ -359,14 +399,14 @@ static int xsk_generic_xmit(struct sock *sk)
skb_put(skb, len);
addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->umem, addr);
+ buffer = xsk_buff_raw_get_data(xs->pool, addr);
err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
+ if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
kfree_skb(skb);
goto out;
}
@@ -431,16 +471,16 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
__poll_t mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
- struct xdp_umem *umem;
+ struct xsk_buff_pool *pool;
if (unlikely(!xsk_is_bound(xs)))
return mask;
- umem = xs->umem;
+ pool = xs->pool;
- if (umem->need_wakeup) {
+ if (pool->cached_need_wakeup) {
if (xs->zc)
- xsk_wakeup(xs, umem->need_wakeup);
+ xsk_wakeup(xs, pool->cached_need_wakeup);
else
/* Poll needs to drive Tx also in copy mode */
__xsk_sendmsg(sk);
@@ -481,7 +521,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
WRITE_ONCE(xs->state, XSK_UNBOUND);
/* Wait for driver to stop using the xdp socket. */
- xdp_del_sk_umem(xs->umem, xs);
+ xp_del_xsk(xs->pool, xs);
xs->dev = NULL;
synchronize_net();
dev_put(dev);
@@ -559,6 +599,8 @@ static int xsk_release(struct socket *sock)
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
+ xskq_destroy(xs->fq_tmp);
+ xskq_destroy(xs->cq_tmp);
sock_orphan(sk);
sock->sk = NULL;
@@ -586,6 +628,11 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
+static bool xsk_validate_queues(struct xdp_sock *xs)
+{
+ return xs->fq_tmp && xs->cq_tmp;
+}
+
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -654,29 +701,64 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
sockfd_put(sock);
goto out_unlock;
}
- if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
- err = -EINVAL;
- sockfd_put(sock);
- goto out_unlock;
+
+ if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+ /* Share the umem with another socket on another qid
+ * and/or device.
+ */
+ xs->pool = xp_create_and_assign_umem(xs,
+ umem_xs->umem);
+ if (!xs->pool) {
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
+ dev, qid);
+ if (err) {
+ xp_destroy(xs->pool);
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+ } else {
+ /* Share the buffer pool with the other socket. */
+ if (xs->fq_tmp || xs->cq_tmp) {
+ /* Do not allow setting your own fq or cq. */
+ err = -EINVAL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ xp_get_pool(umem_xs->pool);
+ xs->pool = umem_xs->pool;
}
xdp_get_umem(umem_xs->umem);
WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
- } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+ } else if (!xs->umem || !xsk_validate_queues(xs)) {
err = -EINVAL;
goto out_unlock;
} else {
/* This xsk has its own umem. */
- err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
- if (err)
+ xs->pool = xp_create_and_assign_umem(xs, xs->umem);
+ if (!xs->pool) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ err = xp_assign_dev(xs->pool, dev, qid, flags);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
goto out_unlock;
+ }
}
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
- xdp_add_sk_umem(xs->umem, xs);
+ xp_add_xsk(xs->pool, xs);
out_unlock:
if (err) {
@@ -782,16 +864,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
mutex_unlock(&xs->mutex);
return -EBUSY;
}
- if (!xs->umem) {
- mutex_unlock(&xs->mutex);
- return -EINVAL;
- }
- q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
- &xs->umem->cq;
+ q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
+ &xs->cq_tmp;
err = xsk_init_queue(entries, q, true);
- if (optname == XDP_UMEM_FILL_RING)
- xp_set_fq(xs->umem->pool, *q);
mutex_unlock(&xs->mutex);
return err;
}
@@ -858,7 +934,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
if (extra_stats) {
stats.rx_ring_full = xs->rx_queue_full;
stats.rx_fill_ring_empty_descs =
- xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0;
+ xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
} else {
stats.rx_dropped += xs->rx_queue_full;
@@ -960,7 +1036,6 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long size = vma->vm_end - vma->vm_start;
struct xdp_sock *xs = xdp_sk(sock->sk);
struct xsk_queue *q = NULL;
- struct xdp_umem *umem;
unsigned long pfn;
struct page *qpg;
@@ -972,16 +1047,12 @@ static int xsk_mmap(struct file *file, struct socket *sock,
} else if (offset == XDP_PGOFF_TX_RING) {
q = READ_ONCE(xs->tx);
} else {
- umem = READ_ONCE(xs->umem);
- if (!umem)
- return -EINVAL;
-
/* Matches the smp_wmb() in XDP_UMEM_REG */
smp_rmb();
if (offset == XDP_UMEM_PGOFF_FILL_RING)
- q = READ_ONCE(umem->fq);
+ q = READ_ONCE(xs->fq_tmp);
else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
- q = READ_ONCE(umem->cq);
+ q = READ_ONCE(xs->cq_tmp);
}
if (!q)
@@ -1019,8 +1090,8 @@ static int xsk_notifier(struct notifier_block *this,
xsk_unbind_dev(xs);
- /* Clear device references in umem. */
- xdp_umem_clear_dev(xs->umem);
+ /* Clear device references. */
+ xp_clear_dev(xs->pool);
}
mutex_unlock(&xs->mutex);
}
@@ -1064,7 +1135,7 @@ static void xsk_destruct(struct sock *sk)
if (!sock_flag(sk, SOCK_DEAD))
return;
- xdp_put_umem(xs->umem);
+ xp_put_pool(xs->pool);
sk_refcnt_debug_dec(sk);
}
@@ -1072,8 +1143,8 @@ static void xsk_destruct(struct sock *sk)
static int xsk_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
- struct sock *sk;
struct xdp_sock *xs;
+ struct sock *sk;
if (!ns_capable(net->user_ns, CAP_NET_RAW))
return -EPERM;
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 455ddd480f3d..da1f73e43924 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -11,13 +11,6 @@
#define XSK_NEXT_PG_CONTIG_SHIFT 0
#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
-/* Flags for the umem flags field.
- *
- * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
- * flags. See inlude/uapi/include/linux/if_xdp.h.
- */
-#define XDP_UMEM_USES_NEED_WAKEUP BIT(1)
-
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
@@ -51,5 +44,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
int xsk_map_inc(struct xsk_map *map);
void xsk_map_put(struct xsk_map *map);
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 queue_id);
#endif /* XSK_H_ */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index a2044c245215..795d7c81c0ca 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -2,21 +2,37 @@
#include <net/xsk_buff_pool.h>
#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/swiotlb.h>
#include "xsk_queue.h"
+#include "xdp_umem.h"
+#include "xsk.h"
-static void xp_addr_unmap(struct xsk_buff_pool *pool)
+void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- vunmap(pool->addrs);
+ unsigned long flags;
+
+ if (!xs->tx)
+ return;
+
+ spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ list_add_rcu(&xs->tx_list, &pool->xsk_tx_list);
+ spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
}
-static int xp_addr_map(struct xsk_buff_pool *pool,
- struct page **pages, u32 nr_pages)
+void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (!pool->addrs)
- return -ENOMEM;
- return 0;
+ unsigned long flags;
+
+ if (!xs->tx)
+ return;
+
+ spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ list_del_rcu(&xs->tx_list);
+ spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
}
void xp_destroy(struct xsk_buff_pool *pool)
@@ -24,59 +40,61 @@ void xp_destroy(struct xsk_buff_pool *pool)
if (!pool)
return;
- xp_addr_unmap(pool);
kvfree(pool->heads);
kvfree(pool);
}
-struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
- u32 chunk_size, u32 headroom, u64 size,
- bool unaligned)
+struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
+ struct xdp_umem *umem)
{
struct xsk_buff_pool *pool;
struct xdp_buff_xsk *xskb;
- int err;
u32 i;
- pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
+ pool = kvzalloc(struct_size(pool, free_heads, umem->chunks),
+ GFP_KERNEL);
if (!pool)
goto out;
- pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
+ pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL);
if (!pool->heads)
goto out;
- pool->chunk_mask = ~((u64)chunk_size - 1);
- pool->addrs_cnt = size;
- pool->heads_cnt = chunks;
- pool->free_heads_cnt = chunks;
- pool->headroom = headroom;
- pool->chunk_size = chunk_size;
- pool->unaligned = unaligned;
- pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
+ pool->chunk_mask = ~((u64)umem->chunk_size - 1);
+ pool->addrs_cnt = umem->size;
+ pool->heads_cnt = umem->chunks;
+ pool->free_heads_cnt = umem->chunks;
+ pool->headroom = umem->headroom;
+ pool->chunk_size = umem->chunk_size;
+ pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+ pool->frame_len = umem->chunk_size - umem->headroom -
+ XDP_PACKET_HEADROOM;
+ pool->umem = umem;
+ pool->addrs = umem->addrs;
INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->xsk_tx_list);
+ spin_lock_init(&pool->xsk_tx_list_lock);
+ refcount_set(&pool->users, 1);
+
+ pool->fq = xs->fq_tmp;
+ pool->cq = xs->cq_tmp;
+ xs->fq_tmp = NULL;
+ xs->cq_tmp = NULL;
for (i = 0; i < pool->free_heads_cnt; i++) {
xskb = &pool->heads[i];
xskb->pool = pool;
- xskb->xdp.frame_sz = chunk_size - headroom;
+ xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
pool->free_heads[i] = xskb;
}
- err = xp_addr_map(pool, pages, nr_pages);
- if (!err)
- return pool;
+ return pool;
out:
xp_destroy(pool);
return NULL;
}
-void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq)
-{
- pool->fq = fq;
-}
-
void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
{
u32 i;
@@ -86,70 +104,320 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
}
EXPORT_SYMBOL(xp_set_rxq_info);
-void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
{
- dma_addr_t *dma;
- u32 i;
+ struct netdev_bpf bpf;
+ int err;
- if (pool->dma_pages_cnt == 0)
+ ASSERT_RTNL();
+
+ if (pool->umem->zc) {
+ bpf.command = XDP_SETUP_XSK_POOL;
+ bpf.xsk.pool = NULL;
+ bpf.xsk.queue_id = pool->queue_id;
+
+ err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf);
+
+ if (err)
+ WARN(1, "Failed to disable zero-copy!\n");
+ }
+}
+
+static int __xp_assign_dev(struct xsk_buff_pool *pool,
+ struct net_device *netdev, u16 queue_id, u16 flags)
+{
+ bool force_zc, force_copy;
+ struct netdev_bpf bpf;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ force_zc = flags & XDP_ZEROCOPY;
+ force_copy = flags & XDP_COPY;
+
+ if (force_zc && force_copy)
+ return -EINVAL;
+
+ if (xsk_get_pool_from_qid(netdev, queue_id))
+ return -EBUSY;
+
+ pool->netdev = netdev;
+ pool->queue_id = queue_id;
+ err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
+ if (err)
+ return err;
+
+ if (flags & XDP_USE_NEED_WAKEUP) {
+ pool->uses_need_wakeup = true;
+ /* Tx needs to be explicitly woken up the first time.
+ * Also for supporting drivers that do not implement this
+ * feature. They will always have to call sendto().
+ */
+ pool->cached_need_wakeup = XDP_WAKEUP_TX;
+ }
+
+ dev_hold(netdev);
+
+ if (force_copy)
+ /* For copy-mode, we are done. */
+ return 0;
+
+ if (!netdev->netdev_ops->ndo_bpf ||
+ !netdev->netdev_ops->ndo_xsk_wakeup) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_pool;
+ }
+
+ bpf.command = XDP_SETUP_XSK_POOL;
+ bpf.xsk.pool = pool;
+ bpf.xsk.queue_id = queue_id;
+
+ err = netdev->netdev_ops->ndo_bpf(netdev, &bpf);
+ if (err)
+ goto err_unreg_pool;
+
+ if (!pool->dma_pages) {
+ WARN(1, "Driver did not DMA map zero-copy buffers");
+ goto err_unreg_xsk;
+ }
+ pool->umem->zc = true;
+ return 0;
+
+err_unreg_xsk:
+ xp_disable_drv_zc(pool);
+err_unreg_pool:
+ if (!force_zc)
+ err = 0; /* fallback to copy mode */
+ if (err)
+ xsk_clear_pool_at_qid(netdev, queue_id);
+ return err;
+}
+
+int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
+ u16 queue_id, u16 flags)
+{
+ return __xp_assign_dev(pool, dev, queue_id, flags);
+}
+
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+ struct net_device *dev, u16 queue_id)
+{
+ u16 flags;
+
+ /* One fill and completion ring required for each queue id. */
+ if (!pool->fq || !pool->cq)
+ return -EINVAL;
+
+ flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
+ if (pool->uses_need_wakeup)
+ flags |= XDP_USE_NEED_WAKEUP;
+
+ return __xp_assign_dev(pool, dev, queue_id, flags);
+}
+
+void xp_clear_dev(struct xsk_buff_pool *pool)
+{
+ if (!pool->netdev)
return;
- for (i = 0; i < pool->dma_pages_cnt; i++) {
- dma = &pool->dma_pages[i];
+ xp_disable_drv_zc(pool);
+ xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
+ dev_put(pool->netdev);
+ pool->netdev = NULL;
+}
+
+static void xp_release_deferred(struct work_struct *work)
+{
+ struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool,
+ work);
+
+ rtnl_lock();
+ xp_clear_dev(pool);
+ rtnl_unlock();
+
+ if (pool->fq) {
+ xskq_destroy(pool->fq);
+ pool->fq = NULL;
+ }
+
+ if (pool->cq) {
+ xskq_destroy(pool->cq);
+ pool->cq = NULL;
+ }
+
+ xdp_put_umem(pool->umem);
+ xp_destroy(pool);
+}
+
+void xp_get_pool(struct xsk_buff_pool *pool)
+{
+ refcount_inc(&pool->users);
+}
+
+void xp_put_pool(struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return;
+
+ if (refcount_dec_and_test(&pool->users)) {
+ INIT_WORK(&pool->work, xp_release_deferred);
+ schedule_work(&pool->work);
+ }
+}
+
+static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
+{
+ struct xsk_dma_map *dma_map;
+
+ list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) {
+ if (dma_map->netdev == pool->netdev)
+ return dma_map;
+ }
+
+ return NULL;
+}
+
+static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev,
+ u32 nr_pages, struct xdp_umem *umem)
+{
+ struct xsk_dma_map *dma_map;
+
+ dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
+ if (!dma_map)
+ return NULL;
+
+ dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL);
+ if (!dma_map) {
+ kfree(dma_map);
+ return NULL;
+ }
+
+ dma_map->netdev = netdev;
+ dma_map->dev = dev;
+ dma_map->dma_need_sync = false;
+ dma_map->dma_pages_cnt = nr_pages;
+ refcount_set(&dma_map->users, 0);
+ list_add(&dma_map->list, &umem->xsk_dma_list);
+ return dma_map;
+}
+
+static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
+{
+ list_del(&dma_map->list);
+ kvfree(dma_map->dma_pages);
+ kfree(dma_map);
+}
+
+static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs)
+{
+ dma_addr_t *dma;
+ u32 i;
+
+ for (i = 0; i < dma_map->dma_pages_cnt; i++) {
+ dma = &dma_map->dma_pages[i];
if (*dma) {
- dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE,
+ dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
*dma = 0;
}
}
+ xp_destroy_dma_map(dma_map);
+}
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+ struct xsk_dma_map *dma_map;
+
+ if (pool->dma_pages_cnt == 0)
+ return;
+
+ dma_map = xp_find_dma_map(pool);
+ if (!dma_map) {
+ WARN(1, "Could not find dma_map for device");
+ return;
+ }
+
+ if (!refcount_dec_and_test(&dma_map->users))
+ return;
+
+ __xp_dma_unmap(dma_map, attrs);
kvfree(pool->dma_pages);
pool->dma_pages_cnt = 0;
pool->dev = NULL;
}
EXPORT_SYMBOL(xp_dma_unmap);
-static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
+static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
{
u32 i;
- for (i = 0; i < pool->dma_pages_cnt - 1; i++) {
- if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1])
- pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+ for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
+ if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
+ dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
else
- pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+ dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}
}
+static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
+{
+ pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL);
+ if (!pool->dma_pages)
+ return -ENOMEM;
+
+ pool->dev = dma_map->dev;
+ pool->dma_pages_cnt = dma_map->dma_pages_cnt;
+ pool->dma_need_sync = dma_map->dma_need_sync;
+ refcount_inc(&dma_map->users);
+ memcpy(pool->dma_pages, dma_map->dma_pages,
+ pool->dma_pages_cnt * sizeof(*pool->dma_pages));
+
+ return 0;
+}
+
int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
unsigned long attrs, struct page **pages, u32 nr_pages)
{
+ struct xsk_dma_map *dma_map;
dma_addr_t dma;
+ int err;
u32 i;
- pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
- GFP_KERNEL);
- if (!pool->dma_pages)
- return -ENOMEM;
+ dma_map = xp_find_dma_map(pool);
+ if (dma_map) {
+ err = xp_init_dma_info(pool, dma_map);
+ if (err)
+ return err;
+
+ return 0;
+ }
- pool->dev = dev;
- pool->dma_pages_cnt = nr_pages;
- pool->dma_need_sync = false;
+ dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem);
+ if (!dma_map)
+ return -ENOMEM;
- for (i = 0; i < pool->dma_pages_cnt; i++) {
+ for (i = 0; i < dma_map->dma_pages_cnt; i++) {
dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
if (dma_mapping_error(dev, dma)) {
- xp_dma_unmap(pool, attrs);
+ __xp_dma_unmap(dma_map, attrs);
return -ENOMEM;
}
if (dma_need_sync(dev, dma))
- pool->dma_need_sync = true;
- pool->dma_pages[i] = dma;
+ dma_map->dma_need_sync = true;
+ dma_map->dma_pages[i] = dma;
}
if (pool->unaligned)
- xp_check_dma_contiguity(pool);
+ xp_check_dma_contiguity(dma_map);
+
+ err = xp_init_dma_info(pool, dma_map);
+ if (err) {
+ __xp_dma_unmap(dma_map, attrs);
+ return err;
+ }
+
return 0;
}
EXPORT_SYMBOL(xp_dma_map);
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 21e9c2d123ee..5bd8ea9d206a 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -46,6 +46,7 @@ static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs,
static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
{
+ struct xsk_buff_pool *pool = xs->pool;
struct xdp_umem *umem = xs->umem;
struct xdp_diag_umem du = {};
int err;
@@ -58,8 +59,8 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.num_pages = umem->npgs;
du.chunk_size = umem->chunk_size;
du.headroom = umem->headroom;
- du.ifindex = umem->dev ? umem->dev->ifindex : 0;
- du.queue_id = umem->queue_id;
+ du.ifindex = pool->netdev ? pool->netdev->ifindex : 0;
+ du.queue_id = pool->queue_id;
du.flags = 0;
if (umem->zc)
du.flags |= XDP_DU_F_ZEROCOPY;
@@ -67,10 +68,11 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du);
- if (!err && umem->fq)
- err = xsk_diag_put_ring(umem->fq, XDP_DIAG_UMEM_FILL_RING, nlskb);
- if (!err && umem->cq) {
- err = xsk_diag_put_ring(umem->cq, XDP_DIAG_UMEM_COMPLETION_RING,
+ if (!err && pool->fq)
+ err = xsk_diag_put_ring(pool->fq,
+ XDP_DIAG_UMEM_FILL_RING, nlskb);
+ if (!err && pool->cq) {
+ err = xsk_diag_put_ring(pool->cq, XDP_DIAG_UMEM_COMPLETION_RING,
nlskb);
}
return err;
@@ -83,7 +85,7 @@ static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.n_rx_dropped = xs->rx_dropped;
du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx);
du.n_rx_full = xs->rx_queue_full;
- du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0;
+ du.n_fill_ring_empty = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx);
du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx);
return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bf42cfd74b89..2d883f631c85 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -166,9 +166,9 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
struct xdp_desc *d,
- struct xdp_umem *umem)
+ struct xsk_buff_pool *pool)
{
- if (!xp_validate_desc(umem->pool, d)) {
+ if (!xp_validate_desc(pool, d)) {
q->invalid_descs++;
return false;
}
@@ -177,14 +177,14 @@ static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
static inline bool xskq_cons_read_desc(struct xsk_queue *q,
struct xdp_desc *desc,
- struct xdp_umem *umem)
+ struct xsk_buff_pool *pool)
{
while (q->cached_cons != q->cached_prod) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx = q->cached_cons & q->ring_mask;
*desc = ring->desc[idx];
- if (xskq_cons_is_valid_desc(q, desc, umem))
+ if (xskq_cons_is_valid_desc(q, desc, pool))
return true;
q->cached_cons++;
@@ -236,11 +236,11 @@ static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
struct xdp_desc *desc,
- struct xdp_umem *umem)
+ struct xsk_buff_pool *pool)
{
if (q->cached_prod == q->cached_cons)
xskq_cons_get_entries(q);
- return xskq_cons_read_desc(q, desc, umem);
+ return xskq_cons_read_desc(q, desc, pool);
}
static inline void xskq_cons_release(struct xsk_queue *q)
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 8367adbbe9df..2a4fd6677155 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -254,8 +254,16 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
spin_unlock_bh(&map->lock);
}
+static bool xsk_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ return meta0->max_entries == meta1->max_entries &&
+ bpf_map_meta_equal(meta0, meta1);
+}
+
static int xsk_map_btf_id;
const struct bpf_map_ops xsk_map_ops = {
+ .map_meta_equal = xsk_map_meta_equal,
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,