summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan_dev.c242
-rw-r--r--net/9p/trans_xen.c4
-rw-r--r--net/Kconfig20
-rw-r--r--net/Makefile3
-rw-r--r--net/bluetooth/hci_conn.c89
-rw-r--r--net/bluetooth/hci_event.c18
-rw-r--r--net/bluetooth/hci_sync.c13
-rw-r--r--net/bluetooth/hidp/core.c2
-rw-r--r--net/bluetooth/l2cap_core.c24
-rw-r--r--net/bluetooth/sco.c85
-rw-r--r--net/bpf/bpf_dummy_struct_ops.c14
-rw-r--r--net/bpf/test_run.c34
-rw-r--r--net/bridge/br_arp_nd_proxy.c33
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_forward.c8
-rw-r--r--net/bridge/br_if.c2
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_netfilter_hooks.c17
-rw-r--r--net/bridge/br_netlink.c8
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/br_switchdev.c11
-rw-r--r--net/bridge/br_vlan.c1
-rw-r--r--net/bridge/br_vlan_options.c20
-rw-r--r--net/can/isotp.c74
-rw-r--r--net/can/j1939/transport.c5
-rw-r--r--net/compat.c13
-rw-r--r--net/core/bpf_sk_storage.c24
-rw-r--r--net/core/dev.c14
-rw-r--r--net/core/dev_ioctl.c12
-rw-r--r--net/core/drop_monitor.c33
-rw-r--r--net/core/filter.c29
-rw-r--r--net/core/gro.c2
-rw-r--r--net/core/netpoll.c19
-rw-r--r--net/core/page_pool.c36
-rw-r--r--net/core/rtnetlink.c13
-rw-r--r--net/core/scm.c9
-rw-r--r--net/core/skbuff.c121
-rw-r--r--net/core/sock.c13
-rw-r--r--net/core/sock_map.c8
-rw-r--r--net/core/xdp.c29
-rw-r--r--net/dsa/Makefile12
-rw-r--r--net/dsa/dsa.c19
-rw-r--r--net/dsa/master.c2
-rw-r--r--net/dsa/master.h2
-rw-r--r--net/dsa/slave.c11
-rw-r--r--net/dsa/stubs.c10
-rw-r--r--net/dsa/switch.c85
-rw-r--r--net/dsa/trace.c39
-rw-r--r--net/dsa/trace.h447
-rw-r--r--net/ethtool/linkmodes.c7
-rw-r--r--net/ethtool/mm.c33
-rw-r--r--net/handshake/.kunitconfig11
-rw-r--r--net/handshake/Makefile13
-rw-r--r--net/handshake/genl.c58
-rw-r--r--net/handshake/genl.h24
-rw-r--r--net/handshake/handshake-test.c523
-rw-r--r--net/handshake/handshake.h87
-rw-r--r--net/handshake/netlink.c319
-rw-r--r--net/handshake/request.c344
-rw-r--r--net/handshake/tlshd.c418
-rw-r--r--net/handshake/trace.c20
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/bpf_tcp_ca.c23
-rw-r--r--net/ipv4/fou_bpf.c119
-rw-r--r--net/ipv4/fou_core.c5
-rw-r--r--net/ipv4/icmp.c5
-rw-r--r--net/ipv4/ip_tunnel.c22
-rw-r--r--net/ipv4/ipip.c1
-rw-r--r--net/ipv4/ping.c8
-rw-r--r--net/ipv4/raw.c36
-rw-r--r--net/ipv4/raw_diag.c10
-rw-r--r--net/ipv4/sysctl_net_ipv4.c3
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_cong.c66
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/ipv6/icmp.c15
-rw-r--r--net/ipv6/ip6_output.c7
-rw-r--r--net/ipv6/ipv6_sockglue.c1
-rw-r--r--net/ipv6/raw.c10
-rw-r--r--net/ipv6/rpl.c3
-rw-r--r--net/ipv6/sit.c2
-rw-r--r--net/ipv6/udp.c8
-rw-r--r--net/l2tp/l2tp_ip.c8
-rw-r--r--net/l2tp/l2tp_ip6.c8
-rw-r--r--net/mac80211/drop.h56
-rw-r--r--net/mac80211/ieee80211_i.h8
-rw-r--r--net/mac80211/main.c31
-rw-r--r--net/mac80211/rx.c55
-rw-r--r--net/mac80211/wpa.c24
-rw-r--r--net/mptcp/fastopen.c11
-rw-r--r--net/mptcp/options.c14
-rw-r--r--net/mptcp/pm.c4
-rw-r--r--net/mptcp/pm_netlink.c4
-rw-r--r--net/mptcp/pm_userspace.c4
-rw-r--r--net/mptcp/protocol.c173
-rw-r--r--net/mptcp/protocol.h8
-rw-r--r--net/mptcp/sockopt.c24
-rw-r--r--net/mptcp/subflow.c100
-rw-r--r--net/netfilter/nf_conntrack_bpf.c5
-rw-r--r--net/netfilter/nf_tables_api.c69
-rw-r--r--net/netfilter/nft_lookup.c36
-rw-r--r--net/netlink/af_netlink.c15
-rw-r--r--net/openvswitch/actions.c2
-rw-r--r--net/packet/af_packet.c95
-rw-r--r--net/packet/diag.c2
-rw-r--r--net/packet/internal.h2
-rw-r--r--net/qrtr/af_qrtr.c10
-rw-r--r--net/qrtr/ns.c15
-rw-r--r--net/sched/act_csum.c3
-rw-r--r--net/sched/cls_api.c3
-rw-r--r--net/sched/sch_mqprio.c188
-rw-r--r--net/sched/sch_mqprio_lib.c14
-rw-r--r--net/sched/sch_mqprio_lib.h2
-rw-r--r--net/sched/sch_qfq.c13
-rw-r--r--net/sched/sch_taprio.c77
-rw-r--r--net/sctp/associola.c5
-rw-r--r--net/sctp/auth.c2
-rw-r--r--net/sctp/input.c2
-rw-r--r--net/sctp/outqueue.c11
-rw-r--r--net/sctp/sm_make_chunk.c32
-rw-r--r--net/sctp/sm_sideeffect.c3
-rw-r--r--net/sctp/sm_statefuns.c14
-rw-r--r--net/sctp/socket.c9
-rw-r--r--net/sctp/stream.c2
-rw-r--r--net/sctp/stream_interleave.c5
-rw-r--r--net/smc/af_smc.c11
-rw-r--r--net/socket.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_test.c6
-rw-r--r--net/sunrpc/svcauth_unix.c17
-rw-r--r--net/sunrpc/xprtsock.c1
-rw-r--r--net/vmw_vsock/virtio_transport_common.c10
-rw-r--r--net/vmw_vsock/vmci_transport.c8
-rw-r--r--net/vmw_vsock/vsock_loopback.c3
-rw-r--r--net/xdp/xsk.c9
-rw-r--r--net/xdp/xsk_queue.h19
-rw-r--r--net/xdp/xskmap.c8
-rw-r--r--net/xfrm/xfrm_device.c2
-rw-r--r--net/xfrm/xfrm_input.c66
-rw-r--r--net/xfrm/xfrm_output.c33
140 files changed, 4471 insertions, 848 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 5920544e93e8..870e4935d6e6 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -26,6 +26,7 @@
#include <linux/ethtool.h>
#include <linux/phy.h>
#include <net/arp.h>
+#include <net/macsec.h>
#include "vlan.h"
#include "vlanproc.h"
@@ -572,6 +573,9 @@ static int vlan_dev_init(struct net_device *dev)
NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
NETIF_F_ALL_FCOE;
+ if (real_dev->vlan_features & NETIF_F_HW_MACSEC)
+ dev->hw_features |= NETIF_F_HW_MACSEC;
+
dev->features |= dev->hw_features | NETIF_F_LLTX;
netif_inherit_tso_max(dev, real_dev);
if (dev->features & NETIF_F_VLAN_FEATURES)
@@ -803,6 +807,241 @@ static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
return 0;
}
+#if IS_ENABLED(CONFIG_MACSEC)
+
+static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx)
+{
+ return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops;
+}
+
+static int vlan_macsec_offload(int (* const func)(struct macsec_context *),
+ struct macsec_context *ctx)
+{
+ if (unlikely(!func))
+ return 0;
+
+ return (*func)(ctx);
+}
+
+static int vlan_macsec_dev_open(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_dev_open, ctx);
+}
+
+static int vlan_macsec_dev_stop(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_dev_stop, ctx);
+}
+
+static int vlan_macsec_add_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_secy, ctx);
+}
+
+static int vlan_macsec_upd_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_secy, ctx);
+}
+
+static int vlan_macsec_del_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_secy, ctx);
+}
+
+static int vlan_macsec_add_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_rxsc, ctx);
+}
+
+static int vlan_macsec_upd_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx);
+}
+
+static int vlan_macsec_del_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_rxsc, ctx);
+}
+
+static int vlan_macsec_add_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_rxsa, ctx);
+}
+
+static int vlan_macsec_upd_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx);
+}
+
+static int vlan_macsec_del_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_rxsa, ctx);
+}
+
+static int vlan_macsec_add_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_txsa, ctx);
+}
+
+static int vlan_macsec_upd_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_txsa, ctx);
+}
+
+static int vlan_macsec_del_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_txsa, ctx);
+}
+
+static int vlan_macsec_get_dev_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx);
+}
+
+static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx);
+}
+
+static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx);
+}
+
+static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx);
+}
+
+static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx);
+}
+
+static const struct macsec_ops macsec_offload_ops = {
+ /* Device wide */
+ .mdo_dev_open = vlan_macsec_dev_open,
+ .mdo_dev_stop = vlan_macsec_dev_stop,
+ /* SecY */
+ .mdo_add_secy = vlan_macsec_add_secy,
+ .mdo_upd_secy = vlan_macsec_upd_secy,
+ .mdo_del_secy = vlan_macsec_del_secy,
+ /* Security channels */
+ .mdo_add_rxsc = vlan_macsec_add_rxsc,
+ .mdo_upd_rxsc = vlan_macsec_upd_rxsc,
+ .mdo_del_rxsc = vlan_macsec_del_rxsc,
+ /* Security associations */
+ .mdo_add_rxsa = vlan_macsec_add_rxsa,
+ .mdo_upd_rxsa = vlan_macsec_upd_rxsa,
+ .mdo_del_rxsa = vlan_macsec_del_rxsa,
+ .mdo_add_txsa = vlan_macsec_add_txsa,
+ .mdo_upd_txsa = vlan_macsec_upd_txsa,
+ .mdo_del_txsa = vlan_macsec_del_txsa,
+ /* Statistics */
+ .mdo_get_dev_stats = vlan_macsec_get_dev_stats,
+ .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats,
+ .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats,
+ .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats,
+ .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats,
+};
+
+#endif
+
static const struct ethtool_ops vlan_ethtool_ops = {
.get_link_ksettings = vlan_ethtool_get_link_ksettings,
.get_drvinfo = vlan_ethtool_get_drvinfo,
@@ -869,6 +1108,9 @@ void vlan_setup(struct net_device *dev)
dev->priv_destructor = vlan_dev_free;
dev->ethtool_ops = &vlan_ethtool_ops;
+#if IS_ENABLED(CONFIG_MACSEC)
+ dev->macsec_ops = &macsec_offload_ops;
+#endif
dev->min_mtu = 0;
dev->max_mtu = ETH_MAX_MTU;
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index c64050e839ac..1fffe2bed5b0 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -280,6 +280,10 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
write_unlock(&xen_9pfs_lock);
for (i = 0; i < priv->num_rings; i++) {
+ struct xen_9pfs_dataring *ring = &priv->rings[i];
+
+ cancel_work_sync(&ring->work);
+
if (!priv->rings[i].intf)
break;
if (priv->rings[i].irq > 0)
diff --git a/net/Kconfig b/net/Kconfig
index f806722bccf4..7d39c1773eb4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -68,6 +68,26 @@ source "net/iucv/Kconfig"
source "net/smc/Kconfig"
source "net/xdp/Kconfig"
+config NET_HANDSHAKE
+ bool
+ depends on SUNRPC || NVME_TARGET_TCP || NVME_TCP
+ default y
+
+config NET_HANDSHAKE_KUNIT_TEST
+ tristate "KUnit tests for the handshake upcall mechanism" if !KUNIT_ALL_TESTS
+ default KUNIT_ALL_TESTS
+ depends on KUNIT
+ help
+ This builds the KUnit tests for the handshake upcall mechanism.
+
+ KUnit tests run during boot and output the results to the debug
+ log in TAP format (https://testanything.org/). Only useful for
+ kernel devs running KUnit test harness and are not for inclusion
+ into a production build.
+
+ For more information on KUnit and unit tests in general, refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
config INET
bool "TCP/IP networking"
help
diff --git a/net/Makefile b/net/Makefile
index 0914bea9c335..4c4dc535453d 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
obj-$(CONFIG_BRIDGE) += bridge/
obj-$(CONFIG_NET_DEVLINK) += devlink/
-obj-$(CONFIG_NET_DSA) += dsa/
+obj-y += dsa/
obj-$(CONFIG_ATALK) += appletalk/
obj-$(CONFIG_X25) += x25/
obj-$(CONFIG_LAPB) += lapb/
@@ -79,3 +79,4 @@ obj-$(CONFIG_NET_NCSI) += ncsi/
obj-$(CONFIG_XDP_SOCKETS) += xdp/
obj-$(CONFIG_MPTCP) += mptcp/
obj-$(CONFIG_MCTP) += mctp/
+obj-$(CONFIG_NET_HANDSHAKE) += handshake/
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 17b946f9ba31..8455ba141ee6 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -68,7 +68,7 @@ static const struct sco_param esco_param_msbc[] = {
};
/* This function requires the caller holds hdev->lock */
-static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
+static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status)
{
struct hci_conn_params *params;
struct hci_dev *hdev = conn->hdev;
@@ -88,9 +88,28 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr,
bdaddr_type);
- if (!params || !params->explicit_connect)
+ if (!params)
return;
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ params->conn = NULL;
+ }
+
+ if (!params->explicit_connect)
+ return;
+
+ /* If the status indicates successful cancellation of
+ * the attempt (i.e. Unknown Connection Id) there's no point of
+ * notifying failure since we'll go back to keep trying to
+ * connect. The only exception is explicit connect requests
+ * where a timeout + cancel does indicate an actual failure.
+ */
+ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID)
+ mgmt_connect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, status);
+
/* The connection attempt was doing scan for new RPA, and is
* in scan phase. If params are not associated with any other
* autoconnect action, remove them completely. If they are, just unmark
@@ -178,7 +197,7 @@ static void le_scan_cleanup(struct work_struct *work)
rcu_read_unlock();
if (c == conn) {
- hci_connect_le_scan_cleanup(conn);
+ hci_connect_le_scan_cleanup(conn, 0x00);
hci_conn_cleanup(conn);
}
@@ -1049,6 +1068,17 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
return conn;
}
+static bool hci_conn_unlink(struct hci_conn *conn)
+{
+ if (!conn->link)
+ return false;
+
+ conn->link->link = NULL;
+ conn->link = NULL;
+
+ return true;
+}
+
int hci_conn_del(struct hci_conn *conn)
{
struct hci_dev *hdev = conn->hdev;
@@ -1060,15 +1090,16 @@ int hci_conn_del(struct hci_conn *conn)
cancel_delayed_work_sync(&conn->idle_work);
if (conn->type == ACL_LINK) {
- struct hci_conn *sco = conn->link;
- if (sco) {
- sco->link = NULL;
+ struct hci_conn *link = conn->link;
+
+ if (link) {
+ hci_conn_unlink(conn);
/* Due to race, SCO connection might be not established
* yet at this point. Delete it now, otherwise it is
* possible for it to be stuck and can't be deleted.
*/
- if (sco->handle == HCI_CONN_HANDLE_UNSET)
- hci_conn_del(sco);
+ if (link->handle == HCI_CONN_HANDLE_UNSET)
+ hci_conn_del(link);
}
/* Unacked frames */
@@ -1084,7 +1115,7 @@ int hci_conn_del(struct hci_conn *conn)
struct hci_conn *acl = conn->link;
if (acl) {
- acl->link = NULL;
+ hci_conn_unlink(conn);
hci_conn_drop(acl);
}
@@ -1179,31 +1210,8 @@ EXPORT_SYMBOL(hci_get_route);
static void hci_le_conn_failed(struct hci_conn *conn, u8 status)
{
struct hci_dev *hdev = conn->hdev;
- struct hci_conn_params *params;
- params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
- conn->dst_type);
- if (params && params->conn) {
- hci_conn_drop(params->conn);
- hci_conn_put(params->conn);
- params->conn = NULL;
- }
-
- /* If the status indicates successful cancellation of
- * the attempt (i.e. Unknown Connection Id) there's no point of
- * notifying failure since we'll go back to keep trying to
- * connect. The only exception is explicit connect requests
- * where a timeout + cancel does indicate an actual failure.
- */
- if (status != HCI_ERROR_UNKNOWN_CONN_ID ||
- (params && params->explicit_connect))
- mgmt_connect_failed(hdev, &conn->dst, conn->type,
- conn->dst_type, status);
-
- /* Since we may have temporarily stopped the background scanning in
- * favor of connection establishment, we should restart it.
- */
- hci_update_passive_scan(hdev);
+ hci_connect_le_scan_cleanup(conn, status);
/* Enable advertising in case this was a failed connection
* attempt as a peripheral.
@@ -1237,15 +1245,15 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
{
struct hci_conn *conn = data;
+ bt_dev_dbg(hdev, "err %d", err);
+
hci_dev_lock(hdev);
if (!err) {
- hci_connect_le_scan_cleanup(conn);
+ hci_connect_le_scan_cleanup(conn, 0x00);
goto done;
}
- bt_dev_err(hdev, "request failed to create LE connection: err %d", err);
-
/* Check if connection is still pending */
if (conn != hci_lookup_le_connect(hdev))
goto done;
@@ -2438,6 +2446,12 @@ void hci_conn_hash_flush(struct hci_dev *hdev)
c->state = BT_CLOSED;
hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM);
+
+ /* Unlink before deleting otherwise it is possible that
+ * hci_conn_del removes the link which may cause the list to
+ * contain items already freed.
+ */
+ hci_conn_unlink(c);
hci_conn_del(c);
}
}
@@ -2775,6 +2789,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
{
int r = 0;
+ if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags))
+ return 0;
+
switch (conn->state) {
case BT_CONNECTED:
case BT_CONFIG:
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ad92a4be5851..e87c928c9e17 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2881,16 +2881,6 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
conn->resp_addr_type = peer_addr_type;
bacpy(&conn->resp_addr, peer_addr);
-
- /* We don't want the connection attempt to stick around
- * indefinitely since LE doesn't have a page timeout concept
- * like BR/EDR. Set a timer for any connection that doesn't use
- * the accept list for connecting.
- */
- if (filter_policy == HCI_LE_USE_PEER_ADDR)
- queue_delayed_work(conn->hdev->workqueue,
- &conn->le_conn_timeout,
- conn->conn_timeout);
}
static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
@@ -5902,6 +5892,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
if (status)
goto unlock;
+ /* Drop the connection if it has been aborted */
+ if (test_bit(HCI_CONN_CANCEL, &conn->flags)) {
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
if (conn->dst_type == ADDR_LE_DEV_PUBLIC)
addr_type = BDADDR_LE_PUBLIC;
else
@@ -6995,7 +6991,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
bis->iso_qos.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
bis->iso_qos.in.sdu = le16_to_cpu(ev->max_pdu);
- hci_connect_cfm(bis, ev->status);
+ hci_iso_setup_path(bis);
}
hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 5a6aa1627791..632be1267288 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -246,8 +246,9 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk);
if (IS_ERR(skb)) {
- bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode,
- PTR_ERR(skb));
+ if (!event)
+ bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode,
+ PTR_ERR(skb));
return PTR_ERR(skb);
}
@@ -5126,8 +5127,11 @@ static int hci_le_connect_cancel_sync(struct hci_dev *hdev,
if (test_bit(HCI_CONN_SCANNING, &conn->flags))
return 0;
+ if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags))
+ return 0;
+
return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL,
- 6, &conn->dst, HCI_CMD_TIMEOUT);
+ 0, NULL, HCI_CMD_TIMEOUT);
}
static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn)
@@ -6102,6 +6106,9 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn)
conn->conn_timeout, NULL);
done:
+ if (err == -ETIMEDOUT)
+ hci_le_connect_cancel_sync(hdev, conn);
+
/* Re-enable advertising after the connection attempt is finished. */
hci_resume_advertising_sync(hdev);
return err;
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index bed1a7b9205c..707f229f896a 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -433,7 +433,7 @@ static void hidp_set_timer(struct hidp_session *session)
static void hidp_del_timer(struct hidp_session *session)
{
if (session->idle_to > 0)
- del_timer(&session->timer);
+ del_timer_sync(&session->timer);
}
static void hidp_process_report(struct hidp_session *session, int type,
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 49926f59cc12..55a7226233f9 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -4652,33 +4652,27 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid);
- mutex_lock(&conn->chan_lock);
-
- chan = __l2cap_get_chan_by_scid(conn, dcid);
+ chan = l2cap_get_chan_by_scid(conn, dcid);
if (!chan) {
- mutex_unlock(&conn->chan_lock);
cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid);
return 0;
}
- l2cap_chan_hold(chan);
- l2cap_chan_lock(chan);
-
rsp.dcid = cpu_to_le16(chan->scid);
rsp.scid = cpu_to_le16(chan->dcid);
l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp);
chan->ops->set_shutdown(chan);
+ mutex_lock(&conn->chan_lock);
l2cap_chan_del(chan, ECONNRESET);
+ mutex_unlock(&conn->chan_lock);
chan->ops->close(chan);
l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
- mutex_unlock(&conn->chan_lock);
-
return 0;
}
@@ -4698,33 +4692,27 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid);
- mutex_lock(&conn->chan_lock);
-
- chan = __l2cap_get_chan_by_scid(conn, scid);
+ chan = l2cap_get_chan_by_scid(conn, scid);
if (!chan) {
mutex_unlock(&conn->chan_lock);
return 0;
}
- l2cap_chan_hold(chan);
- l2cap_chan_lock(chan);
-
if (chan->state != BT_DISCONN) {
l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
- mutex_unlock(&conn->chan_lock);
return 0;
}
+ mutex_lock(&conn->chan_lock);
l2cap_chan_del(chan, 0);
+ mutex_unlock(&conn->chan_lock);
chan->ops->close(chan);
l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
- mutex_unlock(&conn->chan_lock);
-
return 0;
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 1111da4e2f2b..cd1a27ac555d 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -235,27 +235,41 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
return err;
}
-static int sco_connect(struct hci_dev *hdev, struct sock *sk)
+static int sco_connect(struct sock *sk)
{
struct sco_conn *conn;
struct hci_conn *hcon;
+ struct hci_dev *hdev;
int err, type;
BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
if (lmp_esco_capable(hdev) && !disable_esco)
type = ESCO_LINK;
else
type = SCO_LINK;
if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT &&
- (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)))
- return -EOPNOTSUPP;
+ (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
sco_pi(sk)->setting, &sco_pi(sk)->codec);
- if (IS_ERR(hcon))
- return PTR_ERR(hcon);
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
conn = sco_conn_add(hcon);
if (!conn) {
@@ -263,13 +277,15 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk)
return -ENOMEM;
}
- /* Update source addr of the socket */
- bacpy(&sco_pi(sk)->src, &hcon->src);
-
err = sco_chan_add(conn, sk, NULL);
if (err)
return err;
+ lock_sock(sk);
+
+ /* Update source addr of the socket */
+ bacpy(&sco_pi(sk)->src, &hcon->src);
+
if (hcon->state == BT_CONNECTED) {
sco_sock_clear_timer(sk);
sk->sk_state = BT_CONNECTED;
@@ -278,6 +294,13 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk)
sco_sock_set_timer(sk, sk->sk_sndtimeo);
}
+ release_sock(sk);
+
+ return err;
+
+unlock:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
return err;
}
@@ -565,7 +588,6 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
- struct hci_dev *hdev;
int err;
BT_DBG("sk %p", sk);
@@ -574,37 +596,26 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen
addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
- lock_sock(sk);
- if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
- err = -EBADFD;
- goto done;
- }
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND)
+ return -EBADFD;
- if (sk->sk_type != SOCK_SEQPACKET) {
+ if (sk->sk_type != SOCK_SEQPACKET)
err = -EINVAL;
- goto done;
- }
-
- hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR);
- if (!hdev) {
- err = -EHOSTUNREACH;
- goto done;
- }
- hci_dev_lock(hdev);
+ lock_sock(sk);
/* Set destination address and psm */
bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr);
+ release_sock(sk);
- err = sco_connect(hdev, sk);
- hci_dev_unlock(hdev);
- hci_dev_put(hdev);
+ err = sco_connect(sk);
if (err)
- goto done;
+ return err;
+
+ lock_sock(sk);
err = bt_sock_wait_state(sk, BT_CONNECTED,
sock_sndtimeo(sk, flags & O_NONBLOCK));
-done:
release_sock(sk);
return err;
}
@@ -1129,6 +1140,8 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
break;
}
+ release_sock(sk);
+
/* find total buffer size required to copy codec + caps */
hci_dev_lock(hdev);
list_for_each_entry(c, &hdev->local_codecs, list) {
@@ -1146,15 +1159,13 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
buf_len += sizeof(struct bt_codecs);
if (buf_len > len) {
hci_dev_put(hdev);
- err = -ENOBUFS;
- break;
+ return -ENOBUFS;
}
ptr = optval;
if (put_user(num_codecs, ptr)) {
hci_dev_put(hdev);
- err = -EFAULT;
- break;
+ return -EFAULT;
}
ptr += sizeof(num_codecs);
@@ -1194,12 +1205,14 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
ptr += len;
}
- if (!err && put_user(buf_len, optlen))
- err = -EFAULT;
-
hci_dev_unlock(hdev);
hci_dev_put(hdev);
+ lock_sock(sk);
+
+ if (!err && put_user(buf_len, optlen))
+ err = -EFAULT;
+
break;
default:
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ff4f89a2b02a..5918d1b32e19 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -173,14 +173,11 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t,
static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id,
- enum bpf_type_flag *flag)
+ int off, int size)
{
const struct btf_type *state;
const struct btf_type *t;
s32 type_id;
- int err;
type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
BTF_KIND_STRUCT);
@@ -194,11 +191,12 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
return -EACCES;
}
- err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
- if (err < 0)
- return err;
+ if (off + size > sizeof(struct bpf_dummy_ops_state)) {
+ bpf_log(log, "write access at off %d with size %d\n", off, size);
+ return -EACCES;
+ }
- return atype == BPF_READ ? err : NOT_INIT;
+ return NOT_INIT;
}
static const struct bpf_verifier_ops bpf_dummy_verifier_ops = {
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index d350f31c7a3d..0b9bd9b39990 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -215,6 +215,16 @@ static void xdp_test_run_teardown(struct xdp_test_data *xdp)
kfree(xdp->skbs);
}
+static bool frame_was_changed(const struct xdp_page_head *head)
+{
+ /* xdp_scrub_frame() zeroes the data pointer, flags is the last field,
+ * i.e. has the highest chances to be overwritten. If those two are
+ * untouched, it's most likely safe to skip the context reset.
+ */
+ return head->frame->data != head->orig_ctx.data ||
+ head->frame->flags != head->orig_ctx.flags;
+}
+
static bool ctx_was_changed(struct xdp_page_head *head)
{
return head->orig_ctx.data != head->ctx.data ||
@@ -224,7 +234,7 @@ static bool ctx_was_changed(struct xdp_page_head *head)
static void reset_ctx(struct xdp_page_head *head)
{
- if (likely(!ctx_was_changed(head)))
+ if (likely(!frame_was_changed(head) && !ctx_was_changed(head)))
return;
head->ctx.data = head->orig_ctx.data;
@@ -538,6 +548,11 @@ int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg)
return (long)arg->a;
}
+__bpf_kfunc u32 bpf_fentry_test9(u32 *a)
+{
+ return *a;
+}
+
__bpf_kfunc int bpf_modify_return_test(int a, int *b)
{
*b += 1;
@@ -567,6 +582,11 @@ long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d)
return (long)a + (long)b + (long)c + d;
}
+int noinline bpf_fentry_shadow_test(int a)
+{
+ return a + 1;
+}
+
struct prog_test_member1 {
int a;
};
@@ -598,6 +618,11 @@ bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
return &prog_test_struct;
}
+__bpf_kfunc void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p)
+{
+ WARN_ON_ONCE(1);
+}
+
__bpf_kfunc struct prog_test_member *
bpf_kfunc_call_memb_acquire(void)
{
@@ -607,9 +632,6 @@ bpf_kfunc_call_memb_acquire(void)
__bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
{
- if (!p)
- return;
-
refcount_dec(&p->cnt);
}
@@ -795,6 +817,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
BTF_SET8_END(test_sk_check_kfunc_ids)
static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
@@ -844,7 +867,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 ||
bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 ||
- bpf_fentry_test8(&arg) != 0)
+ bpf_fentry_test8(&arg) != 0 ||
+ bpf_fentry_test9(&retval) != 0)
goto out;
break;
case BPF_MODIFY_RETURN:
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index b45c00c01dea..c7869a286df4 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -30,7 +30,7 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
bool neigh_suppress = false;
list_for_each_entry(p, &br->port_list, list) {
- if (p->flags & BR_NEIGH_SUPPRESS) {
+ if (p->flags & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) {
neigh_suppress = true;
break;
}
@@ -158,7 +158,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
return;
if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
- if (p && (p->flags & BR_NEIGH_SUPPRESS))
+ if (br_is_neigh_suppress_enabled(p, vid))
return;
if (parp->ar_op != htons(ARPOP_RREQUEST) &&
parp->ar_op != htons(ARPOP_RREPLY) &&
@@ -202,8 +202,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
bool replied = false;
if ((p && (p->flags & BR_PROXYARP)) ||
- (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI |
- BR_NEIGH_SUPPRESS)))) {
+ (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)) ||
+ br_is_neigh_suppress_enabled(f->dst, vid)) {
if (!vid)
br_arp_send(br, p, skb->dev, sip, tip,
sha, n->ha, sha, 0, 0);
@@ -407,7 +407,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
- if (p && (p->flags & BR_NEIGH_SUPPRESS))
+ if (br_is_neigh_suppress_enabled(p, vid))
return;
if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
@@ -461,7 +461,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (f) {
bool replied = false;
- if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) {
+ if (br_is_neigh_suppress_enabled(f->dst, vid)) {
if (vid != 0)
br_nd_send(br, p, skb, n,
skb->vlan_proto,
@@ -483,3 +483,24 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
}
}
#endif
+
+bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid)
+{
+ if (!p)
+ return false;
+
+ if (!vid)
+ return !!(p->flags & BR_NEIGH_SUPPRESS);
+
+ if (p->flags & BR_NEIGH_VLAN_SUPPRESS) {
+ struct net_bridge_vlan_group *vg = nbp_vlan_group_rcu(p);
+ struct net_bridge_vlan *v;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return false;
+ return !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED);
+ } else {
+ return !!(p->flags & BR_NEIGH_SUPPRESS);
+ }
+}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index df47c876230e..8eca8a5c80c6 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -80,10 +80,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
dest = eth_hdr(skb)->h_dest;
if (is_broadcast_ether_addr(dest)) {
- br_flood(br, skb, BR_PKT_BROADCAST, false, true);
+ br_flood(br, skb, BR_PKT_BROADCAST, false, true, vid);
} else if (is_multicast_ether_addr(dest)) {
if (unlikely(netpoll_tx_running(dev))) {
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid);
goto out;
}
if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) {
@@ -96,11 +96,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst))
br_multicast_flood(mdst, skb, brmctx, false, true);
else
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid);
} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
br_forward(dst->dst, skb, false, true);
} else {
- br_flood(br, skb, BR_PKT_UNICAST, false, true);
+ br_flood(br, skb, BR_PKT_UNICAST, false, true, vid);
}
out:
rcu_read_unlock();
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 02bb620d3b8d..57744704ff69 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -197,7 +197,8 @@ out:
/* called under rcu_read_lock */
void br_flood(struct net_bridge *br, struct sk_buff *skb,
- enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
+ u16 vid)
{
struct net_bridge_port *prev = NULL;
struct net_bridge_port *p;
@@ -224,8 +225,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
/* Do not flood to ports that enable proxy ARP */
if (p->flags & BR_PROXYARP)
continue;
- if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) &&
- BR_INPUT_SKB_CB(skb)->proxyarp_replied)
+ if (BR_INPUT_SKB_CB(skb)->proxyarp_replied &&
+ ((p->flags & BR_PROXYARP_WIFI) ||
+ br_is_neigh_suppress_enabled(p, vid)))
continue;
prev = maybe_deliver(prev, p, skb, local_orig);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 24f01ff113f0..3f04b40f6056 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -759,7 +759,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
if (mask & BR_AUTO_MASK)
nbp_update_port_count(br);
- if (mask & BR_NEIGH_SUPPRESS)
+ if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS))
br_recalculate_neigh_suppress_enabled(br);
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 3027e8f6be15..fc17b9fd93e6 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -207,7 +207,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
br_forward(dst->dst, skb, local_rcv, false);
} else {
if (!mcast_hit)
- br_flood(br, skb, pkt_type, local_rcv, false);
+ br_flood(br, skb, pkt_type, local_rcv, false, vid);
else
br_multicast_flood(mdst, skb, brmctx, local_rcv, false);
}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 3e3065bc0465..1a801fab9543 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -869,12 +869,17 @@ static unsigned int ip_sabotage_in(void *priv,
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
- if (nf_bridge && !nf_bridge->in_prerouting &&
- !netif_is_l3_master(skb->dev) &&
- !netif_is_l3_slave(skb->dev)) {
- nf_bridge_info_free(skb);
- state->okfn(state->net, state->sk, skb);
- return NF_STOLEN;
+ if (nf_bridge) {
+ if (nf_bridge->sabotage_in_done)
+ return NF_ACCEPT;
+
+ if (!nf_bridge->in_prerouting &&
+ !netif_is_l3_master(skb->dev) &&
+ !netif_is_l3_slave(skb->dev)) {
+ nf_bridge->sabotage_in_done = 1;
+ state->okfn(state->net, state->sk, skb);
+ return NF_STOLEN;
+ }
}
return NF_ACCEPT;
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fefb1c0e248b..05c5863d2e20 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -189,6 +189,7 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
+ nla_total_size(1) /* IFLA_BRPORT_LOCKED */
+ nla_total_size(1) /* IFLA_BRPORT_MAB */
+ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -278,7 +279,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
!!(p->flags & BR_MRP_LOST_IN_CONT)) ||
nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
- nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)))
+ nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) ||
+ nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+ !!(p->flags & BR_NEIGH_VLAN_SUPPRESS)))
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
@@ -891,6 +894,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
[IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
[IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+ [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
};
/* Change the state of the port and notify spanning tree */
@@ -957,6 +961,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
+ br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+ BR_NEIGH_VLAN_SUPPRESS);
if ((p->flags & BR_PORT_MAB) &&
(!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 7264fd40f82f..2119729ded2b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -178,6 +178,7 @@ enum {
BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1),
BR_VLFLAG_MCAST_ENABLED = BIT(2),
BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3),
+ BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4),
};
/**
@@ -849,7 +850,8 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
bool local_rcv, bool local_orig);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood(struct net_bridge *br, struct sk_buff *skb,
- enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
+ u16 vid);
/* return true if both source port and dest port are isolated */
static inline bool br_skb_isolated(const struct net_bridge_port *to,
@@ -2218,4 +2220,5 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
+bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid);
#endif
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index de18e9c1d7a7..ba95c4d74a60 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -148,6 +148,17 @@ br_switchdev_fdb_notify(struct net_bridge *br,
if (test_bit(BR_FDB_LOCKED, &fdb->flags))
return;
+ /* Entries with these flags were created using ndm_state == NUD_REACHABLE,
+ * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something
+ * equivalent to 'bridge fdb add ... master dynamic (sticky)'.
+ * Drivers don't know how to deal with these, so don't notify them to
+ * avoid confusing them.
+ */
+ if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) &&
+ !test_bit(BR_FDB_STATIC, &fdb->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
+ return;
+
br_switchdev_fdb_populate(br, &item, fdb, NULL);
switch (type) {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8a3dbc09ba38..15f44d026e75 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -2134,6 +2134,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 },
[BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT },
[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
};
static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index e378c2f3a9e2..8fa89b04ee94 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -52,7 +52,9 @@ bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
const struct net_bridge_port *p)
{
if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
- !__vlan_tun_put(skb, v))
+ !__vlan_tun_put(skb, v) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
+ !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED)))
return false;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -80,6 +82,7 @@ size_t br_vlan_opts_nl_size(void)
+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */
+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */
#endif
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */
+ 0;
}
@@ -239,6 +242,21 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
}
#endif
+ if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) {
+ bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+ bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]);
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans");
+ return -EINVAL;
+ }
+
+ if (val != enabled) {
+ v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+ *changed = true;
+ }
+ }
+
return 0;
}
diff --git a/net/can/isotp.c b/net/can/isotp.c
index f1ea91772ae7..a750259cb79c 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -130,7 +130,8 @@ enum {
ISOTP_WAIT_FIRST_FC,
ISOTP_WAIT_FC,
ISOTP_WAIT_DATA,
- ISOTP_SENDING
+ ISOTP_SENDING,
+ ISOTP_SHUTDOWN,
};
struct tpcon {
@@ -903,8 +904,8 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
txtimer);
struct sock *sk = &so->sk;
- /* don't handle timeouts in IDLE state */
- if (so->tx.state == ISOTP_IDLE)
+ /* don't handle timeouts in IDLE or SHUTDOWN state */
+ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN)
return HRTIMER_NORESTART;
/* we did not get any flow control or echo frame in time */
@@ -941,7 +942,6 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
struct isotp_sock *so = isotp_sk(sk);
- u32 old_state = so->tx.state;
struct sk_buff *skb;
struct net_device *dev;
struct canfd_frame *cf;
@@ -951,23 +951,24 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
int off;
int err;
- if (!so->bound)
+ if (!so->bound || so->tx.state == ISOTP_SHUTDOWN)
return -EADDRNOTAVAIL;
+wait_free_buffer:
/* we do not support multiple buffers - for now */
- if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE ||
- wq_has_sleeper(&so->wait)) {
- if (msg->msg_flags & MSG_DONTWAIT) {
- err = -EAGAIN;
- goto err_out;
- }
+ if (wq_has_sleeper(&so->wait) && (msg->msg_flags & MSG_DONTWAIT))
+ return -EAGAIN;
- /* wait for complete transmission of current pdu */
- err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
- if (err)
- goto err_out;
+ /* wait for complete transmission of current pdu */
+ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ if (err)
+ goto err_event_drop;
- so->tx.state = ISOTP_SENDING;
+ if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) {
+ if (so->tx.state == ISOTP_SHUTDOWN)
+ return -EADDRNOTAVAIL;
+
+ goto wait_free_buffer;
}
/* PDU size > default => try max_pdu_size */
@@ -1107,7 +1108,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
if (wait_tx_done) {
/* wait for complete transmission of current pdu */
- wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ if (err)
+ goto err_event_drop;
if (sk->sk_err)
return -sk->sk_err;
@@ -1115,13 +1118,15 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
return size;
+err_event_drop:
+ /* got signal: force tx state machine to be idle */
+ so->tx.state = ISOTP_IDLE;
+ hrtimer_cancel(&so->txfrtimer);
+ hrtimer_cancel(&so->txtimer);
err_out_drop:
/* drop this PDU and unlock a potential wait queue */
- old_state = ISOTP_IDLE;
-err_out:
- so->tx.state = old_state;
- if (so->tx.state == ISOTP_IDLE)
- wake_up_interruptible(&so->wait);
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
return err;
}
@@ -1153,7 +1158,7 @@ static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (ret < 0)
goto out_err;
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (msg->msg_name) {
__sockaddr_check_size(ISOTP_MIN_NAMELEN);
@@ -1183,10 +1188,12 @@ static int isotp_release(struct socket *sock)
net = sock_net(sk);
/* wait for complete transmission of current pdu */
- wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 &&
+ cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE)
+ ;
/* force state machines to be idle also when a signal occurred */
- so->tx.state = ISOTP_IDLE;
+ so->tx.state = ISOTP_SHUTDOWN;
so->rx.state = ISOTP_IDLE;
spin_lock(&isotp_notifier_lock);
@@ -1652,6 +1659,21 @@ static int isotp_init(struct sock *sk)
return 0;
}
+static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+
+ __poll_t mask = datagram_poll(file, sock, wait);
+ poll_wait(file, &so->wait, wait);
+
+ /* Check for false positives due to TX state */
+ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE))
+ mask &= ~(EPOLLOUT | EPOLLWRNORM);
+
+ return mask;
+}
+
static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
@@ -1667,7 +1689,7 @@ static const struct proto_ops isotp_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = isotp_getname,
- .poll = datagram_poll,
+ .poll = isotp_poll,
.ioctl = isotp_sock_no_ioctlcmd,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index fb92c3609e17..fe3df23a2595 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -604,7 +604,10 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
/* reserve CAN header */
skb_reserve(skb, offsetof(struct can_frame, data));
- memcpy(skb->cb, re_skcb, sizeof(skb->cb));
+ /* skb->cb must be large enough to hold a j1939_sk_buff_cb structure */
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*re_skcb));
+
+ memcpy(skb->cb, re_skcb, sizeof(*re_skcb));
skcb = j1939_skb_to_cb(skb);
if (swap_src_dst)
j1939_skbcb_swap(skcb);
diff --git a/net/compat.c b/net/compat.c
index 161b7bea1f62..6564720f32b7 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -113,7 +113,7 @@ int get_compat_msghdr(struct msghdr *kmsg,
#define CMSG_COMPAT_FIRSTHDR(msg) \
(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
- (struct compat_cmsghdr __user *)((msg)->msg_control) : \
+ (struct compat_cmsghdr __user *)((msg)->msg_control_user) : \
(struct compat_cmsghdr __user *)NULL)
#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \
@@ -126,7 +126,7 @@ static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *ms
struct compat_cmsghdr __user *cmsg, int cmsg_len)
{
char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len);
- if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) >
+ if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control_user) >
msg->msg_controllen)
return NULL;
return (struct compat_cmsghdr __user *)ptr;
@@ -211,6 +211,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
goto Einval;
/* Ok, looks like we made it. Hook it up and return success. */
+ kmsg->msg_control_is_user = false;
kmsg->msg_control = kcmsg_base;
kmsg->msg_controllen = kcmlen;
return 0;
@@ -225,7 +226,7 @@ Efault:
int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
{
- struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+ struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control_user;
struct compat_cmsghdr cmhdr;
struct old_timeval32 ctv;
struct old_timespec32 cts[3];
@@ -274,7 +275,7 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
cmlen = CMSG_COMPAT_SPACE(len);
if (kmsg->msg_controllen < cmlen)
cmlen = kmsg->msg_controllen;
- kmsg->msg_control += cmlen;
+ kmsg->msg_control_user += cmlen;
kmsg->msg_controllen -= cmlen;
return 0;
}
@@ -289,7 +290,7 @@ static int scm_max_fds_compat(struct msghdr *msg)
void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
{
struct compat_cmsghdr __user *cm =
- (struct compat_cmsghdr __user *)msg->msg_control;
+ (struct compat_cmsghdr __user *)msg->msg_control_user;
unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
int fdmax = min_t(int, scm_max_fds_compat(msg), scm->fp->count);
int __user *cmsg_data = CMSG_COMPAT_DATA(cm);
@@ -313,7 +314,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
cmlen = CMSG_COMPAT_SPACE(i * sizeof(int));
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
- msg->msg_control += cmlen;
+ msg->msg_control_user += cmlen;
msg->msg_controllen -= cmlen;
}
}
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 7a36353dbc22..d4172534dfa8 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
@@ -49,7 +49,6 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
- bool free_sk_storage = false;
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -58,13 +57,8 @@ void bpf_sk_storage_free(struct sock *sk)
return;
}
- raw_spin_lock_bh(&sk_storage->lock);
- free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage);
- raw_spin_unlock_bh(&sk_storage->lock);
+ bpf_local_storage_destroy(sk_storage);
rcu_read_unlock();
-
- if (free_sk_storage)
- kfree_rcu(sk_storage, rcu);
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -74,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &sk_cache);
+ return bpf_local_storage_map_alloc(attr, &sk_cache, false);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
@@ -100,8 +94,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(err);
}
-static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct socket *sock;
@@ -120,7 +114,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
return err;
}
-static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
{
struct socket *sock;
int fd, err;
@@ -203,7 +197,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
} else {
ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
if (ret) {
- kfree(copy_selem);
+ bpf_selem_free(copy_selem, smap, true);
atomic_sub(smap->elem_size,
&newsk->sk_omem_alloc);
bpf_map_put(map);
@@ -418,7 +412,7 @@ const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -430,7 +424,7 @@ const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
.allowed = bpf_sk_storage_tracing_allowed,
};
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ce5985be84b..1551aabac343 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1612,7 +1612,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
- N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP)
+ N(XDP_FEAT_CHANGE)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
@@ -3197,6 +3197,7 @@ static u16 skb_tx_hash(const struct net_device *dev,
}
if (skb_rx_queue_recorded(skb)) {
+ DEBUG_NET_WARN_ON_ONCE(qcount == 0);
hash = skb_get_rx_queue(skb);
if (hash >= qoffset)
hash -= qoffset;
@@ -3314,8 +3315,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
skb->len - start, ~(__u32)0,
crc32c_csum_stub));
*(__le32 *)(skb->data + offset) = crc32c_csum;
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum_not_inet = 0;
+ skb_reset_csum_not_inet(skb);
out:
return ret;
}
@@ -4358,6 +4358,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
list_add_tail(&napi->poll_list, &sd->poll_list);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
@@ -5039,7 +5040,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
- __kfree_skb_defer(skb);
+ __napi_kfree_skb(skb,
+ get_kfree_skb_cb(skb)->reason);
}
}
@@ -6068,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+ WRITE_ONCE(n->list_owner, -1);
val = READ_ONCE(n->state);
do {
@@ -6383,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -10870,7 +10874,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL, NULL, 0,
- portid, nlmsg_seq(nlh));
+ portid, nlh);
/*
* Flush the unicast and multicast chains
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 6d772837eb3f..3730945ee294 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -7,7 +7,7 @@
#include <linux/net_tstamp.h>
#include <linux/wireless.h>
#include <linux/if_bridge.h>
-#include <net/dsa.h>
+#include <net/dsa_stubs.h>
#include <net/wext.h>
#include "dev.h"
@@ -259,9 +259,6 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
{
- struct netdev_notifier_hwtstamp_info info = {
- .info.dev = dev,
- };
struct kernel_hwtstamp_config kernel_cfg;
struct netlink_ext_ack extack = {};
struct hwtstamp_config cfg;
@@ -276,12 +273,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
if (err)
return err;
- info.info.extack = &extack;
- info.config = &kernel_cfg;
-
- err = call_netdevice_notifiers_info(NETDEV_PRE_CHANGE_HWTSTAMP,
- &info.info);
- err = notifier_to_errno(err);
+ err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
if (err) {
if (extack._msg)
netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 5a782d1d8fd3..aff31cd944c2 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -21,6 +21,7 @@
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
+#include <linux/bitfield.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
@@ -29,6 +30,7 @@
#include <net/genetlink.h>
#include <net/netevent.h>
#include <net/flow_offload.h>
+#include <net/dropreason.h>
#include <net/devlink.h>
#include <trace/events/skb.h>
@@ -504,8 +506,6 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
if (!nskb)
return;
- if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0))
- reason = SKB_DROP_REASON_NOT_SPECIFIED;
cb = NET_DM_SKB_CB(nskb);
cb->reason = reason;
cb->pc = location;
@@ -552,9 +552,9 @@ static size_t net_dm_in_port_size(void)
}
#define NET_DM_MAX_SYMBOL_LEN 40
+#define NET_DM_MAX_REASON_LEN 50
-static size_t net_dm_packet_report_size(size_t payload_len,
- enum skb_drop_reason reason)
+static size_t net_dm_packet_report_size(size_t payload_len)
{
size_t size;
@@ -576,7 +576,7 @@ static size_t net_dm_packet_report_size(size_t payload_len,
/* NET_DM_ATTR_PROTO */
nla_total_size(sizeof(u16)) +
/* NET_DM_ATTR_REASON */
- nla_total_size(strlen(drop_reasons[reason]) + 1) +
+ nla_total_size(NET_DM_MAX_REASON_LEN + 1) +
/* NET_DM_ATTR_PAYLOAD */
nla_total_size(payload_len);
}
@@ -610,6 +610,8 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
size_t payload_len)
{
struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
+ const struct drop_reason_list *list = NULL;
+ unsigned int subsys, subsys_reason;
char buf[NET_DM_MAX_SYMBOL_LEN];
struct nlattr *attr;
void *hdr;
@@ -627,9 +629,24 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
NET_DM_ATTR_PAD))
goto nla_put_failure;
+ rcu_read_lock();
+ subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK);
+ if (subsys < SKB_DROP_REASON_SUBSYS_NUM)
+ list = rcu_dereference(drop_reasons_by_subsys[subsys]);
+ subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK;
+ if (!list ||
+ subsys_reason >= list->n_reasons ||
+ !list->reasons[subsys_reason] ||
+ strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) {
+ list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]);
+ subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ }
if (nla_put_string(msg, NET_DM_ATTR_REASON,
- drop_reasons[cb->reason]))
+ list->reasons[subsys_reason])) {
+ rcu_read_unlock();
goto nla_put_failure;
+ }
+ rcu_read_unlock();
snprintf(buf, sizeof(buf), "%pS", cb->pc);
if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
@@ -687,9 +704,7 @@ static void net_dm_packet_report(struct sk_buff *skb)
if (net_dm_trunc_len)
payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
- msg = nlmsg_new(net_dm_packet_report_size(payload_len,
- NET_DM_SKB_CB(skb)->reason),
- GFP_KERNEL);
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
if (!msg)
goto out;
diff --git a/net/core/filter.c b/net/core/filter.c
index a8c8fd96c822..df0df59814ae 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5002,7 +5002,7 @@ const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
.func = bpf_get_socket_ptr_cookie,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
};
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
@@ -8746,23 +8746,18 @@ EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id, enum bpf_type_flag *flag);
+ int off, int size);
EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ int off, int size)
{
int ret = -EACCES;
- if (atype == BPF_READ)
- return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-
mutex_lock(&nf_conn_btf_access_lock);
if (nfct_btf_struct_access)
- ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+ ret = nfct_btf_struct_access(log, reg, off, size);
mutex_unlock(&nf_conn_btf_access_lock);
return ret;
@@ -8829,17 +8824,13 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
static int xdp_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ int off, int size)
{
int ret = -EACCES;
- if (atype == BPF_READ)
- return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-
mutex_lock(&nf_conn_btf_access_lock);
if (nfct_btf_struct_access)
- ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+ ret = nfct_btf_struct_access(log, reg, off, size);
mutex_unlock(&nf_conn_btf_access_lock);
return ret;
@@ -9189,7 +9180,7 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
__u8 tmp_reg = BPF_REG_AX;
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
- PKT_VLAN_PRESENT_OFFSET);
+ SKB_BF_MONO_TC_OFFSET);
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
SKB_MONO_DELIVERY_TIME_MASK, 2);
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
@@ -9236,7 +9227,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
/* AX is needed because src_reg and dst_reg could be the same */
__u8 tmp_reg = BPF_REG_AX;
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
*insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
@@ -9271,14 +9262,14 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
if (!prog->tstamp_type_access) {
__u8 tmp_reg = BPF_REG_AX;
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
/* Writing __sk_buff->tstamp as ingress, goto <clear> */
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
/* goto <store> */
*insn++ = BPF_JMP_A(2);
/* <clear>: mono_delivery_time */
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
- *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET);
+ *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
}
#endif
diff --git a/net/core/gro.c b/net/core/gro.c
index a606705a0859..2d84165cb4f1 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -633,7 +633,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
- __kfree_skb_defer(skb);
+ __napi_kfree_skb(skb, SKB_CONSUMED);
break;
case GRO_HELD:
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a089b704b986..e6a739b1afa9 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -137,6 +137,20 @@ static void queue_process(struct work_struct *work)
}
}
+static int netif_local_xmit_active(struct net_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
+ if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id())
+ return 1;
+ }
+
+ return 0;
+}
+
static void poll_one_napi(struct napi_struct *napi)
{
int work;
@@ -183,7 +197,10 @@ void netpoll_poll_dev(struct net_device *dev)
if (!ni || down_trylock(&ni->dev_lock))
return;
- if (!netif_running(dev)) {
+ /* Some drivers will take the same locks in poll and xmit,
+ * we can't poll if local CPU is already in xmit.
+ */
+ if (!netif_running(dev) || netif_local_xmit_active(dev)) {
up(&ni->dev_lock);
return;
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 193c18799865..e212e9d7edcb 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
+#include <linux/netdevice.h>
#include <trace/events/page_pool.h>
@@ -315,7 +316,8 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
*/
dma = dma_map_page_attrs(pool->p.dev, page, 0,
(PAGE_SIZE << pool->p.order),
- pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
+ pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
if (dma_mapping_error(pool->p.dev, dma))
return false;
@@ -483,7 +485,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
/* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
- DMA_ATTR_SKIP_CPU_SYNC);
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
page_pool_clear_pp_info(page);
@@ -837,6 +839,21 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
pool->xdp_mem_id = mem->id;
}
+void page_pool_unlink_napi(struct page_pool *pool)
+{
+ if (!pool->p.napi)
+ return;
+
+ /* To avoid races with recycling and additional barriers make sure
+ * pool and NAPI are unlinked when NAPI is disabled.
+ */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
+ READ_ONCE(pool->p.napi->list_owner) != -1);
+
+ WRITE_ONCE(pool->p.napi, NULL);
+}
+EXPORT_SYMBOL(page_pool_unlink_napi);
+
void page_pool_destroy(struct page_pool *pool)
{
if (!pool)
@@ -845,6 +862,7 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_put(pool))
return;
+ page_pool_unlink_napi(pool);
page_pool_free_frag(pool);
if (!page_pool_release(pool))
@@ -874,9 +892,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
EXPORT_SYMBOL(page_pool_update_nid);
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
{
+ struct napi_struct *napi;
struct page_pool *pp;
+ bool allow_direct;
page = compound_head(page);
@@ -892,12 +912,20 @@ bool page_pool_return_skb_page(struct page *page)
pp = page->pp;
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ */
+ napi = READ_ONCE(pp->p.napi);
+ allow_direct = napi_safe && napi &&
+ READ_ONCE(napi->list_owner) == smp_processor_id();
+
/* Driver set this to memory recycling info. Reset it on recycle.
* This will *not* work for NIC using a split-page memory model.
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
- page_pool_put_full_page(pp, page, false);
+ page_pool_put_full_page(pp, page, allow_direct);
return true;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 906aebdc566b..653901a1bf75 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -61,7 +61,7 @@
#include "dev.h"
#define RTNL_MAX_TYPE 50
-#define RTNL_SLAVE_MAX_TYPE 42
+#define RTNL_SLAVE_MAX_TYPE 43
struct rtnl_link {
rtnl_doit_func doit;
@@ -3975,16 +3975,23 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
unsigned int change,
u32 event, gfp_t flags, int *new_nsid,
- int new_ifindex, u32 portid, u32 seq)
+ int new_ifindex, u32 portid,
+ const struct nlmsghdr *nlh)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ u32 seq = 0;
skb = nlmsg_new(if_nlmsg_size(dev, 0), flags);
if (skb == NULL)
goto errout;
+ if (nlmsg_report(nlh))
+ seq = nlmsg_seq(nlh);
+ else
+ portid = 0;
+
err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
type, portid, seq, change, 0, 0, event,
new_nsid, new_ifindex, -1, flags);
@@ -4020,7 +4027,7 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
return;
skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
- new_ifindex, portid, nlmsg_seq(nlh));
+ new_ifindex, portid, nlh);
if (skb)
rtmsg_ifinfo_send(skb, dev, flags, portid, nlh);
}
diff --git a/net/core/scm.c b/net/core/scm.c
index acb7d776fa6e..3cd7dd377e53 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -250,7 +250,10 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
}
cmlen = min(CMSG_SPACE(len), msg->msg_controllen);
- msg->msg_control += cmlen;
+ if (msg->msg_control_is_user)
+ msg->msg_control_user += cmlen;
+ else
+ msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
return 0;
@@ -299,7 +302,7 @@ static int scm_max_fds(struct msghdr *msg)
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
{
struct cmsghdr __user *cm =
- (__force struct cmsghdr __user *)msg->msg_control;
+ (__force struct cmsghdr __user *)msg->msg_control_user;
unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count);
int __user *cmsg_data = CMSG_USER_DATA(cm);
@@ -332,7 +335,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
cmlen = CMSG_SPACE(i * sizeof(int));
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
- msg->msg_control += cmlen;
+ msg->msg_control_user += cmlen;
msg->msg_controllen -= cmlen;
}
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 050a875d09c5..0d998806b377 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -58,6 +58,7 @@
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
+#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
@@ -72,6 +73,7 @@
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>
+#include <net/dropreason.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -122,11 +124,59 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
-const char * const drop_reasons[] = {
+static const char * const drop_reasons[] = {
[SKB_CONSUMED] = "CONSUMED",
DEFINE_DROP_REASON(FN, FN)
};
-EXPORT_SYMBOL(drop_reasons);
+
+static const struct drop_reason_list drop_reasons_core = {
+ .reasons = drop_reasons,
+ .n_reasons = ARRAY_SIZE(drop_reasons),
+};
+
+const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
+ [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
+};
+EXPORT_SYMBOL(drop_reasons_by_subsys);
+
+/**
+ * drop_reasons_register_subsys - register another drop reason subsystem
+ * @subsys: the subsystem to register, must not be the core
+ * @list: the list of drop reasons within the subsystem, must point to
+ * a statically initialized list
+ */
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+ const struct drop_reason_list *list)
+{
+ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+ subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+ "invalid subsystem %d\n", subsys))
+ return;
+
+ /* must point to statically allocated memory, so INIT is OK */
+ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
+}
+EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
+
+/**
+ * drop_reasons_unregister_subsys - unregister a drop reason subsystem
+ * @subsys: the subsystem to remove, must not be the core
+ *
+ * Note: This will synchronize_rcu() to ensure no users when it returns.
+ */
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
+{
+ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+ subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+ "invalid subsystem %d\n", subsys))
+ return;
+
+ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
/**
* skb_panic - private function for out-of-line support
@@ -839,11 +889,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return page_pool_return_skb_page(virt_to_page(data));
+ return page_pool_return_skb_page(virt_to_page(data), napi_safe);
}
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -856,12 +906,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
kfree(head);
}
-static void skb_free_head(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
- if (skb_pp_recycle(skb, head))
+ if (skb_pp_recycle(skb, head, napi_safe))
return;
skb_free_frag(head);
} else {
@@ -869,7 +919,8 @@ static void skb_free_head(struct sk_buff *skb)
}
}
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
+ bool napi_safe)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -888,13 +939,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
}
for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+ napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
free_head:
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_free_head(skb);
+ skb_free_head(skb, napi_safe);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
@@ -955,11 +1006,12 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
+ bool napi_safe)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb, reason);
+ skb_release_data(skb, reason, napi_safe);
}
/**
@@ -973,7 +1025,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
@@ -984,7 +1036,10 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
if (unlikely(!skb_unref(skb)))
return false;
- DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+ DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
+ u32_get_bits(reason,
+ SKB_DROP_REASON_SUBSYS_MASK) >=
+ SKB_DROP_REASON_SUBSYS_NUM);
if (reason == SKB_CONSUMED)
trace_consume_skb(skb, __builtin_return_address(0));
@@ -1027,7 +1082,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
- skb_release_all(skb, reason);
+ skb_release_all(skb, reason, false);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1201,7 +1256,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
kfree_skbmem(skb);
}
@@ -1224,9 +1279,9 @@ static void napi_skb_cache_put(struct sk_buff *skb)
}
}
-void __kfree_skb_defer(struct sk_buff *skb)
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ skb_release_all(skb, reason, true);
napi_skb_cache_put(skb);
}
@@ -1264,7 +1319,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb, SKB_CONSUMED);
+ skb_release_all(skb, SKB_CONSUMED, !!budget);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1395,7 +1450,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst, SKB_CONSUMED);
+ skb_release_all(dst, SKB_CONSUMED, false);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -2018,9 +2073,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
} else {
- skb_free_head(skb);
+ skb_free_head(skb, false);
}
off = (data + nhead) - skb->head;
@@ -5187,6 +5242,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+#ifdef CONFIG_WIRELESS
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
struct sock *sk = skb->sk;
@@ -5212,6 +5268,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+#endif /* CONFIG_WIRELESS */
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
@@ -5597,18 +5654,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;
- /* In general, avoid mixing slab allocated and page_pool allocated
- * pages within the same SKB. However when @to is not pp_recycle and
- * @from is cloned, we can transition frag pages from page_pool to
- * reference counted.
- *
- * On the other hand, don't allow coalescing two pp_recycle SKBs if
- * @from is cloned, in case the SKB is using page_pool fragment
+ /* In general, avoid mixing page_pool and non-page_pool allocated
+ * pages within the same SKB. Additionally avoid dealing with clones
+ * with page_pool pages, in case the SKB is using page_pool fragment
* references (PP_FLAG_PAGE_FRAG). Since we only take full page
* references for cloned SKBs at the moment that would result in
* inconsistent reference counts.
+ * In theory we could take full references if @from is cloned and
+ * !@to->pp_recycle but its tricky (due to potential race with
+ * the clone disappearing) and rare, so not worth dealing with.
*/
- if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
+ if (to->pp_recycle != from->pp_recycle ||
+ (from->pp_recycle && skb_cloned(from)))
return false;
if (len <= skb_tailroom(to)) {
@@ -6389,12 +6446,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
} else {
/* we can reuse existing recount- all we did was
* relocate values
*/
- skb_free_head(skb);
+ skb_free_head(skb, false);
}
skb->head = data;
@@ -6529,7 +6586,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
skb->head = data;
skb->head_frag = 0;
diff --git a/net/core/sock.c b/net/core/sock.c
index c25888795390..5440e67bcfe3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1396,15 +1396,10 @@ set_sndbuf:
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
- /* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
- ret = -EPERM;
- else {
- if (val < 0)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_ll_usec, val);
- }
+ if (val < 0)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(sk->sk_ll_usec, val);
break;
case SO_PREFER_BUSY_POLL:
if (valbool && !sockopt_capable(CAP_NET_ADMIN))
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 9b854e236d23..7c189c2e2fbf 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -437,7 +437,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
__sock_map_delete(stab, sk, link_raw);
}
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
+static long sock_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
u32 i = *(u32 *)key;
@@ -587,8 +587,8 @@ out:
return ret;
}
-static int sock_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct sock *sk = (struct sock *)value;
int ret;
@@ -925,7 +925,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
raw_spin_unlock_bh(&bucket->lock);
}
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+static long sock_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
u32 hash, key_size = map->key_size;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 528d4b37983d..41e5ca8643ec 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -531,21 +531,6 @@ out:
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
-/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
-void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
-{
- struct xdp_mem_allocator *xa;
- struct page *page;
-
- rcu_read_lock();
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- page = virt_to_head_page(data);
- if (xa)
- page_pool_release_page(xa->page_pool, page);
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(__xdp_release_frame);
-
void xdp_attachment_setup(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
@@ -658,8 +643,8 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
* - RX ring dev queue index (skb_record_rx_queue)
*/
- /* Until page_pool get SKB return path, release DMA here */
- xdp_release_frame(xdpf);
+ if (xdpf->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
/* Allow SKB to reuse area used by xdp_frame */
xdp_scrub_frame(xdpf);
@@ -734,13 +719,21 @@ __bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *tim
* bpf_xdp_metadata_rx_hash - Read XDP frame RX hash.
* @ctx: XDP context pointer.
* @hash: Return value pointer.
+ * @rss_type: Return value pointer for RSS type.
+ *
+ * The RSS hash type (@rss_type) specifies what portion of packet headers NIC
+ * hardware used when calculating RSS hash value. The RSS type can be decoded
+ * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits
+ * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types*
+ * ``XDP_RSS_TYPE_L*``.
*
* Return:
* * Returns 0 on success or ``-errno`` on error.
* * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc
* * ``-ENODATA`` : means no RX-hash available for this frame
*/
-__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
+__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
+ enum xdp_rss_hash_type *rss_type)
{
return -EOPNOTSUPP;
}
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index cc7e93a562fe..12e305824a96 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,4 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+
+# the stubs are built-in whenever DSA is built-in or module
+ifdef CONFIG_NET_DSA
+obj-y := stubs.o
+endif
+
# the core
obj-$(CONFIG_NET_DSA) += dsa_core.o
dsa_core-y += \
@@ -10,7 +16,8 @@ dsa_core-y += \
slave.o \
switch.o \
tag.o \
- tag_8021q.o
+ tag_8021q.o \
+ trace.o
# tagging formats
obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
@@ -31,3 +38,6 @@ obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o
obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o
+
+# for tracing framework to find trace.h
+CFLAGS_trace.o := -I$(src)
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index e5f156940c67..ab1afe67fd18 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -17,6 +17,7 @@
#include <linux/of.h>
#include <linux/of_mdio.h>
#include <linux/of_net.h>
+#include <net/dsa_stubs.h>
#include <net/sch_generic.h>
#include "devlink.h"
@@ -1702,6 +1703,20 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
}
EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
+static const struct dsa_stubs __dsa_stubs = {
+ .master_hwtstamp_validate = __dsa_master_hwtstamp_validate,
+};
+
+static void dsa_register_stubs(void)
+{
+ dsa_stubs = &__dsa_stubs;
+}
+
+static void dsa_unregister_stubs(void)
+{
+ dsa_stubs = NULL;
+}
+
static int __init dsa_init_module(void)
{
int rc;
@@ -1721,6 +1736,8 @@ static int __init dsa_init_module(void)
if (rc)
goto netlink_register_fail;
+ dsa_register_stubs();
+
return 0;
netlink_register_fail:
@@ -1735,6 +1752,8 @@ module_init(dsa_init_module);
static void __exit dsa_cleanup_module(void)
{
+ dsa_unregister_stubs();
+
rtnl_link_unregister(&dsa_link_ops);
dsa_slave_unregister_notifier();
diff --git a/net/dsa/master.c b/net/dsa/master.c
index c2cabe6248b1..6be89ab0cc01 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -198,7 +198,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
/* Deny PTP operations on master if there is at least one switch in the tree
* that is PTP capable.
*/
-int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+int __dsa_master_hwtstamp_validate(struct net_device *dev,
const struct kernel_hwtstamp_config *config,
struct netlink_ext_ack *extack)
{
diff --git a/net/dsa/master.h b/net/dsa/master.h
index 80842f4e27f7..76e39d3ec909 100644
--- a/net/dsa/master.h
+++ b/net/dsa/master.h
@@ -15,7 +15,7 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
struct netlink_ext_ack *extack);
void dsa_master_lag_teardown(struct net_device *lag_dev,
struct dsa_port *cpu_dp);
-int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+int __dsa_master_hwtstamp_validate(struct net_device *dev,
const struct kernel_hwtstamp_config *config,
struct netlink_ext_ack *extack);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8abc1658ac47..165bb2cb8431 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -3289,7 +3289,6 @@ static int dsa_master_changeupper(struct net_device *dev,
static int dsa_slave_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
- struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
switch (event) {
@@ -3419,16 +3418,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
return NOTIFY_OK;
}
- case NETDEV_PRE_CHANGE_HWTSTAMP: {
- struct netdev_notifier_hwtstamp_info *info = ptr;
- int err;
-
- if (!netdev_uses_dsa(dev))
- return NOTIFY_DONE;
-
- err = dsa_master_pre_change_hwtstamp(dev, info->config, extack);
- return notifier_from_errno(err);
- }
default:
break;
}
diff --git a/net/dsa/stubs.c b/net/dsa/stubs.c
new file mode 100644
index 000000000000..2ed8a6c85fbf
--- /dev/null
+++ b/net/dsa/stubs.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Stubs for DSA functionality called by the core network stack.
+ * These are necessary because CONFIG_NET_DSA can be a module, and built-in
+ * code cannot directly call symbols exported by modules.
+ */
+#include <net/dsa_stubs.h>
+
+const struct dsa_stubs *dsa_stubs;
+EXPORT_SYMBOL_GPL(dsa_stubs);
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index d5bc4bb7310d..8c9a9f94b756 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -18,6 +18,7 @@
#include "slave.h"
#include "switch.h"
#include "tag_8021q.h"
+#include "trace.h"
static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
unsigned int ageing_time)
@@ -164,14 +165,20 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp,
int err = 0;
/* No need to bother with refcounting for user ports */
- if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
- return ds->ops->port_mdb_add(ds, port, mdb, db);
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_mdb_add(ds, port, mdb, db);
+ trace_dsa_mdb_add_hw(dp, mdb->addr, mdb->vid, &db, err);
+
+ return err;
+ }
mutex_lock(&dp->addr_lists_lock);
a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db);
if (a) {
refcount_inc(&a->refcount);
+ trace_dsa_mdb_add_bump(dp, mdb->addr, mdb->vid, &db,
+ &a->refcount);
goto out;
}
@@ -182,6 +189,7 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp,
}
err = ds->ops->port_mdb_add(ds, port, mdb, db);
+ trace_dsa_mdb_add_hw(dp, mdb->addr, mdb->vid, &db, err);
if (err) {
kfree(a);
goto out;
@@ -209,21 +217,30 @@ static int dsa_port_do_mdb_del(struct dsa_port *dp,
int err = 0;
/* No need to bother with refcounting for user ports */
- if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
- return ds->ops->port_mdb_del(ds, port, mdb, db);
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_mdb_del(ds, port, mdb, db);
+ trace_dsa_mdb_del_hw(dp, mdb->addr, mdb->vid, &db, err);
+
+ return err;
+ }
mutex_lock(&dp->addr_lists_lock);
a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db);
if (!a) {
+ trace_dsa_mdb_del_not_found(dp, mdb->addr, mdb->vid, &db);
err = -ENOENT;
goto out;
}
- if (!refcount_dec_and_test(&a->refcount))
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_mdb_del_drop(dp, mdb->addr, mdb->vid, &db,
+ &a->refcount);
goto out;
+ }
err = ds->ops->port_mdb_del(ds, port, mdb, db);
+ trace_dsa_mdb_del_hw(dp, mdb->addr, mdb->vid, &db, err);
if (err) {
refcount_set(&a->refcount, 1);
goto out;
@@ -247,14 +264,19 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr,
int err = 0;
/* No need to bother with refcounting for user ports */
- if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
- return ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ trace_dsa_fdb_add_hw(dp, addr, vid, &db, err);
+
+ return err;
+ }
mutex_lock(&dp->addr_lists_lock);
a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db);
if (a) {
refcount_inc(&a->refcount);
+ trace_dsa_fdb_add_bump(dp, addr, vid, &db, &a->refcount);
goto out;
}
@@ -265,6 +287,7 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr,
}
err = ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ trace_dsa_fdb_add_hw(dp, addr, vid, &db, err);
if (err) {
kfree(a);
goto out;
@@ -291,21 +314,29 @@ static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr,
int err = 0;
/* No need to bother with refcounting for user ports */
- if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
- return ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ trace_dsa_fdb_del_hw(dp, addr, vid, &db, err);
+
+ return err;
+ }
mutex_lock(&dp->addr_lists_lock);
a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db);
if (!a) {
+ trace_dsa_fdb_del_not_found(dp, addr, vid, &db);
err = -ENOENT;
goto out;
}
- if (!refcount_dec_and_test(&a->refcount))
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_fdb_del_drop(dp, addr, vid, &db, &a->refcount);
goto out;
+ }
err = ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ trace_dsa_fdb_del_hw(dp, addr, vid, &db, err);
if (err) {
refcount_set(&a->refcount, 1);
goto out;
@@ -332,6 +363,8 @@ static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag,
a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db);
if (a) {
refcount_inc(&a->refcount);
+ trace_dsa_lag_fdb_add_bump(lag->dev, addr, vid, &db,
+ &a->refcount);
goto out;
}
@@ -342,6 +375,7 @@ static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag,
}
err = ds->ops->lag_fdb_add(ds, *lag, addr, vid, db);
+ trace_dsa_lag_fdb_add_hw(lag->dev, addr, vid, &db, err);
if (err) {
kfree(a);
goto out;
@@ -370,14 +404,19 @@ static int dsa_switch_do_lag_fdb_del(struct dsa_switch *ds, struct dsa_lag *lag,
a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db);
if (!a) {
+ trace_dsa_lag_fdb_del_not_found(lag->dev, addr, vid, &db);
err = -ENOENT;
goto out;
}
- if (!refcount_dec_and_test(&a->refcount))
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_lag_fdb_del_drop(lag->dev, addr, vid, &db,
+ &a->refcount);
goto out;
+ }
err = ds->ops->lag_fdb_del(ds, *lag, addr, vid, db);
+ trace_dsa_lag_fdb_del_hw(lag->dev, addr, vid, &db, err);
if (err) {
refcount_set(&a->refcount, 1);
goto out;
@@ -656,8 +695,12 @@ static int dsa_port_do_vlan_add(struct dsa_port *dp,
int err = 0;
/* No need to bother with refcounting for user ports. */
- if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
- return ds->ops->port_vlan_add(ds, port, vlan, extack);
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ trace_dsa_vlan_add_hw(dp, vlan, err);
+
+ return err;
+ }
/* No need to propagate on shared ports the existing VLANs that were
* re-notified after just the flags have changed. This would cause a
@@ -672,6 +715,7 @@ static int dsa_port_do_vlan_add(struct dsa_port *dp,
v = dsa_vlan_find(&dp->vlans, vlan);
if (v) {
refcount_inc(&v->refcount);
+ trace_dsa_vlan_add_bump(dp, vlan, &v->refcount);
goto out;
}
@@ -682,6 +726,7 @@ static int dsa_port_do_vlan_add(struct dsa_port *dp,
}
err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ trace_dsa_vlan_add_hw(dp, vlan, err);
if (err) {
kfree(v);
goto out;
@@ -706,21 +751,29 @@ static int dsa_port_do_vlan_del(struct dsa_port *dp,
int err = 0;
/* No need to bother with refcounting for user ports */
- if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp)))
- return ds->ops->port_vlan_del(ds, port, vlan);
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_vlan_del(ds, port, vlan);
+ trace_dsa_vlan_del_hw(dp, vlan, err);
+
+ return err;
+ }
mutex_lock(&dp->vlans_lock);
v = dsa_vlan_find(&dp->vlans, vlan);
if (!v) {
+ trace_dsa_vlan_del_not_found(dp, vlan);
err = -ENOENT;
goto out;
}
- if (!refcount_dec_and_test(&v->refcount))
+ if (!refcount_dec_and_test(&v->refcount)) {
+ trace_dsa_vlan_del_drop(dp, vlan, &v->refcount);
goto out;
+ }
err = ds->ops->port_vlan_del(ds, port, vlan);
+ trace_dsa_vlan_del_hw(dp, vlan, err);
if (err) {
refcount_set(&v->refcount, 1);
goto out;
diff --git a/net/dsa/trace.c b/net/dsa/trace.c
new file mode 100644
index 000000000000..1b107165d331
--- /dev/null
+++ b/net/dsa/trace.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2022-2023 NXP
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+void dsa_db_print(const struct dsa_db *db, char buf[DSA_DB_BUFSIZ])
+{
+ switch (db->type) {
+ case DSA_DB_PORT:
+ sprintf(buf, "port %s", db->dp->name);
+ break;
+ case DSA_DB_LAG:
+ sprintf(buf, "lag %s id %d", db->lag.dev->name, db->lag.id);
+ break;
+ case DSA_DB_BRIDGE:
+ sprintf(buf, "bridge %s num %d", db->bridge.dev->name,
+ db->bridge.num);
+ break;
+ default:
+ sprintf(buf, "unknown");
+ break;
+ }
+}
+
+const char *dsa_port_kind(const struct dsa_port *dp)
+{
+ switch (dp->type) {
+ case DSA_PORT_TYPE_USER:
+ return "user";
+ case DSA_PORT_TYPE_CPU:
+ return "cpu";
+ case DSA_PORT_TYPE_DSA:
+ return "dsa";
+ default:
+ return "unused";
+ }
+}
diff --git a/net/dsa/trace.h b/net/dsa/trace.h
new file mode 100644
index 000000000000..567f29a39707
--- /dev/null
+++ b/net/dsa/trace.h
@@ -0,0 +1,447 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright 2022-2023 NXP
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dsa
+
+#if !defined(_NET_DSA_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NET_DSA_TRACE_H
+
+#include <net/dsa.h>
+#include <net/switchdev.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/refcount.h>
+#include <linux/tracepoint.h>
+
+/* Enough to fit "bridge %s num %d" where num has 3 digits */
+#define DSA_DB_BUFSIZ (IFNAMSIZ + 16)
+
+void dsa_db_print(const struct dsa_db *db, char buf[DSA_DB_BUFSIZ]);
+const char *dsa_port_kind(const struct dsa_port *dp);
+
+DECLARE_EVENT_CLASS(dsa_port_addr_op_hw,
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db, int err),
+
+ TP_ARGS(dp, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, dev_name(dp->ds->dev));
+ __assign_str(kind, dsa_port_kind(dp));
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->addr,
+ __entry->vid, __entry->db_buf, __entry->err)
+);
+
+/* Add unicast/multicast address to hardware, either on user ports
+ * (where no refcounting is kept), or on shared ports when the entry
+ * is first seen and its refcount is 1.
+ */
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_fdb_add_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_mdb_add_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+/* Delete unicast/multicast address from hardware, either on user ports or
+ * when the refcount on shared ports reaches 0
+ */
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_fdb_del_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_mdb_del_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DECLARE_EVENT_CLASS(dsa_port_addr_op_refcount,
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(dp, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, dev_name(dp->ds->dev));
+ __assign_str(kind, dsa_port_kind(dp));
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->addr,
+ __entry->vid, __entry->db_buf, __entry->refcount)
+);
+
+/* Bump the refcount of an existing unicast/multicast address on shared ports */
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_fdb_add_bump,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_mdb_add_bump,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+/* Drop the refcount of a multicast address that we still keep on
+ * shared ports
+ */
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_fdb_del_drop,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_mdb_del_drop,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DECLARE_EVENT_CLASS(dsa_port_addr_del_not_found,
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db),
+
+ TP_ARGS(dp, addr, vid, db),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, dev_name(dp->ds->dev));
+ __assign_str(kind, dsa_port_kind(dp));
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\"",
+ __get_str(dev), __get_str(kind), __entry->port,
+ __entry->addr, __entry->vid, __entry->db_buf)
+);
+
+/* Attempt to delete a unicast/multicast address on shared ports for which
+ * the delete operation was called more times than the addition
+ */
+DEFINE_EVENT(dsa_port_addr_del_not_found, dsa_fdb_del_not_found,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+ TP_ARGS(dp, addr, vid, db));
+
+DEFINE_EVENT(dsa_port_addr_del_not_found, dsa_mdb_del_not_found,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+ TP_ARGS(dp, addr, vid, db));
+
+TRACE_EVENT(dsa_lag_fdb_add_hw,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+
+ TP_ARGS(lag_dev, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, lag_dev->name);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->err)
+);
+
+TRACE_EVENT(dsa_lag_fdb_add_bump,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(lag_dev, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, lag_dev->name);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->refcount)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_hw,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+
+ TP_ARGS(lag_dev, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, lag_dev->name);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->err)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_drop,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(lag_dev, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, lag_dev->name);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->refcount)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_not_found,
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+
+ TP_ARGS(lag_dev, addr, vid, db),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name)
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, lag_dev->name);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\"",
+ __get_str(dev), __entry->addr, __entry->vid, __entry->db_buf)
+);
+
+DECLARE_EVENT_CLASS(dsa_vlan_op_hw,
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+
+ TP_ARGS(dp, vlan, err),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __field(u16, vid)
+ __field(u16, flags)
+ __field(bool, changed)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, dev_name(dp->ds->dev));
+ __assign_str(kind, dsa_port_kind(dp));
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ __entry->flags = vlan->flags;
+ __entry->changed = vlan->changed;
+ __entry->err = err;
+ ),
+
+ TP_printk("%s %s port %d vid %u%s%s%s",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid,
+ __entry->flags & BRIDGE_VLAN_INFO_PVID ? " pvid" : "",
+ __entry->flags & BRIDGE_VLAN_INFO_UNTAGGED ? " untagged" : "",
+ __entry->changed ? " (changed)" : "")
+);
+
+DEFINE_EVENT(dsa_vlan_op_hw, dsa_vlan_add_hw,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+ TP_ARGS(dp, vlan, err));
+
+DEFINE_EVENT(dsa_vlan_op_hw, dsa_vlan_del_hw,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+ TP_ARGS(dp, vlan, err));
+
+DECLARE_EVENT_CLASS(dsa_vlan_op_refcount,
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+
+ TP_ARGS(dp, vlan, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __field(u16, vid)
+ __field(u16, flags)
+ __field(bool, changed)
+ __field(unsigned int, refcount)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, dev_name(dp->ds->dev));
+ __assign_str(kind, dsa_port_kind(dp));
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ __entry->flags = vlan->flags;
+ __entry->changed = vlan->changed;
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s %s port %d vid %u%s%s%s refcount %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid,
+ __entry->flags & BRIDGE_VLAN_INFO_PVID ? " pvid" : "",
+ __entry->flags & BRIDGE_VLAN_INFO_UNTAGGED ? " untagged" : "",
+ __entry->changed ? " (changed)" : "", __entry->refcount)
+);
+
+DEFINE_EVENT(dsa_vlan_op_refcount, dsa_vlan_add_bump,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+ TP_ARGS(dp, vlan, refcount));
+
+DEFINE_EVENT(dsa_vlan_op_refcount, dsa_vlan_del_drop,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+ TP_ARGS(dp, vlan, refcount));
+
+TRACE_EVENT(dsa_vlan_del_not_found,
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan),
+
+ TP_ARGS(dp, vlan),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev))
+ __string(kind, dsa_port_kind(dp))
+ __field(int, port)
+ __field(u16, vid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev, dev_name(dp->ds->dev));
+ __assign_str(kind, dsa_port_kind(dp));
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ ),
+
+ TP_printk("%s %s port %d vid %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid)
+);
+
+#endif /* _NET_DSA_TRACE_H */
+
+/* We don't want to use include/trace/events */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index fab66c169b9f..20165e07ef90 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -270,11 +270,12 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
"lanes configuration not supported by device");
return -EOPNOTSUPP;
}
- } else if (!lsettings->autoneg) {
- /* If autoneg is off and lanes parameter is not passed from user,
- * set the lanes parameter to 0.
+ } else if (!lsettings->autoneg && ksettings->lanes) {
+ /* If autoneg is off and lanes parameter is not passed from user but
+ * it was defined previously then set the lanes parameter to 0.
*/
ksettings->lanes = 0;
+ *mod = true;
}
ret = ethnl_update_bitset(ksettings->link_modes.advertising,
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
index fce3cc2734f9..4058a557b5a4 100644
--- a/net/ethtool/mm.c
+++ b/net/ethtool/mm.c
@@ -214,6 +214,16 @@ static int ethnl_set_mm(struct ethnl_req_info *req_info, struct genl_info *info)
return -ERANGE;
}
+ if (cfg.verify_enabled && !cfg.tx_enabled) {
+ NL_SET_ERR_MSG(extack, "Verification requires TX enabled");
+ return -EINVAL;
+ }
+
+ if (cfg.tx_enabled && !cfg.pmac_enabled) {
+ NL_SET_ERR_MSG(extack, "TX enabled requires pMAC enabled");
+ return -EINVAL;
+ }
+
ret = dev->ethtool_ops->set_mm(dev, &cfg, extack);
return ret < 0 ? ret : 1;
}
@@ -249,3 +259,26 @@ bool __ethtool_dev_mm_supported(struct net_device *dev)
return !ret;
}
+
+bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ bool supported;
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (!ops)
+ return false;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return false;
+
+ supported = __ethtool_dev_mm_supported(dev);
+
+ ethnl_ops_complete(dev);
+
+ return supported;
+}
+EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported);
diff --git a/net/handshake/.kunitconfig b/net/handshake/.kunitconfig
new file mode 100644
index 000000000000..5c48cf4abca2
--- /dev/null
+++ b/net/handshake/.kunitconfig
@@ -0,0 +1,11 @@
+CONFIG_KUNIT=y
+CONFIG_UBSAN=y
+CONFIG_STACKTRACE=y
+CONFIG_NET=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_MULTIUSER=y
+CONFIG_NFS_FS=y
+CONFIG_SUNRPC=y
+CONFIG_NET_HANDSHAKE=y
+CONFIG_NET_HANDSHAKE_KUNIT_TEST=y
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
new file mode 100644
index 000000000000..247d73c6ff6e
--- /dev/null
+++ b/net/handshake/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the Generic HANDSHAKE service
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+obj-y += handshake.o
+handshake-y := genl.o netlink.o request.o tlshd.o trace.o
+
+obj-$(CONFIG_NET_HANDSHAKE_KUNIT_TEST) += handshake-test.o
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
new file mode 100644
index 000000000000..9f29efb1493e
--- /dev/null
+++ b/net/handshake/genl.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "genl.h"
+
+#include <linux/handshake.h>
+
+/* HANDSHAKE_CMD_ACCEPT - do */
+static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
+ [HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 2),
+};
+
+/* HANDSHAKE_CMD_DONE - do */
+static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
+ [HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
+ [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_U32, },
+ [HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
+};
+
+/* Ops table for handshake */
+static const struct genl_split_ops handshake_nl_ops[] = {
+ {
+ .cmd = HANDSHAKE_CMD_ACCEPT,
+ .doit = handshake_nl_accept_doit,
+ .policy = handshake_accept_nl_policy,
+ .maxattr = HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = HANDSHAKE_CMD_DONE,
+ .doit = handshake_nl_done_doit,
+ .policy = handshake_done_nl_policy,
+ .maxattr = HANDSHAKE_A_DONE_REMOTE_AUTH,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group handshake_nl_mcgrps[] = {
+ [HANDSHAKE_NLGRP_NONE] = { "none", },
+ [HANDSHAKE_NLGRP_TLSHD] = { "tlshd", },
+};
+
+struct genl_family handshake_nl_family __ro_after_init = {
+ .name = HANDSHAKE_FAMILY_NAME,
+ .version = HANDSHAKE_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = handshake_nl_ops,
+ .n_split_ops = ARRAY_SIZE(handshake_nl_ops),
+ .mcgrps = handshake_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(handshake_nl_mcgrps),
+};
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
new file mode 100644
index 000000000000..2c1f1aa6a02a
--- /dev/null
+++ b/net/handshake/genl.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_HANDSHAKE_GEN_H
+#define _LINUX_HANDSHAKE_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <linux/handshake.h>
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info);
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ HANDSHAKE_NLGRP_NONE,
+ HANDSHAKE_NLGRP_TLSHD,
+};
+
+extern struct genl_family handshake_nl_family;
+
+#endif /* _LINUX_HANDSHAKE_GEN_H */
diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c
new file mode 100644
index 000000000000..e6adc5dec11a
--- /dev/null
+++ b/net/handshake/handshake-test.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Oracle and/or its affiliates.
+ *
+ * KUnit test of the handshake upcall mechanism.
+ */
+
+#include <kunit/test.h>
+#include <kunit/visibility.h>
+
+#include <linux/kernel.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+
+static int test_accept_func(struct handshake_req *req, struct genl_info *info,
+ int fd)
+{
+ return 0;
+}
+
+static void test_done_func(struct handshake_req *req, unsigned int status,
+ struct genl_info *info)
+{
+}
+
+struct handshake_req_alloc_test_param {
+ const char *desc;
+ struct handshake_proto *proto;
+ gfp_t gfp;
+ bool expect_success;
+};
+
+static struct handshake_proto handshake_req_alloc_proto_2 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_NONE,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_3 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_MAX,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_4 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_5 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_6 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_privsize = UINT_MAX,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_good = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+};
+
+static const
+struct handshake_req_alloc_test_param handshake_req_alloc_params[] = {
+ {
+ .desc = "handshake_req_alloc NULL proto",
+ .proto = NULL,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc CLASS_NONE",
+ .proto = &handshake_req_alloc_proto_2,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc CLASS_MAX",
+ .proto = &handshake_req_alloc_proto_3,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc no callbacks",
+ .proto = &handshake_req_alloc_proto_4,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc no done callback",
+ .proto = &handshake_req_alloc_proto_5,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc excessive privsize",
+ .proto = &handshake_req_alloc_proto_6,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc all good",
+ .proto = &handshake_req_alloc_proto_good,
+ .gfp = GFP_KERNEL,
+ .expect_success = true,
+ },
+};
+
+static void
+handshake_req_alloc_get_desc(const struct handshake_req_alloc_test_param *param,
+ char *desc)
+{
+ strscpy(desc, param->desc, KUNIT_PARAM_DESC_SIZE);
+}
+
+/* Creates the function handshake_req_alloc_gen_params */
+KUNIT_ARRAY_PARAM(handshake_req_alloc, handshake_req_alloc_params,
+ handshake_req_alloc_get_desc);
+
+static void handshake_req_alloc_case(struct kunit *test)
+{
+ const struct handshake_req_alloc_test_param *param = test->param_value;
+ struct handshake_req *result;
+
+ /* Arrange */
+
+ /* Act */
+ result = handshake_req_alloc(param->proto, param->gfp);
+
+ /* Assert */
+ if (param->expect_success)
+ KUNIT_EXPECT_NOT_NULL(test, result);
+ else
+ KUNIT_EXPECT_NULL(test, result);
+
+ kfree(result);
+}
+
+static void handshake_req_submit_test1(struct kunit *test)
+{
+ struct socket *sock;
+ int err, result;
+
+ /* Arrange */
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* Act */
+ result = handshake_req_submit(sock, NULL, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test2(struct kunit *test)
+{
+ struct handshake_req *req;
+ int result;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ /* Act */
+ result = handshake_req_submit(NULL, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ /* handshake_req_submit() destroys @req on error */
+}
+
+static void handshake_req_submit_test3(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ int err, result;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ sock->file = NULL;
+
+ /* Act */
+ result = handshake_req_submit(sock, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ /* handshake_req_submit() destroys @req on error */
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test4(struct kunit *test)
+{
+ struct handshake_req *req, *result;
+ struct socket *sock;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* Act */
+ result = handshake_req_hash_lookup(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_NOT_NULL(test, result);
+ KUNIT_EXPECT_PTR_EQ(test, req, result);
+
+ handshake_req_cancel(sock->sk);
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test5(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct net *net;
+ int saved, err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ saved = hn->hn_pending;
+ hn->hn_pending = hn->hn_pending_max + 1;
+
+ /* Act */
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, err, -EAGAIN);
+
+ sock_release(sock);
+ hn->hn_pending = saved;
+}
+
+static void handshake_req_submit_test6(struct kunit *test)
+{
+ struct handshake_req *req1, *req2;
+ struct socket *sock;
+ int err;
+
+ /* Arrange */
+ req1 = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req1);
+ req2 = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req2);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+
+ /* Act */
+ err = handshake_req_submit(sock, req1, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ err = handshake_req_submit(sock, req2, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, err, -EBUSY);
+
+ handshake_req_cancel(sock->sk);
+ sock_release(sock);
+}
+
+static void handshake_req_cancel_test1(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* NB: handshake_req hasn't been accepted */
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_TRUE(test, result);
+
+ sock_release(sock);
+}
+
+static void handshake_req_cancel_test2(struct kunit *test)
+{
+ struct handshake_req *req, *next;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct net *net;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ /* Pretend to accept this request */
+ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
+ KUNIT_ASSERT_PTR_EQ(test, req, next);
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_TRUE(test, result);
+
+ sock_release(sock);
+}
+
+static void handshake_req_cancel_test3(struct kunit *test)
+{
+ struct handshake_req *req, *next;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct net *net;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ /* Pretend to accept this request */
+ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
+ KUNIT_ASSERT_PTR_EQ(test, req, next);
+
+ /* Pretend to complete this request */
+ handshake_complete(next, -ETIMEDOUT, NULL);
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_FALSE(test, result);
+
+ sock_release(sock);
+}
+
+static struct handshake_req *handshake_req_destroy_test;
+
+static void test_destroy_func(struct handshake_req *req)
+{
+ handshake_req_destroy_test = req;
+}
+
+static struct handshake_proto handshake_req_alloc_proto_destroy = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+ .hp_destroy = test_destroy_func,
+};
+
+static void handshake_req_destroy_test1(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ int err;
+
+ /* Arrange */
+ handshake_req_destroy_test = NULL;
+
+ req = handshake_req_alloc(&handshake_req_alloc_proto_destroy, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file);
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ handshake_req_cancel(sock->sk);
+
+ /* Act */
+ sock_release(sock);
+
+ /* Assert */
+ KUNIT_EXPECT_PTR_EQ(test, handshake_req_destroy_test, req);
+}
+
+static struct kunit_case handshake_api_test_cases[] = {
+ {
+ .name = "req_alloc API fuzzing",
+ .run_case = handshake_req_alloc_case,
+ .generate_params = handshake_req_alloc_gen_params,
+ },
+ {
+ .name = "req_submit NULL req arg",
+ .run_case = handshake_req_submit_test1,
+ },
+ {
+ .name = "req_submit NULL sock arg",
+ .run_case = handshake_req_submit_test2,
+ },
+ {
+ .name = "req_submit NULL sock->file",
+ .run_case = handshake_req_submit_test3,
+ },
+ {
+ .name = "req_lookup works",
+ .run_case = handshake_req_submit_test4,
+ },
+ {
+ .name = "req_submit max pending",
+ .run_case = handshake_req_submit_test5,
+ },
+ {
+ .name = "req_submit multiple",
+ .run_case = handshake_req_submit_test6,
+ },
+ {
+ .name = "req_cancel before accept",
+ .run_case = handshake_req_cancel_test1,
+ },
+ {
+ .name = "req_cancel after accept",
+ .run_case = handshake_req_cancel_test2,
+ },
+ {
+ .name = "req_cancel after done",
+ .run_case = handshake_req_cancel_test3,
+ },
+ {
+ .name = "req_destroy works",
+ .run_case = handshake_req_destroy_test1,
+ },
+ {}
+};
+
+static struct kunit_suite handshake_api_suite = {
+ .name = "Handshake API tests",
+ .test_cases = handshake_api_test_cases,
+};
+
+kunit_test_suites(&handshake_api_suite);
+
+MODULE_DESCRIPTION("Test handshake upcall API functions");
+MODULE_LICENSE("GPL");
diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h
new file mode 100644
index 000000000000..4dac965c99df
--- /dev/null
+++ b/net/handshake/handshake.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _INTERNAL_HANDSHAKE_H
+#define _INTERNAL_HANDSHAKE_H
+
+/* Per-net namespace context */
+struct handshake_net {
+ spinlock_t hn_lock; /* protects next 3 fields */
+ int hn_pending;
+ int hn_pending_max;
+ struct list_head hn_requests;
+
+ unsigned long hn_flags;
+};
+
+enum hn_flags_bits {
+ HANDSHAKE_F_NET_DRAINING,
+};
+
+struct handshake_proto;
+
+/* One handshake request */
+struct handshake_req {
+ struct list_head hr_list;
+ struct rhash_head hr_rhash;
+ unsigned long hr_flags;
+ const struct handshake_proto *hr_proto;
+ struct sock *hr_sk;
+ void (*hr_odestruct)(struct sock *sk);
+
+ /* Always the last field */
+ char hr_priv[];
+};
+
+enum hr_flags_bits {
+ HANDSHAKE_F_REQ_COMPLETED,
+};
+
+/* Invariants for all handshake requests for one transport layer
+ * security protocol
+ */
+struct handshake_proto {
+ int hp_handler_class;
+ size_t hp_privsize;
+ unsigned long hp_flags;
+
+ int (*hp_accept)(struct handshake_req *req,
+ struct genl_info *info, int fd);
+ void (*hp_done)(struct handshake_req *req,
+ unsigned int status,
+ struct genl_info *info);
+ void (*hp_destroy)(struct handshake_req *req);
+};
+
+enum hp_flags_bits {
+ HANDSHAKE_F_PROTO_NOTIFY,
+};
+
+/* netlink.c */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+ gfp_t flags);
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+ struct genl_info *info);
+struct handshake_net *handshake_pernet(struct net *net);
+
+/* request.c */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+ gfp_t flags);
+int handshake_req_hash_init(void);
+void handshake_req_hash_destroy(void);
+void *handshake_req_private(struct handshake_req *req);
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk);
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class);
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+ gfp_t flags);
+void handshake_complete(struct handshake_req *req, unsigned int status,
+ struct genl_info *info);
+bool handshake_req_cancel(struct sock *sk);
+
+#endif /* _INTERNAL_HANDSHAKE_H */
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
new file mode 100644
index 000000000000..8ea0ff993f9f
--- /dev/null
+++ b/net/handshake/netlink.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <kunit/visibility.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+#include "genl.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * handshake_genl_notify - Notify handlers that a request is waiting
+ * @net: target network namespace
+ * @proto: handshake protocol
+ * @flags: memory allocation control flags
+ *
+ * Returns zero on success or a negative errno if notification failed.
+ */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+ gfp_t flags)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ /* Disable notifications during unit testing */
+ if (!test_bit(HANDSHAKE_F_PROTO_NOTIFY, &proto->hp_flags))
+ return 0;
+
+ if (!genl_has_listeners(&handshake_nl_family, net,
+ proto->hp_handler_class))
+ return -ESRCH;
+
+ msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0,
+ HANDSHAKE_CMD_READY);
+ if (!hdr)
+ goto out_free;
+
+ if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+ proto->hp_handler_class) < 0) {
+ genlmsg_cancel(msg, hdr);
+ goto out_free;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_multicast_netns(&handshake_nl_family, net, msg,
+ 0, proto->hp_handler_class, flags);
+
+out_free:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+/**
+ * handshake_genl_put - Create a generic netlink message header
+ * @msg: buffer in which to create the header
+ * @info: generic netlink message context
+ *
+ * Returns a ready-to-use header, or NULL.
+ */
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+ struct genl_info *info)
+{
+ return genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &handshake_nl_family, 0, info->genlhdr->cmd);
+}
+EXPORT_SYMBOL(handshake_genl_put);
+
+/*
+ * dup() a kernel socket for use as a user space file descriptor
+ * in the current process. The kernel socket must have an
+ * instatiated struct file.
+ *
+ * Implicit argument: "current()"
+ */
+static int handshake_dup(struct socket *sock)
+{
+ struct file *file;
+ int newfd;
+
+ if (!sock->file)
+ return -EBADF;
+
+ file = get_file(sock->file);
+ newfd = get_unused_fd_flags(O_CLOEXEC);
+ if (newfd < 0) {
+ fput(file);
+ return newfd;
+ }
+
+ fd_install(newfd, file);
+ return newfd;
+}
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct handshake_net *hn = handshake_pernet(net);
+ struct handshake_req *req = NULL;
+ struct socket *sock;
+ int class, fd, err;
+
+ err = -EOPNOTSUPP;
+ if (!hn)
+ goto out_status;
+
+ err = -EINVAL;
+ if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS))
+ goto out_status;
+ class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]);
+
+ err = -EAGAIN;
+ req = handshake_req_next(hn, class);
+ if (!req)
+ goto out_status;
+
+ sock = req->hr_sk->sk_socket;
+ fd = handshake_dup(sock);
+ if (fd < 0) {
+ err = fd;
+ goto out_complete;
+ }
+ err = req->hr_proto->hp_accept(req, info, fd);
+ if (err)
+ goto out_complete;
+
+ trace_handshake_cmd_accept(net, req, req->hr_sk, fd);
+ return 0;
+
+out_complete:
+ handshake_complete(req, -EIO, NULL);
+ fput(sock->file);
+out_status:
+ trace_handshake_cmd_accept_err(net, req, NULL, err);
+ return err;
+}
+
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct socket *sock = NULL;
+ struct handshake_req *req;
+ int fd, status, err;
+
+ if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD))
+ return -EINVAL;
+ fd = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
+
+ err = 0;
+ sock = sockfd_lookup(fd, &err);
+ if (err) {
+ err = -EBADF;
+ goto out_status;
+ }
+
+ req = handshake_req_hash_lookup(sock->sk);
+ if (!req) {
+ err = -EBUSY;
+ fput(sock->file);
+ goto out_status;
+ }
+
+ trace_handshake_cmd_done(net, req, sock->sk, fd);
+
+ status = -EIO;
+ if (info->attrs[HANDSHAKE_A_DONE_STATUS])
+ status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
+
+ handshake_complete(req, status, info);
+ fput(sock->file);
+ return 0;
+
+out_status:
+ trace_handshake_cmd_done_err(net, req, sock->sk, err);
+ return err;
+}
+
+static unsigned int handshake_net_id;
+
+static int __net_init handshake_net_init(struct net *net)
+{
+ struct handshake_net *hn = net_generic(net, handshake_net_id);
+ unsigned long tmp;
+ struct sysinfo si;
+
+ /*
+ * Arbitrary limit to prevent handshakes that do not make
+ * progress from clogging up the system. The cap scales up
+ * with the amount of physical memory on the system.
+ */
+ si_meminfo(&si);
+ tmp = si.totalram / (25 * si.mem_unit);
+ hn->hn_pending_max = clamp(tmp, 3UL, 50UL);
+
+ spin_lock_init(&hn->hn_lock);
+ hn->hn_pending = 0;
+ hn->hn_flags = 0;
+ INIT_LIST_HEAD(&hn->hn_requests);
+ return 0;
+}
+
+static void __net_exit handshake_net_exit(struct net *net)
+{
+ struct handshake_net *hn = net_generic(net, handshake_net_id);
+ struct handshake_req *req;
+ LIST_HEAD(requests);
+
+ /*
+ * Drain the net's pending list. Requests that have been
+ * accepted and are in progress will be destroyed when
+ * the socket is closed.
+ */
+ spin_lock(&hn->hn_lock);
+ set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags);
+ list_splice_init(&requests, &hn->hn_requests);
+ spin_unlock(&hn->hn_lock);
+
+ while (!list_empty(&requests)) {
+ req = list_first_entry(&requests, struct handshake_req, hr_list);
+ list_del(&req->hr_list);
+
+ /*
+ * Requests on this list have not yet been
+ * accepted, so they do not have an fd to put.
+ */
+
+ handshake_complete(req, -ETIMEDOUT, NULL);
+ }
+}
+
+static struct pernet_operations __net_initdata handshake_genl_net_ops = {
+ .init = handshake_net_init,
+ .exit = handshake_net_exit,
+ .id = &handshake_net_id,
+ .size = sizeof(struct handshake_net),
+};
+
+/**
+ * handshake_pernet - Get the handshake private per-net structure
+ * @net: network namespace
+ *
+ * Returns a pointer to the net's private per-net structure for the
+ * handshake module, or NULL if handshake_init() failed.
+ */
+struct handshake_net *handshake_pernet(struct net *net)
+{
+ return handshake_net_id ?
+ net_generic(net, handshake_net_id) : NULL;
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_pernet);
+
+static int __init handshake_init(void)
+{
+ int ret;
+
+ ret = handshake_req_hash_init();
+ if (ret) {
+ pr_warn("handshake: hash initialization failed (%d)\n", ret);
+ return ret;
+ }
+
+ ret = genl_register_family(&handshake_nl_family);
+ if (ret) {
+ pr_warn("handshake: netlink registration failed (%d)\n", ret);
+ handshake_req_hash_destroy();
+ return ret;
+ }
+
+ /*
+ * ORDER: register_pernet_subsys must be done last.
+ *
+ * If initialization does not make it past pernet_subsys
+ * registration, then handshake_net_id will remain 0. That
+ * shunts the handshake consumer API to return ENOTSUPP
+ * to prevent it from dereferencing something that hasn't
+ * been allocated.
+ */
+ ret = register_pernet_subsys(&handshake_genl_net_ops);
+ if (ret) {
+ pr_warn("handshake: pernet registration failed (%d)\n", ret);
+ genl_unregister_family(&handshake_nl_family);
+ handshake_req_hash_destroy();
+ }
+
+ return ret;
+}
+
+static void __exit handshake_exit(void)
+{
+ unregister_pernet_subsys(&handshake_genl_net_ops);
+ handshake_net_id = 0;
+
+ handshake_req_hash_destroy();
+ genl_unregister_family(&handshake_nl_family);
+}
+
+module_init(handshake_init);
+module_exit(handshake_exit);
diff --git a/net/handshake/request.c b/net/handshake/request.c
new file mode 100644
index 000000000000..94d5cef3e048
--- /dev/null
+++ b/net/handshake/request.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handshake request lifetime events
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/fdtable.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <kunit/visibility.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/*
+ * We need both a handshake_req -> sock mapping, and a sock ->
+ * handshake_req mapping. Both are one-to-one.
+ *
+ * To avoid adding another pointer field to struct sock, net/handshake
+ * maintains a hash table, indexed by the memory address of @sock, to
+ * find the struct handshake_req outstanding for that socket. The
+ * reverse direction uses a simple pointer field in the handshake_req
+ * struct.
+ */
+
+static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp;
+
+static const struct rhashtable_params handshake_rhash_params = {
+ .key_len = sizeof_field(struct handshake_req, hr_sk),
+ .key_offset = offsetof(struct handshake_req, hr_sk),
+ .head_offset = offsetof(struct handshake_req, hr_rhash),
+ .automatic_shrinking = true,
+};
+
+int handshake_req_hash_init(void)
+{
+ return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params);
+}
+
+void handshake_req_hash_destroy(void)
+{
+ rhashtable_destroy(&handshake_rhashtbl);
+}
+
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk)
+{
+ return rhashtable_lookup_fast(&handshake_rhashtbl, &sk,
+ handshake_rhash_params);
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_req_hash_lookup);
+
+static bool handshake_req_hash_add(struct handshake_req *req)
+{
+ int ret;
+
+ ret = rhashtable_lookup_insert_fast(&handshake_rhashtbl,
+ &req->hr_rhash,
+ handshake_rhash_params);
+ return ret == 0;
+}
+
+static void handshake_req_destroy(struct handshake_req *req)
+{
+ if (req->hr_proto->hp_destroy)
+ req->hr_proto->hp_destroy(req);
+ rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash,
+ handshake_rhash_params);
+ kfree(req);
+}
+
+static void handshake_sk_destruct(struct sock *sk)
+{
+ void (*sk_destruct)(struct sock *sk);
+ struct handshake_req *req;
+
+ req = handshake_req_hash_lookup(sk);
+ if (!req)
+ return;
+
+ trace_handshake_destruct(sock_net(sk), req, sk);
+ sk_destruct = req->hr_odestruct;
+ handshake_req_destroy(req);
+ if (sk_destruct)
+ sk_destruct(sk);
+}
+
+/**
+ * handshake_req_alloc - Allocate a handshake request
+ * @proto: security protocol
+ * @flags: memory allocation flags
+ *
+ * Returns an initialized handshake_req or NULL.
+ */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+ gfp_t flags)
+{
+ struct handshake_req *req;
+
+ if (!proto)
+ return NULL;
+ if (proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE)
+ return NULL;
+ if (proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX)
+ return NULL;
+ if (!proto->hp_accept || !proto->hp_done)
+ return NULL;
+
+ req = kzalloc(struct_size(req, hr_priv, proto->hp_privsize), flags);
+ if (!req)
+ return NULL;
+
+ INIT_LIST_HEAD(&req->hr_list);
+ req->hr_proto = proto;
+ return req;
+}
+EXPORT_SYMBOL(handshake_req_alloc);
+
+/**
+ * handshake_req_private - Get per-handshake private data
+ * @req: handshake arguments
+ *
+ */
+void *handshake_req_private(struct handshake_req *req)
+{
+ return (void *)&req->hr_priv;
+}
+EXPORT_SYMBOL(handshake_req_private);
+
+static bool __add_pending_locked(struct handshake_net *hn,
+ struct handshake_req *req)
+{
+ if (WARN_ON_ONCE(!list_empty(&req->hr_list)))
+ return false;
+ hn->hn_pending++;
+ list_add_tail(&req->hr_list, &hn->hn_requests);
+ return true;
+}
+
+static void __remove_pending_locked(struct handshake_net *hn,
+ struct handshake_req *req)
+{
+ hn->hn_pending--;
+ list_del_init(&req->hr_list);
+}
+
+/*
+ * Returns %true if the request was found on @net's pending list,
+ * otherwise %false.
+ *
+ * If @req was on a pending list, it has not yet been accepted.
+ */
+static bool remove_pending(struct handshake_net *hn, struct handshake_req *req)
+{
+ bool ret = false;
+
+ spin_lock(&hn->hn_lock);
+ if (!list_empty(&req->hr_list)) {
+ __remove_pending_locked(hn, req);
+ ret = true;
+ }
+ spin_unlock(&hn->hn_lock);
+
+ return ret;
+}
+
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class)
+{
+ struct handshake_req *req, *pos;
+
+ req = NULL;
+ spin_lock(&hn->hn_lock);
+ list_for_each_entry(pos, &hn->hn_requests, hr_list) {
+ if (pos->hr_proto->hp_handler_class != class)
+ continue;
+ __remove_pending_locked(hn, pos);
+ req = pos;
+ break;
+ }
+ spin_unlock(&hn->hn_lock);
+
+ return req;
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_req_next);
+
+/**
+ * handshake_req_submit - Submit a handshake request
+ * @sock: open socket on which to perform the handshake
+ * @req: handshake arguments
+ * @flags: memory allocation flags
+ *
+ * Return values:
+ * %0: Request queued
+ * %-EINVAL: Invalid argument
+ * %-EBUSY: A handshake is already under way for this socket
+ * %-ESRCH: No handshake agent is available
+ * %-EAGAIN: Too many pending handshake requests
+ * %-ENOMEM: Failed to allocate memory
+ * %-EMSGSIZE: Failed to construct notification message
+ * %-EOPNOTSUPP: Handshake module not initialized
+ *
+ * A zero return value from handshake_req_submit() means that
+ * exactly one subsequent completion callback is guaranteed.
+ *
+ * A negative return value from handshake_req_submit() means that
+ * no completion callback will be done and that @req has been
+ * destroyed.
+ */
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+ gfp_t flags)
+{
+ struct handshake_net *hn;
+ struct net *net;
+ int ret;
+
+ if (!sock || !req || !sock->file) {
+ kfree(req);
+ return -EINVAL;
+ }
+
+ req->hr_sk = sock->sk;
+ if (!req->hr_sk) {
+ kfree(req);
+ return -EINVAL;
+ }
+ req->hr_odestruct = req->hr_sk->sk_destruct;
+ req->hr_sk->sk_destruct = handshake_sk_destruct;
+
+ ret = -EOPNOTSUPP;
+ net = sock_net(req->hr_sk);
+ hn = handshake_pernet(net);
+ if (!hn)
+ goto out_err;
+
+ ret = -EAGAIN;
+ if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max)
+ goto out_err;
+
+ spin_lock(&hn->hn_lock);
+ ret = -EOPNOTSUPP;
+ if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags))
+ goto out_unlock;
+ ret = -EBUSY;
+ if (!handshake_req_hash_add(req))
+ goto out_unlock;
+ if (!__add_pending_locked(hn, req))
+ goto out_unlock;
+ spin_unlock(&hn->hn_lock);
+
+ ret = handshake_genl_notify(net, req->hr_proto, flags);
+ if (ret) {
+ trace_handshake_notify_err(net, req, req->hr_sk, ret);
+ if (remove_pending(hn, req))
+ goto out_err;
+ }
+
+ /* Prevent socket release while a handshake request is pending */
+ sock_hold(req->hr_sk);
+
+ trace_handshake_submit(net, req, req->hr_sk);
+ return 0;
+
+out_unlock:
+ spin_unlock(&hn->hn_lock);
+out_err:
+ trace_handshake_submit_err(net, req, req->hr_sk, ret);
+ handshake_req_destroy(req);
+ return ret;
+}
+EXPORT_SYMBOL(handshake_req_submit);
+
+void handshake_complete(struct handshake_req *req, unsigned int status,
+ struct genl_info *info)
+{
+ struct sock *sk = req->hr_sk;
+ struct net *net = sock_net(sk);
+
+ if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+ trace_handshake_complete(net, req, sk, status);
+ req->hr_proto->hp_done(req, status, info);
+
+ /* Handshake request is no longer pending */
+ sock_put(sk);
+ }
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_complete);
+
+/**
+ * handshake_req_cancel - Cancel an in-progress handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ * %true - Uncompleted handshake request was canceled
+ * %false - Handshake request already completed or not found
+ */
+bool handshake_req_cancel(struct sock *sk)
+{
+ struct handshake_req *req;
+ struct handshake_net *hn;
+ struct net *net;
+
+ net = sock_net(sk);
+ req = handshake_req_hash_lookup(sk);
+ if (!req) {
+ trace_handshake_cancel_none(net, req, sk);
+ return false;
+ }
+
+ hn = handshake_pernet(net);
+ if (hn && remove_pending(hn, req)) {
+ /* Request hadn't been accepted */
+ goto out_true;
+ }
+ if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+ /* Request already completed */
+ trace_handshake_cancel_busy(net, req, sk);
+ return false;
+ }
+
+out_true:
+ trace_handshake_cancel(net, req, sk);
+
+ /* Handshake request is no longer pending */
+ sock_put(sk);
+ return true;
+}
+EXPORT_SYMBOL(handshake_req_cancel);
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
new file mode 100644
index 000000000000..fcbeb63b4eb1
--- /dev/null
+++ b/net/handshake/tlshd.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Establish a TLS session for a kernel socket consumer
+ * using the tlshd user space handler.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2021-2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/keyctl.h>
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+struct tls_handshake_req {
+ void (*th_consumer_done)(void *data, int status,
+ key_serial_t peerid);
+ void *th_consumer_data;
+
+ int th_type;
+ unsigned int th_timeout_ms;
+ int th_auth_mode;
+ key_serial_t th_keyring;
+ key_serial_t th_certificate;
+ key_serial_t th_privkey;
+
+ unsigned int th_num_peerids;
+ key_serial_t th_peerid[5];
+};
+
+static struct tls_handshake_req *
+tls_handshake_req_init(struct handshake_req *req,
+ const struct tls_handshake_args *args)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+
+ treq->th_timeout_ms = args->ta_timeout_ms;
+ treq->th_consumer_done = args->ta_done;
+ treq->th_consumer_data = args->ta_data;
+ treq->th_keyring = args->ta_keyring;
+ treq->th_num_peerids = 0;
+ treq->th_certificate = TLS_NO_CERT;
+ treq->th_privkey = TLS_NO_PRIVKEY;
+ return treq;
+}
+
+static void tls_handshake_remote_peerids(struct tls_handshake_req *treq,
+ struct genl_info *info)
+{
+ struct nlattr *head = nlmsg_attrdata(info->nlhdr, GENL_HDRLEN);
+ int rem, len = nlmsg_attrlen(info->nlhdr, GENL_HDRLEN);
+ struct nlattr *nla;
+ unsigned int i;
+
+ i = 0;
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+ i++;
+ }
+ if (!i)
+ return;
+ treq->th_num_peerids = min_t(unsigned int, i,
+ ARRAY_SIZE(treq->th_peerid));
+
+ i = 0;
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+ treq->th_peerid[i++] = nla_get_u32(nla);
+ if (i >= treq->th_num_peerids)
+ break;
+ }
+}
+
+/**
+ * tls_handshake_done - callback to handle a CMD_DONE request
+ * @req: socket on which the handshake was performed
+ * @status: session status code
+ * @info: full results of session establishment
+ *
+ */
+static void tls_handshake_done(struct handshake_req *req,
+ unsigned int status, struct genl_info *info)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+
+ treq->th_peerid[0] = TLS_NO_PEERID;
+ if (info)
+ tls_handshake_remote_peerids(treq, info);
+
+ treq->th_consumer_done(treq->th_consumer_data, -status,
+ treq->th_peerid[0]);
+}
+
+#if IS_ENABLED(CONFIG_KEYS)
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+ key_ref_t process_keyring_ref, keyring_ref;
+ int ret;
+
+ if (treq->th_keyring == TLS_NO_KEYRING)
+ return 0;
+
+ process_keyring_ref = lookup_user_key(KEY_SPEC_PROCESS_KEYRING,
+ KEY_LOOKUP_CREATE,
+ KEY_NEED_WRITE);
+ if (IS_ERR(process_keyring_ref)) {
+ ret = PTR_ERR(process_keyring_ref);
+ goto out;
+ }
+
+ keyring_ref = lookup_user_key(treq->th_keyring, KEY_LOOKUP_CREATE,
+ KEY_NEED_LINK);
+ if (IS_ERR(keyring_ref)) {
+ ret = PTR_ERR(keyring_ref);
+ goto out_put_key;
+ }
+
+ ret = key_link(key_ref_to_ptr(process_keyring_ref),
+ key_ref_to_ptr(keyring_ref));
+
+ key_ref_put(keyring_ref);
+out_put_key:
+ key_ref_put(process_keyring_ref);
+out:
+ return ret;
+}
+#else
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+ return 0;
+}
+#endif
+
+static int tls_handshake_put_peer_identity(struct sk_buff *msg,
+ struct tls_handshake_req *treq)
+{
+ unsigned int i;
+
+ for (i = 0; i < treq->th_num_peerids; i++)
+ if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+ treq->th_peerid[i]) < 0)
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int tls_handshake_put_certificate(struct sk_buff *msg,
+ struct tls_handshake_req *treq)
+{
+ struct nlattr *entry_attr;
+
+ if (treq->th_certificate == TLS_NO_CERT &&
+ treq->th_privkey == TLS_NO_PRIVKEY)
+ return 0;
+
+ entry_attr = nla_nest_start(msg, HANDSHAKE_A_ACCEPT_CERTIFICATE);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, HANDSHAKE_A_X509_CERT,
+ treq->th_certificate) ||
+ nla_put_u32(msg, HANDSHAKE_A_X509_PRIVKEY,
+ treq->th_privkey)) {
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+}
+
+/**
+ * tls_handshake_accept - callback to construct a CMD_ACCEPT response
+ * @req: handshake parameters to return
+ * @info: generic netlink message context
+ * @fd: file descriptor to be returned
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+static int tls_handshake_accept(struct handshake_req *req,
+ struct genl_info *info, int fd)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+ struct nlmsghdr *hdr;
+ struct sk_buff *msg;
+ int ret;
+
+ ret = tls_handshake_private_keyring(treq);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out;
+ hdr = handshake_genl_put(msg, info);
+ if (!hdr)
+ goto out_cancel;
+
+ ret = -EMSGSIZE;
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
+ if (ret < 0)
+ goto out_cancel;
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type);
+ if (ret < 0)
+ goto out_cancel;
+ if (treq->th_timeout_ms) {
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms);
+ if (ret < 0)
+ goto out_cancel;
+ }
+
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE,
+ treq->th_auth_mode);
+ if (ret < 0)
+ goto out_cancel;
+ switch (treq->th_auth_mode) {
+ case HANDSHAKE_AUTH_PSK:
+ ret = tls_handshake_put_peer_identity(msg, treq);
+ if (ret < 0)
+ goto out_cancel;
+ break;
+ case HANDSHAKE_AUTH_X509:
+ ret = tls_handshake_put_certificate(msg, treq);
+ if (ret < 0)
+ goto out_cancel;
+ break;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+out_cancel:
+ genlmsg_cancel(msg, hdr);
+out:
+ return ret;
+}
+
+static const struct handshake_proto tls_handshake_proto = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_privsize = sizeof(struct tls_handshake_req),
+ .hp_flags = BIT(HANDSHAKE_F_PROTO_NOTIFY),
+
+ .hp_accept = tls_handshake_accept,
+ .hp_done = tls_handshake_done,
+};
+
+/**
+ * tls_client_hello_anon - request an anonymous TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_UNAUTH;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_anon);
+
+/**
+ * tls_client_hello_x509 - request an x.509-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+ treq->th_certificate = args->ta_my_cert;
+ treq->th_privkey = args->ta_my_privkey;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_x509);
+
+/**
+ * tls_client_hello_psk - request a PSK-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-EINVAL: Wrong number of local peer IDs
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+ unsigned int i;
+
+ if (!args->ta_num_peerids ||
+ args->ta_num_peerids > ARRAY_SIZE(treq->th_peerid))
+ return -EINVAL;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+ treq->th_num_peerids = args->ta_num_peerids;
+ for (i = 0; i < args->ta_num_peerids; i++)
+ treq->th_peerid[i] = args->ta_my_peerids[i];
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_psk);
+
+/**
+ * tls_server_hello_x509 - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+ treq->th_certificate = args->ta_my_cert;
+ treq->th_privkey = args->ta_my_privkey;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_x509);
+
+/**
+ * tls_server_hello_psk - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueue; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+ treq->th_num_peerids = 1;
+ treq->th_peerid[0] = args->ta_my_peerids[0];
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_psk);
+
+/**
+ * tls_handshake_cancel - cancel a pending handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ * %true - Uncompleted handshake request was canceled
+ * %false - Handshake request already completed or not found
+ */
+bool tls_handshake_cancel(struct sock *sk)
+{
+ return handshake_req_cancel(sk);
+}
+EXPORT_SYMBOL(tls_handshake_cancel);
diff --git a/net/handshake/trace.c b/net/handshake/trace.c
new file mode 100644
index 000000000000..1c4d8e27e17a
--- /dev/null
+++ b/net/handshake/trace.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace points for transport security layer handshakes.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "handshake.h"
+
+#define CREATE_TRACE_POINTS
+
+#include <trace/events/handshake.h>
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 880277c9fd07..b18ba8ef93ad 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
obj-$(CONFIG_NET_IPIP) += ipip.o
gre-y := gre_demux.o
-fou-y := fou_core.o fou_nl.o
+fou-y := fou_core.o fou_nl.o fou_bpf.o
obj-$(CONFIG_NET_FOU) += fou.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd9..4406d796cc2f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -72,15 +72,11 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ int off, int size)
{
const struct btf_type *t;
size_t end;
- if (atype == BPF_READ)
- return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-
t = btf_type_by_id(reg->btf, reg->btf_id);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
@@ -113,6 +109,9 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
case offsetof(struct tcp_sock, ecn_flags):
end = offsetofend(struct tcp_sock, ecn_flags);
break;
+ case offsetof(struct tcp_sock, app_limited):
+ end = offsetofend(struct tcp_sock, app_limited);
+ break;
default:
bpf_log(log, "no write support to tcp_sock at off %d\n", off);
return -EACCES;
@@ -239,8 +238,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
sizeof(tcp_ca->name)) <= 0)
return -EINVAL;
- if (tcp_ca_find(utcp_ca->name))
- return -EEXIST;
return 1;
}
@@ -266,13 +263,25 @@ static void bpf_tcp_ca_unreg(void *kdata)
tcp_unregister_congestion_control(kdata);
}
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+ return tcp_update_congestion_control(kdata, old_kdata);
+}
+
+static int bpf_tcp_ca_validate(void *kdata)
+{
+ return tcp_validate_congestion_control(kdata);
+}
+
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,
.unreg = bpf_tcp_ca_unreg,
+ .update = bpf_tcp_ca_update,
.check_member = bpf_tcp_ca_check_member,
.init_member = bpf_tcp_ca_init_member,
.init = bpf_tcp_ca_init,
+ .validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
};
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
new file mode 100644
index 000000000000..3760a14b6b57
--- /dev/null
+++ b/net/ipv4/fou_bpf.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Fou Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/fou.h>
+
+struct bpf_fou_encap {
+ __be16 sport;
+ __be16 dport;
+};
+
+enum bpf_fou_encap_type {
+ FOU_BPF_ENCAP_FOU,
+ FOU_BPF_ENCAP_GUE,
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in BTF");
+
+/* bpf_skb_set_fou_encap - Set FOU encap parameters
+ *
+ * This function allows for using GUE or FOU encapsulation together with an
+ * ipip device in collect-metadata mode.
+ *
+ * It is meant to be used in BPF tc-hooks and after a call to the
+ * bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
+ *
+ * Parameters:
+ * @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap Pointer to a `struct bpf_fou_encap` storing UDP src and
+ * dst ports. If sport is set to 0 the kernel will auto-assign a
+ * port. This is similar to using `encap-sport auto`.
+ * Cannot be NULL
+ * @type Encapsulation type for the packet. Their definitions are
+ * specified in `enum bpf_fou_encap_type`
+ */
+__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
+ struct bpf_fou_encap *encap, int type)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!encap))
+ return -EINVAL;
+
+ if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
+
+ switch (type) {
+ case FOU_BPF_ENCAP_FOU:
+ info->encap.type = TUNNEL_ENCAP_FOU;
+ break;
+ case FOU_BPF_ENCAP_GUE:
+ info->encap.type = TUNNEL_ENCAP_GUE;
+ break;
+ default:
+ info->encap.type = TUNNEL_ENCAP_NONE;
+ }
+
+ if (info->key.tun_flags & TUNNEL_CSUM)
+ info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
+
+ info->encap.sport = encap->sport;
+ info->encap.dport = encap->dport;
+
+ return 0;
+}
+
+/* bpf_skb_get_fou_encap - Get FOU encap parameters
+ *
+ * This function allows for reading encap metadata from a packet received
+ * on an ipip device in collect-metadata mode.
+ *
+ * Parameters:
+ * @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap Pointer to a struct bpf_fou_encap storing UDP source and
+ * destination port. Cannot be NULL
+ */
+__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
+ struct bpf_fou_encap *encap)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!info))
+ return -EINVAL;
+
+ encap->sport = info->encap.sport;
+ encap->dport = info->encap.dport;
+
+ return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(fou_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
+BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
+BTF_SET8_END(fou_kfunc_set)
+
+static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fou_kfunc_set,
+};
+
+int register_fou_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+ &fou_bpf_kfunc_set);
+}
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index cafec9b4eee0..0c41076e31ed 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -1236,10 +1236,15 @@ static int __init fou_init(void)
if (ret < 0)
goto unregister;
+ ret = register_fou_bpf();
+ if (ret < 0)
+ goto kfunc_failed;
+
ret = ip_tunnel_encap_add_fou_ops();
if (ret == 0)
return 0;
+kfunc_failed:
genl_unregister_family(&fou_nl_family);
unregister:
unregister_pernet_device(&fou_net_ops);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 8cebb476b3ab..b8607763d113 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -749,6 +749,11 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
room = 576;
room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
+ /* Guard against tiny mtu. We need to include at least one
+ * IP network header for this message to make any sense.
+ */
+ if (room <= (int)sizeof(struct iphdr))
+ goto ende;
icmp_param.data_len = skb_in->len - icmp_param.offset;
if (icmp_param.data_len > room)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 2541083d49ad..beeae624c412 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -359,6 +359,20 @@ err_dev_set_mtu:
return ERR_PTR(err);
}
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct udphdr *udph;
+
+ if (iph->protocol != IPPROTO_UDP)
+ return;
+
+ udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
+ info->encap.sport = udph->source;
+ info->encap.dport = udph->dest;
+}
+EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
+
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error)
@@ -572,7 +586,11 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
dev_net(dev), 0, skb->mark, skb_get_hash(skb),
key->flow_flags);
- if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+
+ if (!tunnel_hlen)
+ tunnel_hlen = ip_encap_hlen(&tun_info->encap);
+
+ if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
goto tx_error;
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
@@ -732,7 +750,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
dev_net(dev), tunnel->parms.link,
tunnel->fwmark, skb_get_hash(skb), 0);
- if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+ if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
goto tx_error;
if (connected && md) {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index abea77759b7e..27b8f83c6ea2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -241,6 +241,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
if (!tun_dst)
return 0;
+ ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
}
skb_reset_mac_header(skb);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 409ec2a1f95b..5178a3f3cb53 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1089,13 +1089,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
}
void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
- __acquires(RCU)
+ __acquires(ping_table.lock)
{
struct ping_iter_state *state = seq->private;
state->bucket = 0;
state->family = family;
- rcu_read_lock();
+ spin_lock(&ping_table.lock);
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
@@ -1121,9 +1121,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
+ __releases(ping_table.lock)
{
- rcu_read_unlock();
+ spin_unlock(&ping_table.lock);
}
EXPORT_SYMBOL_GPL(ping_seq_stop);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 3cf68695b40d..ff712bf2a98d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -91,12 +91,12 @@ EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
int raw_hash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- struct hlist_nulls_head *hlist;
+ struct hlist_head *hlist;
hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)];
spin_lock(&h->lock);
- __sk_nulls_add_node_rcu(sk, hlist);
+ sk_add_node_rcu(sk, hlist);
sock_set_flag(sk, SOCK_RCU_FREE);
spin_unlock(&h->lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -110,7 +110,7 @@ void raw_unhash_sk(struct sock *sk)
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
spin_lock(&h->lock);
- if (__sk_nulls_del_node_init_rcu(sk))
+ if (sk_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock(&h->lock);
}
@@ -163,16 +163,15 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
static int raw_v4_input(struct net *net, struct sk_buff *skb,
const struct iphdr *iph, int hash)
{
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
int sdif = inet_sdif(skb);
+ struct hlist_head *hlist;
int dif = inet_iif(skb);
int delivered = 0;
struct sock *sk;
hlist = &raw_v4_hashinfo.ht[hash];
rcu_read_lock();
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each_rcu(sk, hlist) {
if (!raw_v4_match(net, sk, iph->protocol,
iph->saddr, iph->daddr, dif, sdif))
continue;
@@ -264,10 +263,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
{
struct net *net = dev_net(skb->dev);
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
int dif = skb->dev->ifindex;
int sdif = inet_sdif(skb);
+ struct hlist_head *hlist;
const struct iphdr *iph;
struct sock *sk;
int hash;
@@ -276,7 +274,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
hlist = &raw_v4_hashinfo.ht[hash];
rcu_read_lock();
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each_rcu(sk, hlist) {
iph = (const struct iphdr *)skb->data;
if (!raw_v4_match(net, sk, iph->protocol,
iph->daddr, iph->saddr, dif, sdif))
@@ -950,14 +948,13 @@ static struct sock *raw_get_first(struct seq_file *seq, int bucket)
{
struct raw_hashinfo *h = pde_data(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
+ struct hlist_head *hlist;
struct sock *sk;
for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
++state->bucket) {
hlist = &h->ht[state->bucket];
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each(sk, hlist) {
if (sock_net(sk) == seq_file_net(seq))
return sk;
}
@@ -970,7 +967,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
struct raw_iter_state *state = raw_seq_private(seq);
do {
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
} while (sk && sock_net(sk) != seq_file_net(seq));
if (!sk)
@@ -989,9 +986,12 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
}
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
+ __acquires(&h->lock)
{
- rcu_read_lock();
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+ spin_lock(&h->lock);
+
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1010,9 +1010,11 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(raw_seq_next);
void raw_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
+ __releases(&h->lock)
{
- rcu_read_unlock();
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+ spin_unlock(&h->lock);
}
EXPORT_SYMBOL_GPL(raw_seq_stop);
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index bca49a844f01..63a40e4b678f 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -57,8 +57,7 @@ static bool raw_lookup(struct net *net, const struct sock *sk,
static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
{
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
+ struct hlist_head *hlist;
struct sock *sk;
int slot;
@@ -68,7 +67,7 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2
rcu_read_lock();
for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
hlist = &hashinfo->ht[slot];
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each_rcu(sk, hlist) {
if (raw_lookup(net, sk, r)) {
/*
* Grab it and keep until we fill
@@ -142,9 +141,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
struct inet_diag_dump_data *cb_data;
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
int num, s_num, slot, s_slot;
+ struct hlist_head *hlist;
struct sock *sk = NULL;
struct nlattr *bc;
@@ -161,7 +159,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
num = 0;
hlist = &hashinfo->ht[slot];
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each_rcu(sk, hlist) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0d0cc4ef2b85..40fe70fc2015 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -25,6 +25,7 @@ static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int tcp_app_win_max = 31;
static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
static int tcp_min_snd_mss_max = 65535;
static int ip_privileged_port_min;
@@ -1198,6 +1199,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_app_win_max,
},
{
.procname = "tcp_adv_win_scale",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fd68d49490f2..20db115c38c4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2165,7 +2165,7 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
struct msghdr cmsg_dummy;
msg_control_addr = (unsigned long)zc->msg_control;
- cmsg_dummy.msg_control = (void *)msg_control_addr;
+ cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
cmsg_dummy.msg_controllen =
(__kernel_size_t)zc->msg_controllen;
cmsg_dummy.msg_flags = in_compat_syscall()
@@ -2176,7 +2176,7 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
zc->msg_controllen == cmsg_dummy.msg_controllen) {
tcp_recv_timestamp(&cmsg_dummy, sk, tss);
zc->msg_control = (__u64)
- ((uintptr_t)cmsg_dummy.msg_control);
+ ((uintptr_t)cmsg_dummy.msg_control_user);
zc->msg_controllen =
(__u64)cmsg_dummy.msg_controllen;
zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..1b34050a7538 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
return NULL;
}
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
{
- int ret = 0;
-
/* all algorithms must implement these */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, that has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+ struct tcp_congestion_ops *existing;
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ existing = tcp_ca_find_key(old_ca->key);
+ if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+ pr_notice("%s not registered or non-unique key\n",
+ ca->name);
+ ret = -EINVAL;
+ } else if (existing != old_ca) {
+ pr_notice("invalid old congestion control algorithm to replace\n");
+ ret = -EINVAL;
+ } else {
+ /* Add the new one before removing the old one to keep
+ * one implementation available all the time.
+ */
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ list_del_rcu(&existing->list);
+ pr_debug("%s updated\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module or struct_ops gets removed entirely.
+ */
+ if (!ret)
+ synchronize_rcu();
+
+ return ret;
+}
+
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 89daa6b953ff..39bda2b1066e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2780,7 +2780,7 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
while (iter->cur_sk < iter->end_sk)
- sock_put(iter->batch[iter->cur_sk++]);
+ sock_gen_put(iter->batch[iter->cur_sk++]);
}
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
@@ -2941,7 +2941,7 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
* st->bucket. See tcp_seek_last_pos().
*/
st->offset++;
- sock_put(iter->batch[iter->cur_sk++]);
+ sock_gen_put(iter->batch[iter->cur_sk++]);
}
if (iter->cur_sk < iter->end_sk)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e1b679a590c9..2bbf13216a3d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -952,6 +952,7 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
+ net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;
/* By default, rate limit error messages.
* Except for pmtu discovery, it would break it.
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 1f53f2a74480..9edf1f45b1ed 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -362,9 +362,10 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
/*
* We won't send icmp if the destination is known
- * anycast.
+ * anycast unless we need to treat anycast as unicast.
*/
- if (ipv6_anycast_destination(dst, &fl6->daddr)) {
+ if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
+ ipv6_anycast_destination(dst, &fl6->daddr)) {
net_dbg_ratelimited("icmp6_send: acast source\n");
dst_release(dst);
return ERR_PTR(-EINVAL);
@@ -1195,6 +1196,15 @@ static struct ctl_table ipv6_icmp_table_template[] = {
.mode = 0644,
.proc_handler = proc_do_large_bitmap,
},
+ {
+ .procname = "error_anycast_as_unicast",
+ .data = &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
{ },
};
@@ -1212,6 +1222,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
+ table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
}
return table;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0b6140f0179d..9554cf46ed88 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1965,8 +1965,13 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if (proto == IPPROTO_ICMPV6) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ u8 icmp6_type;
- ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
+ if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
+ icmp6_type = fl6->fl6_icmp_type;
+ else
+ icmp6_type = icmp6_hdr(skb)->icmp6_type;
+ ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 2917dd8d198c..ae818ff46224 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -716,6 +716,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
goto done;
msg.msg_controllen = optlen;
+ msg.msg_control_is_user = false;
msg.msg_control = (void *)(opt+1);
ipc6.opt = opt;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4ab62a9c5c8e..7d0adb612bdd 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -141,10 +141,9 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
struct net *net = dev_net(skb->dev);
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
const struct in6_addr *saddr;
const struct in6_addr *daddr;
+ struct hlist_head *hlist;
struct sock *sk;
bool delivered = false;
__u8 hash;
@@ -155,7 +154,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
hash = raw_hashfunc(net, nexthdr);
hlist = &raw_v6_hashinfo.ht[hash];
rcu_read_lock();
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each_rcu(sk, hlist) {
int filtered;
if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
@@ -331,15 +330,14 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
u8 type, u8 code, int inner_offset, __be32 info)
{
struct net *net = dev_net(skb->dev);
- struct hlist_nulls_head *hlist;
- struct hlist_nulls_node *hnode;
+ struct hlist_head *hlist;
struct sock *sk;
int hash;
hash = raw_hashfunc(net, nexthdr);
hlist = &raw_v6_hashinfo.ht[hash];
rcu_read_lock();
- sk_nulls_for_each(sk, hnode, hlist) {
+ sk_for_each_rcu(sk, hlist) {
/* Note: ipv6_hdr(skb) != skb->data */
const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
index 488aec9e1a74..d1876f192225 100644
--- a/net/ipv6/rpl.c
+++ b/net/ipv6/rpl.c
@@ -32,7 +32,8 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri,
unsigned char cmpre)
{
- return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+ return sizeof(struct ipv6_rpl_sr_hdr) + (n * IPV6_PFXTAIL_LEN(cmpri)) +
+ IPV6_PFXTAIL_LEN(cmpre);
}
void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 70d81bba5093..063560e2cb1a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1024,7 +1024,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
ttl = iph6->hop_limit;
tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
- if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
+ if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) {
ip_rt_put(rt);
goto tx_error;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4caa70a1b871..e5a337e6b970 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1397,9 +1397,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
msg->msg_name = &sin;
msg->msg_namelen = sizeof(sin);
do_udp_sendmsg:
- if (ipv6_only_sock(sk))
- return -ENETUNREACH;
- return udp_sendmsg(sk, msg, len);
+ err = ipv6_only_sock(sk) ?
+ -ENETUNREACH : udp_sendmsg(sk, msg, len);
+ msg->msg_name = sin6;
+ msg->msg_namelen = addr_len;
+ return err;
}
}
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 4db5a554bdbd..41a74fc84ca1 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -677,8 +677,8 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
MODULE_DESCRIPTION("L2TP over IP");
MODULE_VERSION("1.0");
-/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
- * enums
+/* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol,
+ * because __stringify doesn't like enums
*/
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP);
-MODULE_ALIAS_NET_PF_PROTO(PF_INET, IPPROTO_L2TP);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2);
+MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 2478aa60145f..5137ea1861ce 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -806,8 +806,8 @@ MODULE_AUTHOR("Chris Elston <celston@katalix.com>");
MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6");
MODULE_VERSION("1.0");
-/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
- * enums
+/* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol,
+ * because __stringify doesn't like enums
*/
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP);
-MODULE_ALIAS_NET_PF_PROTO(PF_INET6, IPPROTO_L2TP);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2);
+MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115);
diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h
new file mode 100644
index 000000000000..49dc809cab29
--- /dev/null
+++ b/net/mac80211/drop.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mac80211 drop reason list
+ *
+ * Copyright (C) 2023 Intel Corporation
+ */
+
+#ifndef MAC80211_DROP_H
+#define MAC80211_DROP_H
+#include <net/dropreason.h>
+
+typedef unsigned int __bitwise ieee80211_rx_result;
+
+#define MAC80211_DROP_REASONS_MONITOR(R) \
+ R(RX_DROP_M_UNEXPECTED_4ADDR_FRAME) \
+ R(RX_DROP_M_BAD_BCN_KEYIDX) \
+ R(RX_DROP_M_BAD_MGMT_KEYIDX) \
+/* this line for the trailing \ - add before this */
+
+#define MAC80211_DROP_REASONS_UNUSABLE(R) \
+ R(RX_DROP_U_MIC_FAIL) \
+ R(RX_DROP_U_REPLAY) \
+ R(RX_DROP_U_BAD_MMIE) \
+/* this line for the trailing \ - add before this */
+
+/* having two enums allows for checking ieee80211_rx_result use with sparse */
+enum ___mac80211_drop_reason {
+/* if we get to the end of handlers with RX_CONTINUE this will be the reason */
+ ___RX_CONTINUE = SKB_CONSUMED,
+
+/* this never gets used as an argument to kfree_skb_reason() */
+ ___RX_QUEUED = SKB_NOT_DROPPED_YET,
+
+#define ENUM(x) ___ ## x,
+ ___RX_DROP_MONITOR = SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR <<
+ SKB_DROP_REASON_SUBSYS_SHIFT,
+ MAC80211_DROP_REASONS_MONITOR(ENUM)
+
+ ___RX_DROP_UNUSABLE = SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE <<
+ SKB_DROP_REASON_SUBSYS_SHIFT,
+ MAC80211_DROP_REASONS_UNUSABLE(ENUM)
+#undef ENUM
+};
+
+enum mac80211_drop_reason {
+ RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE,
+ RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED,
+ RX_DROP_MONITOR = (__force ieee80211_rx_result)___RX_DROP_MONITOR,
+ RX_DROP_UNUSABLE = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE,
+#define DEF(x) x = (__force ieee80211_rx_result)___ ## x,
+ MAC80211_DROP_REASONS_MONITOR(DEF)
+ MAC80211_DROP_REASONS_UNUSABLE(DEF)
+#undef DEF
+};
+
+#endif /* MAC80211_DROP_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9b7e184430b8..a0a7839cb961 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -33,6 +33,7 @@
#include "key.h"
#include "sta_info.h"
#include "debug.h"
+#include "drop.h"
extern const struct cfg80211_ops mac80211_config_ops;
@@ -170,13 +171,6 @@ struct ieee80211_tx_data {
unsigned int flags;
};
-
-typedef unsigned __bitwise ieee80211_rx_result;
-#define RX_CONTINUE ((__force ieee80211_rx_result) 0u)
-#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u)
-#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
-#define RX_QUEUED ((__force ieee80211_rx_result) 3u)
-
/**
* enum ieee80211_packet_rx_flags - packet RX flags
* @IEEE80211_RX_AMSDU: a-MSDU packet
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ddf2b7811c55..55cdfaef0f5d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -22,6 +22,7 @@
#include <linux/bitmap.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
+#include <net/dropreason.h>
#include <net/cfg80211.h>
#include <net/addrconf.h>
@@ -1542,6 +1543,28 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
}
EXPORT_SYMBOL(ieee80211_free_hw);
+static const char * const drop_reasons_monitor[] = {
+#define V(x) #x,
+ [0] = "RX_DROP_MONITOR",
+ MAC80211_DROP_REASONS_MONITOR(V)
+};
+
+static struct drop_reason_list drop_reason_list_monitor = {
+ .reasons = drop_reasons_monitor,
+ .n_reasons = ARRAY_SIZE(drop_reasons_monitor),
+};
+
+static const char * const drop_reasons_unusable[] = {
+ [0] = "RX_DROP_UNUSABLE",
+ MAC80211_DROP_REASONS_UNUSABLE(V)
+#undef V
+};
+
+static struct drop_reason_list drop_reason_list_unusable = {
+ .reasons = drop_reasons_unusable,
+ .n_reasons = ARRAY_SIZE(drop_reasons_unusable),
+};
+
static int __init ieee80211_init(void)
{
struct sk_buff *skb;
@@ -1559,6 +1582,11 @@ static int __init ieee80211_init(void)
if (ret)
goto err_netdev;
+ drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR,
+ &drop_reason_list_monitor);
+ drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE,
+ &drop_reason_list_unusable);
+
return 0;
err_netdev:
rc80211_minstrel_exit();
@@ -1574,6 +1602,9 @@ static void __exit ieee80211_exit(void)
ieee80211_iface_exit();
+ drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR);
+ drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE);
+
rcu_barrier();
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index db3451f5f2fb..58222c077898 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1826,7 +1826,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
cfg80211_rx_unexpected_4addr_frame(
rx->sdata->dev, sta->sta.addr,
GFP_ATOMIC);
- return RX_DROP_MONITOR;
+ return RX_DROP_M_UNEXPECTED_4ADDR_FRAME;
}
/*
* Update counter and free packet here to avoid
@@ -1961,7 +1961,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
skb->data,
skb->len);
- return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+ return RX_DROP_M_BAD_BCN_KEYIDX;
}
rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx);
@@ -1975,7 +1975,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (mmie_keyidx < NUM_DEFAULT_KEYS ||
mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
- return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+ return RX_DROP_M_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */
if (rx->link_sta) {
if (ieee80211_is_group_privacy_action(skb) &&
test_sta_flag(rx->sta, WLAN_STA_MFP))
@@ -3960,7 +3960,8 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
}
static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
- struct ieee80211_rate *rate)
+ struct ieee80211_rate *rate,
+ ieee80211_rx_result reason)
{
struct ieee80211_sub_if_data *sdata;
struct ieee80211_local *local = rx->local;
@@ -4024,42 +4025,38 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
}
out_free_skb:
- dev_kfree_skb(skb);
+ kfree_skb_reason(skb, (__force u32)reason);
}
static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
ieee80211_rx_result res)
{
- switch (res) {
- case RX_DROP_MONITOR:
- I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
- if (rx->sta)
- rx->link_sta->rx_stats.dropped++;
- fallthrough;
- case RX_CONTINUE: {
- struct ieee80211_rate *rate = NULL;
- struct ieee80211_supported_band *sband;
- struct ieee80211_rx_status *status;
-
- status = IEEE80211_SKB_RXCB((rx->skb));
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_rate *rate = NULL;
- sband = rx->local->hw.wiphy->bands[status->band];
- if (status->encoding == RX_ENC_LEGACY)
- rate = &sband->bitrates[status->rate_idx];
+ if (res == RX_QUEUED) {
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+ return;
+ }
- ieee80211_rx_cooked_monitor(rx, rate);
- break;
- }
- case RX_DROP_UNUSABLE:
+ if (res != RX_CONTINUE) {
I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
if (rx->sta)
rx->link_sta->rx_stats.dropped++;
- dev_kfree_skb(rx->skb);
- break;
- case RX_QUEUED:
- I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
- break;
}
+
+ if (u32_get_bits((__force u32)res, SKB_DROP_REASON_SUBSYS_MASK) ==
+ SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE) {
+ kfree_skb_reason(rx->skb, (__force u32)res);
+ return;
+ }
+
+ sband = rx->local->hw.wiphy->bands[status->band];
+ if (status->encoding == RX_ENC_LEGACY)
+ rate = &sband->bitrates[status->rate_idx];
+
+ ieee80211_rx_cooked_monitor(rx, rate, res);
}
static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 20f742b5503b..4133496da378 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -550,7 +550,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
if (res < 0 ||
(!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
key->u.ccmp.replays++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_REPLAY;
}
if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -564,7 +564,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
data_len,
skb->data + skb->len - mic_len))
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_MIC_FAIL;
}
memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
@@ -746,7 +746,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
if (res < 0 ||
(!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
key->u.gcmp.replays++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_REPLAY;
}
if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -761,7 +761,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
data_len,
skb->data + skb->len -
IEEE80211_GCMP_MIC_LEN))
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_MIC_FAIL;
}
memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
@@ -930,13 +930,13 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
(skb->data + skb->len - sizeof(*mmie));
if (mmie->element_id != WLAN_EID_MMIE ||
mmie->length != sizeof(*mmie) - 2)
- return RX_DROP_UNUSABLE; /* Invalid MMIE */
+ return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
bip_ipn_swap(ipn, mmie->sequence_number);
if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
key->u.aes_cmac.replays++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_REPLAY;
}
if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -946,7 +946,7 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
skb->data + 24, skb->len - 24, mic);
if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_cmac.icverrors++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_MIC_FAIL;
}
}
@@ -986,7 +986,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
key->u.aes_cmac.replays++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_REPLAY;
}
if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -996,7 +996,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
skb->data + 24, skb->len - 24, mic);
if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_cmac.icverrors++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_MIC_FAIL;
}
}
@@ -1079,13 +1079,13 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
(skb->data + skb->len - sizeof(*mmie));
if (mmie->element_id != WLAN_EID_MMIE ||
mmie->length != sizeof(*mmie) - 2)
- return RX_DROP_UNUSABLE; /* Invalid MMIE */
+ return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
bip_ipn_swap(ipn, mmie->sequence_number);
if (memcmp(ipn, key->u.aes_gmac.rx_pn, 6) <= 0) {
key->u.aes_gmac.replays++;
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_REPLAY;
}
if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -1104,7 +1104,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_gmac.icverrors++;
kfree(mic);
- return RX_DROP_UNUSABLE;
+ return RX_DROP_U_MIC_FAIL;
}
kfree(mic);
}
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
index d237d142171c..bceaab8dd8e4 100644
--- a/net/mptcp/fastopen.c
+++ b/net/mptcp/fastopen.c
@@ -9,11 +9,18 @@
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
struct request_sock *req)
{
- struct sock *ssk = subflow->tcp_sock;
- struct sock *sk = subflow->conn;
+ struct sock *sk, *ssk;
struct sk_buff *skb;
struct tcp_sock *tp;
+ /* on early fallback the subflow context is deleted by
+ * subflow_syn_recv_sock()
+ */
+ if (!subflow)
+ return;
+
+ ssk = subflow->tcp_sock;
+ sk = subflow->conn;
tp = tcp_sk(ssk);
subflow->is_mptfo = 1;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index b30cea2fbf3f..19a01b6566f1 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -442,7 +442,6 @@ static void clear_3rdack_retransmission(struct sock *sk)
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
bool snd_data_fin_enable,
unsigned int *size,
- unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -556,7 +555,6 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
bool snd_data_fin_enable,
unsigned int *size,
- unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -580,7 +578,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
opts->ext_copy = *mpext;
}
- remaining -= map_size;
dss_size = map_size;
if (skb && snd_data_fin_enable)
mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
@@ -851,9 +848,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
}
snd_data_fin = mptcp_data_fin_enabled(msk);
- if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
+ if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts))
ret = true;
- else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) {
+ else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) {
unsigned int mp_fail_size;
ret = true;
@@ -1001,7 +998,7 @@ check_notify:
clear_3rdack_retransmission(ssk);
mptcp_pm_subflow_established(msk);
} else {
- mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC);
+ mptcp_pm_fully_established(msk, ssk);
}
return true;
@@ -1192,9 +1189,8 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
if (mp_opt.data_fin && mp_opt.data_len == 1 &&
- mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) &&
- schedule_work(&msk->work))
- sock_hold(subflow->conn);
+ mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64))
+ mptcp_schedule_work((struct sock *)msk);
return true;
}
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 70f0ced3ca86..78c924506e83 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -126,7 +126,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
return true;
}
-void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp)
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
{
struct mptcp_pm_data *pm = &msk->pm;
bool announce = false;
@@ -150,7 +150,7 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk,
spin_unlock_bh(&pm->lock);
if (announce)
- mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp);
+ mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 1c42bebca39e..bc343dab5e3f 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1035,8 +1035,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
lock_sock(newsk);
ssock = __mptcp_nmpc_socket(mptcp_sk(newsk));
release_sock(newsk);
- if (!ssock)
- return -EINVAL;
+ if (IS_ERR(ssock))
+ return PTR_ERR(ssock);
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index a02d3cbf2a1b..27a275805c06 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -25,8 +25,8 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk)
}
}
-int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry)
+static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
{
DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_pm_addr_entry *match = NULL;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 2d26b9114373..08dc53f56bc2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -49,18 +49,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk);
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;
-/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
- * completed yet or has failed, return the subflow socket.
- * Otherwise return NULL.
- */
-struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
-{
- if (!msk->subflow || READ_ONCE(msk->can_ack))
- return NULL;
-
- return msk->subflow;
-}
-
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
@@ -116,6 +104,31 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
return 0;
}
+/* If the MPC handshake is not started, returns the first subflow,
+ * eventually allocating it.
+ */
+struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret;
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ return ERR_PTR(-EINVAL);
+
+ if (!msk->subflow) {
+ if (msk->first)
+ return ERR_PTR(-EINVAL);
+
+ ret = __mptcp_socket_create(msk);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mptcp_sockopt_sync(msk, msk->first);
+ }
+
+ return msk->subflow;
+}
+
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
@@ -1662,13 +1675,31 @@ static void mptcp_set_nospace(struct sock *sk)
static int mptcp_disconnect(struct sock *sk, int flags);
-static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg,
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
size_t len, int *copied_syn)
{
unsigned int saved_flags = msg->msg_flags;
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ struct sock *ssk;
int ret;
+ /* on flags based fastopen the mptcp is supposed to create the
+ * first subflow right now. Otherwise we are in the defer_connect
+ * path, and the first subflow must be already present.
+ * Since the defer_connect flag is cleared after the first succsful
+ * fastopen attempt, no need to check for additional subflow status.
+ */
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ ssock = __mptcp_nmpc_socket(msk);
+ if (IS_ERR(ssock))
+ return PTR_ERR(ssock);
+ }
+ if (!msk->first)
+ return -EINVAL;
+
+ ssk = msk->first;
+
lock_sock(ssk);
msg->msg_flags |= MSG_DONTWAIT;
msk->connect_flags = O_NONBLOCK;
@@ -1691,6 +1722,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh
} else if (ret && ret != -EINPROGRESS) {
mptcp_disconnect(sk, 0);
}
+ inet_sk(sk)->defer_connect = 0;
return ret;
}
@@ -1699,7 +1731,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
- struct socket *ssock;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1709,12 +1740,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect ||
- msg->msg_flags & MSG_FASTOPEN))) {
+ if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
- ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
+ ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
copied += copied_syn;
if (ret == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -2315,7 +2344,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
unsigned int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- bool need_push, dispose_it;
+ bool dispose_it, need_push = false;
+
+ /* If the first subflow moved to a close state before accept, e.g. due
+ * to an incoming reset, mptcp either:
+ * - if either the subflow or the msk are dead, destroy the context
+ * (the subflow socket is deleted by inet_child_forget) and the msk
+ * - otherwise do nothing at the moment and take action at accept and/or
+ * listener shutdown - user-space must be able to accept() the closed
+ * socket.
+ */
+ if (msk->in_accept_queue && msk->first == ssk) {
+ if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD))
+ return;
+
+ /* ensure later check in mptcp_worker() will dispose the msk */
+ sock_set_flag(sk, SOCK_DEAD);
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ mptcp_subflow_drop_ctx(ssk);
+ goto out_release;
+ }
dispose_it = !msk->subflow || ssk != msk->subflow->sk;
if (dispose_it)
@@ -2351,28 +2399,22 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (!inet_csk(ssk)->icsk_ulp_ops) {
WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD));
kfree_rcu(subflow, rcu);
- } else if (msk->in_accept_queue && msk->first == ssk) {
- /* if the first subflow moved to a close state, e.g. due to
- * incoming reset and we reach here before inet_child_forget()
- * the TCP stack could later try to close it via
- * inet_csk_listen_stop(), or deliver it to the user space via
- * accept().
- * We can't delete the subflow - or risk a double free - nor let
- * the msk survive - or will be leaked in the non accept scenario:
- * fallback and let TCP cope with the subflow cleanup.
- */
- WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD));
- mptcp_subflow_drop_ctx(ssk);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
- if (ssk->sk_state == TCP_LISTEN)
+ if (ssk->sk_state == TCP_LISTEN) {
+ tcp_set_state(ssk, TCP_CLOSE);
+ mptcp_subflow_queue_clean(sk, ssk);
+ inet_csk_listen_stop(ssk);
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+ }
__tcp_close(ssk, 0);
/* close acquired an extra ref */
__sock_put(ssk);
}
+
+out_release:
release_sock(ssk);
sock_put(ssk);
@@ -2427,21 +2469,14 @@ static void __mptcp_close_subflow(struct sock *sk)
mptcp_close_ssk(sk, ssk, subflow);
}
- /* if the MPC subflow has been closed before the msk is accepted,
- * msk will never be accept-ed, close it now
- */
- if (!msk->first && msk->in_accept_queue) {
- sock_set_flag(sk, SOCK_DEAD);
- inet_sk_state_store(sk, TCP_CLOSE);
- }
}
-static bool mptcp_check_close_timeout(const struct sock *sk)
+static bool mptcp_should_close(const struct sock *sk)
{
s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
struct mptcp_subflow_context *subflow;
- if (delta >= TCP_TIMEWAIT_LEN)
+ if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue)
return true;
/* if all subflows are in closed status don't bother with additional
@@ -2626,7 +2661,7 @@ static void mptcp_worker(struct work_struct *work)
lock_sock(sk);
state = sk->sk_state;
- if (unlikely(state == TCP_CLOSE))
+ if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto unlock;
mptcp_check_data_fin_ack(sk);
@@ -2649,7 +2684,7 @@ static void mptcp_worker(struct work_struct *work)
* even if it is orphaned and in FIN_WAIT2 state
*/
if (sock_flag(sk, SOCK_DEAD)) {
- if (mptcp_check_close_timeout(sk)) {
+ if (mptcp_should_close(sk)) {
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_do_fastclose(sk);
}
@@ -2728,10 +2763,6 @@ static int mptcp_init_sock(struct sock *sk)
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
- ret = __mptcp_socket_create(mptcp_sk(sk));
- if (ret)
- return ret;
-
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
@@ -2895,6 +2926,14 @@ static void __mptcp_destroy_sock(struct sock *sk)
sock_put(sk);
}
+void __mptcp_unaccepted_force_close(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ __mptcp_destroy_sock(sk);
+}
+
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
/* Concurrent splices from sk_receive_queue into receive_queue will
@@ -2928,10 +2967,13 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_check_readable(msk)) {
- /* the msk has read data, do the MPTCP equivalent of TCP reset */
+ if (mptcp_check_readable(msk) || timeout < 0) {
+ /* If the msk has read data, or the caller explicitly ask it,
+ * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
+ */
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_do_fastclose(sk);
+ timeout = 0;
} else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
}
@@ -3143,7 +3185,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
struct socket *listener;
struct sock *newsk;
- listener = __mptcp_nmpc_socket(msk);
+ listener = msk->subflow;
if (WARN_ON_ONCE(!listener)) {
*err = -EINVAL;
return NULL;
@@ -3363,7 +3405,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum)
struct mptcp_sock *msk = mptcp_sk(sk);
struct socket *ssock;
- ssock = __mptcp_nmpc_socket(msk);
+ ssock = msk->subflow;
pr_debug("msk=%p, subflow=%p", msk, ssock);
if (WARN_ON_ONCE(!ssock))
return -EINVAL;
@@ -3551,8 +3593,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
int err = -EINVAL;
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
- return -EINVAL;
+ if (IS_ERR(ssock))
+ return PTR_ERR(ssock);
mptcp_token_destroy(msk);
inet_sk_state_store(sk, TCP_SYN_SENT);
@@ -3640,8 +3682,8 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
lock_sock(sock->sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
goto unlock;
}
@@ -3677,8 +3719,8 @@ static int mptcp_listen(struct socket *sock, int backlog)
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
goto unlock;
}
@@ -3709,7 +3751,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
pr_debug("msk=%p", msk);
- ssock = __mptcp_nmpc_socket(msk);
+ /* buggy applications can call accept on socket states other then LISTEN
+ * but no need to allocate the first subflow just to error out.
+ */
+ ssock = msk->subflow;
if (!ssock)
return -EINVAL;
@@ -3733,6 +3778,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (!ssk->sk_socket)
mptcp_sock_graft(ssk, newsock);
}
+
+ /* Do late cleanup for the first subflow as necessary. Also
+ * deal with bad peers not doing a complete shutdown.
+ */
+ if (msk->first &&
+ unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) {
+ __mptcp_close_ssk(newsk, msk->first,
+ mptcp_subflow_ctx(msk->first), 0);
+ if (unlikely(list_empty(&msk->conn_list)))
+ inet_sk_state_store(newsk, TCP_CLOSE);
+ }
+
release_sock(newsk);
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e1310bc113be..2d7b2c80a164 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -626,10 +626,12 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow);
void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
-struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
+struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
+void __mptcp_unaccepted_force_close(struct sock *sk);
void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk);
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
@@ -782,7 +784,7 @@ bool mptcp_pm_addr_families_match(const struct sock *sk,
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
-void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp);
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk);
@@ -830,8 +832,6 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *
void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
struct list_head *rm_list);
-int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry);
void mptcp_free_local_addr_list(struct mptcp_sock *msk);
int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info);
int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index b655cebda0f3..d4258869ac48 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -301,9 +301,9 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_BINDTOIFINDEX:
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
+ if (IS_ERR(ssock)) {
release_sock(sk);
- return -EINVAL;
+ return PTR_ERR(ssock);
}
ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
@@ -396,9 +396,9 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
case IPV6_FREEBIND:
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
+ if (IS_ERR(ssock)) {
release_sock(sk);
- return -EINVAL;
+ return PTR_ERR(ssock);
}
ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
@@ -693,9 +693,9 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
+ if (IS_ERR(ssock)) {
release_sock(sk);
- return -EINVAL;
+ return PTR_ERR(ssock);
}
issk = inet_sk(ssock->sk);
@@ -762,13 +762,15 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
{
struct sock *sk = (struct sock *)msk;
struct socket *sock;
- int ret = -EINVAL;
+ int ret;
/* Limit to first subflow, before the connection establishment */
lock_sock(sk);
sock = __mptcp_nmpc_socket(msk);
- if (!sock)
+ if (IS_ERR(sock)) {
+ ret = PTR_ERR(sock);
goto unlock;
+ }
ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen);
@@ -861,7 +863,7 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
{
struct sock *sk = (struct sock *)msk;
struct socket *ssock;
- int ret = -EINVAL;
+ int ret;
struct sock *ssk;
lock_sock(sk);
@@ -872,8 +874,10 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
}
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
+ if (IS_ERR(ssock)) {
+ ret = PTR_ERR(ssock);
goto out;
+ }
ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 33dd27765116..ba065b66551a 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -408,9 +408,8 @@ void mptcp_subflow_reset(struct sock *ssk)
tcp_send_active_reset(ssk, GFP_ATOMIC);
tcp_done(ssk);
- if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
- schedule_work(&mptcp_sk(sk)->work))
- return; /* worker will put sk for us */
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
sock_put(sk);
}
@@ -716,9 +715,12 @@ void mptcp_subflow_drop_ctx(struct sock *ssk)
if (!ctx)
return;
- subflow_ulp_fallback(ssk, ctx);
- if (ctx->conn)
- sock_put(ctx->conn);
+ list_del(&mptcp_subflow_ctx(ssk)->node);
+ if (inet_csk(ssk)->icsk_ulp_ops) {
+ subflow_ulp_fallback(ssk, ctx);
+ if (ctx->conn)
+ sock_put(ctx->conn);
+ }
kfree_rcu(ctx, rcu);
}
@@ -851,7 +853,7 @@ create_child:
*/
if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
mptcp_subflow_fully_established(ctx, &mp_opt);
- mptcp_pm_fully_established(owner, child, GFP_ATOMIC);
+ mptcp_pm_fully_established(owner, child);
ctx->pm_notified = 1;
}
} else if (ctx->mp_join) {
@@ -1101,8 +1103,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
skb_ext_del(skb, SKB_EXT_MPTCP);
return MAPPING_OK;
} else {
- if (updated && schedule_work(&msk->work))
- sock_hold((struct sock *)msk);
+ if (updated)
+ mptcp_schedule_work((struct sock *)msk);
return MAPPING_DATA_FIN;
}
@@ -1205,17 +1207,12 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
/* sched mptcp worker to remove the subflow if no more data is pending */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
- struct sock *sk = (struct sock *)msk;
-
if (likely(ssk->sk_state != TCP_CLOSE))
return;
if (skb_queue_empty(&ssk->sk_receive_queue) &&
- !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) {
- sock_hold(sk);
- if (!schedule_work(&msk->work))
- sock_put(sk);
- }
+ !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ mptcp_schedule_work((struct sock *)msk);
}
static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
@@ -1808,6 +1805,77 @@ static void subflow_state_change(struct sock *sk)
}
}
+void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
+{
+ struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
+ struct mptcp_sock *msk, *next, *head = NULL;
+ struct request_sock *req;
+ struct sock *sk;
+
+ /* build a list of all unaccepted mptcp sockets */
+ spin_lock_bh(&queue->rskq_lock);
+ for (req = queue->rskq_accept_head; req; req = req->dl_next) {
+ struct mptcp_subflow_context *subflow;
+ struct sock *ssk = req->sk;
+
+ if (!sk_is_mptcp(ssk))
+ continue;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ if (!subflow || !subflow->conn)
+ continue;
+
+ /* skip if already in list */
+ sk = subflow->conn;
+ msk = mptcp_sk(sk);
+ if (msk->dl_next || msk == head)
+ continue;
+
+ sock_hold(sk);
+ msk->dl_next = head;
+ head = msk;
+ }
+ spin_unlock_bh(&queue->rskq_lock);
+ if (!head)
+ return;
+
+ /* can't acquire the msk socket lock under the subflow one,
+ * or will cause ABBA deadlock
+ */
+ release_sock(listener_ssk);
+
+ for (msk = head; msk; msk = next) {
+ sk = (struct sock *)msk;
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ next = msk->dl_next;
+ msk->dl_next = NULL;
+
+ __mptcp_unaccepted_force_close(sk);
+ release_sock(sk);
+
+ /* lockdep will report a false positive ABBA deadlock
+ * between cancel_work_sync and the listener socket.
+ * The involved locks belong to different sockets WRT
+ * the existing AB chain.
+ * Using a per socket key is problematic as key
+ * deregistration requires process context and must be
+ * performed at socket disposal time, in atomic
+ * context.
+ * Just tell lockdep to consider the listener socket
+ * released here.
+ */
+ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
+ mptcp_cancel_work(sk);
+ mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
+ sock_put(sk);
+ }
+
+ /* we are still under the listener msk socket lock */
+ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+}
+
static int subflow_ulp_init(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index cd99e6dc1f35..3f821b7ba646 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -192,8 +192,7 @@ BTF_ID(struct, nf_conn___init)
/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, int size, enum bpf_access_type atype,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ int off, int size)
{
const struct btf_type *ncit, *nct, *t;
size_t end;
@@ -401,8 +400,6 @@ __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
*/
__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
- if (!nfct)
- return;
nf_ct_put(nfct);
}
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6004d4b24451..e48ab8dfb541 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3447,6 +3447,64 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
return 0;
}
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ struct nft_ctx *pctx = (struct nft_ctx *)ctx;
+ const struct nft_data *data;
+ int err;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ pctx->level++;
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ pctx->level--;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+struct nft_set_elem_catchall {
+ struct list_head list;
+ struct rcu_head rcu;
+ void *elem;
+};
+
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_elem elem;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ elem.priv = catchall->elem;
+ ret = nft_setelem_validate(ctx, set, NULL, &elem);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+}
+
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
const struct nft_chain *chain,
const struct nlattr *nla);
@@ -4759,12 +4817,6 @@ err_set_name:
return err;
}
-struct nft_set_elem_catchall {
- struct list_head list;
- struct rcu_head rcu;
- void *elem;
-};
-
static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
struct nft_set *set)
{
@@ -6056,7 +6108,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
+ (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
return -EINVAL;
if (flags != 0) {
@@ -7052,7 +7105,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
}
if (nla[NFTA_OBJ_USERDATA]) {
- obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL);
+ obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
if (obj->udata == NULL)
goto err_userdata;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index cae5a6724163..cecf8ab90e58 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -199,37 +199,6 @@ nla_put_failure:
return -1;
}
-static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- struct nft_ctx *pctx = (struct nft_ctx *)ctx;
- const struct nft_data *data;
- int err;
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- pctx->level++;
- err = nft_chain_validate(ctx, data->verdict.chain);
- if (err < 0)
- return err;
- pctx->level--;
- break;
- default:
- break;
- }
-
- return 0;
-}
-
static int nft_lookup_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **d)
@@ -245,9 +214,12 @@ static int nft_lookup_validate(const struct nft_ctx *ctx,
iter.skip = 0;
iter.count = 0;
iter.err = 0;
- iter.fn = nft_lookup_validate_setelem;
+ iter.fn = nft_setelem_validate;
priv->set->ops->walk(ctx, priv->set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_validate(ctx, priv->set);
+
if (iter.err < 0)
return iter.err;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 877f1da1a8ac..1db4742e443d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1952,7 +1952,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
- size_t copied;
+ size_t copied, max_recvmsg_len;
struct sk_buff *skb, *data_skb;
int err, ret;
@@ -1985,9 +1985,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
#endif
/* Record the max length of recvmsg() calls for future allocations */
- nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
- nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
- SKB_WITH_OVERHEAD(32768));
+ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len);
+ max_recvmsg_len = min_t(size_t, max_recvmsg_len,
+ SKB_WITH_OVERHEAD(32768));
+ WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len);
copied = data_skb->len;
if (len < copied) {
@@ -2234,6 +2235,7 @@ static int netlink_dump(struct sock *sk)
struct netlink_ext_ack extack = {};
struct netlink_callback *cb;
struct sk_buff *skb = NULL;
+ size_t max_recvmsg_len;
struct module *module;
int err = -ENOBUFS;
int alloc_min_size;
@@ -2256,8 +2258,9 @@ static int netlink_dump(struct sock *sk)
cb = &nlk->cb;
alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
- if (alloc_min_size < nlk->max_recvmsg_len) {
- alloc_size = nlk->max_recvmsg_len;
+ max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len);
+ if (alloc_min_size < max_recvmsg_len) {
+ alloc_size = max_recvmsg_len;
skb = alloc_skb(alloc_size,
(GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOWARN | __GFP_NORETRY);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ca3ebfdb3023..a8cf9a88758e 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -913,7 +913,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
{
struct vport *vport = ovs_vport_rcu(dp, out_port);
- if (likely(vport)) {
+ if (likely(vport && netif_carrier_ok(vport->dev))) {
u16 mru = OVS_CB(skb)->mru;
u32 cutlen = OVS_CB(skb)->cutlen;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 568f8d76e3c1..6080c0db0814 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb,
}
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
- size_t *len)
+ size_t *len, int vnet_hdr_sz)
{
- struct virtio_net_hdr vnet_hdr;
+ struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
- if (*len < sizeof(vnet_hdr))
+ if (*len < vnet_hdr_sz)
return -EINVAL;
- *len -= sizeof(vnet_hdr);
+ *len -= vnet_hdr_sz;
- if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
+ if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
return -EINVAL;
- return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+ return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
}
/*
@@ -2250,7 +2250,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
__u32 ts_status;
bool is_drop_n_account = false;
unsigned int slot_id = 0;
- bool do_vnet = false;
+ int vnet_hdr_sz = 0;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
* We may add members to them until current aligned size without forcing
@@ -2308,10 +2308,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
netoff = TPACKET_ALIGN(po->tp_hdrlen +
(maclen < 16 ? 16 : maclen)) +
po->tp_reserve;
- if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
- netoff += sizeof(struct virtio_net_hdr);
- do_vnet = true;
- }
+ vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
+ if (vnet_hdr_sz)
+ netoff += vnet_hdr_sz;
macoff = netoff - maclen;
}
if (netoff > USHRT_MAX) {
@@ -2337,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
snaplen = po->rx_ring.frame_size - macoff;
if ((int)snaplen < 0) {
snaplen = 0;
- do_vnet = false;
+ vnet_hdr_sz = 0;
}
}
} else if (unlikely(macoff + snaplen >
@@ -2351,7 +2350,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (unlikely((int)snaplen < 0)) {
snaplen = 0;
macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
- do_vnet = false;
+ vnet_hdr_sz = 0;
}
}
spin_lock(&sk->sk_receive_queue.lock);
@@ -2367,7 +2366,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
__set_bit(slot_id, po->rx_ring.rx_owner_map);
}
- if (do_vnet &&
+ if (vnet_hdr_sz &&
virtio_net_hdr_from_skb(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr),
vio_le(), true, 0)) {
@@ -2551,16 +2550,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
}
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
- struct virtio_net_hdr *vnet_hdr)
+ struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
- if (*len < sizeof(*vnet_hdr))
+ int ret;
+
+ if (*len < vnet_hdr_sz)
return -EINVAL;
- *len -= sizeof(*vnet_hdr);
+ *len -= vnet_hdr_sz;
if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
return -EFAULT;
- return __packet_snd_vnet_parse(vnet_hdr, *len);
+ ret = __packet_snd_vnet_parse(vnet_hdr, *len);
+ if (ret)
+ return ret;
+
+ /* move iter to point to the start of mac header */
+ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
+ iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
+
+ return 0;
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
@@ -2722,6 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
void *ph;
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+ int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
unsigned char *addr = NULL;
int tp_len, size_max;
void *data;
@@ -2779,8 +2789,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
size_max = po->tx_ring.frame_size
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
- if ((size_max > dev->mtu + reserve + VLAN_HLEN) &&
- !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
+ if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
size_max = dev->mtu + reserve + VLAN_HLEN;
reinit_completion(&po->skb_completion);
@@ -2809,10 +2818,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
status = TP_STATUS_SEND_REQUEST;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
- if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
+ if (vnet_hdr_sz) {
vnet_hdr = data;
- data += sizeof(*vnet_hdr);
- tp_len -= sizeof(*vnet_hdr);
+ data += vnet_hdr_sz;
+ tp_len -= vnet_hdr_sz;
if (tp_len < 0 ||
__packet_snd_vnet_parse(vnet_hdr, tp_len)) {
tp_len = -EINVAL;
@@ -2837,7 +2846,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
addr, hlen, copylen, &sockc);
if (likely(tp_len >= 0) &&
tp_len > dev->mtu + reserve &&
- !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) &&
+ !vnet_hdr_sz &&
!packet_extra_vlan_len_allowed(dev, skb))
tp_len = -EMSGSIZE;
@@ -2856,7 +2865,7 @@ tpacket_error:
}
}
- if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
+ if (vnet_hdr_sz) {
if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
tp_len = -EINVAL;
goto tpacket_error;
@@ -2946,7 +2955,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
struct virtio_net_hdr vnet_hdr = { 0 };
int offset = 0;
struct packet_sock *po = pkt_sk(sk);
- bool has_vnet_hdr = false;
+ int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
int hlen, tlen, linear;
int extra_len = 0;
@@ -2990,11 +2999,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (sock->type == SOCK_RAW)
reserve = dev->hard_header_len;
- if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
- err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+ if (vnet_hdr_sz) {
+ err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
if (err)
goto out_unlock;
- has_vnet_hdr = true;
}
if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -3064,11 +3072,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
packet_parse_headers(skb, sock);
- if (has_vnet_hdr) {
+ if (vnet_hdr_sz) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
if (err)
goto out_free;
- len += sizeof(vnet_hdr);
+ len += vnet_hdr_sz;
virtio_net_hdr_set_proto(skb, &vnet_hdr);
}
@@ -3408,7 +3416,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
- int vnet_hdr_len = 0;
+ int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
unsigned int origlen = 0;
err = -EINVAL;
@@ -3449,11 +3457,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
packet_rcv_try_clear_pressure(pkt_sk(sk));
- if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) {
- err = packet_rcv_vnet(msg, skb, &len);
+ if (vnet_hdr_len) {
+ err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
if (err)
goto out_free;
- vnet_hdr_len = sizeof(struct virtio_net_hdr);
}
/* You lose any data beyond the buffer you gave. If it worries
@@ -3915,8 +3922,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
return 0;
}
case PACKET_VNET_HDR:
+ case PACKET_VNET_HDR_SZ:
{
- int val;
+ int val, hdr_len;
if (sock->type != SOCK_RAW)
return -EINVAL;
@@ -3925,11 +3933,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
+ if (optname == PACKET_VNET_HDR_SZ) {
+ if (val && val != sizeof(struct virtio_net_hdr) &&
+ val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
+ return -EINVAL;
+ hdr_len = val;
+ } else {
+ hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
+ }
lock_sock(sk);
if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
ret = -EBUSY;
} else {
- packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val);
+ WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
ret = 0;
}
release_sock(sk);
@@ -4062,7 +4078,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
break;
case PACKET_VNET_HDR:
- val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR);
+ val = !!READ_ONCE(po->vnet_hdr_sz);
+ break;
+ case PACKET_VNET_HDR_SZ:
+ val = READ_ONCE(po->vnet_hdr_sz);
break;
case PACKET_VERSION:
val = po->tp_version;
diff --git a/net/packet/diag.c b/net/packet/diag.c
index de4ced5cf3e8..d0c4eda4cdc6 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
pinfo.pdi_flags |= PDI_AUXDATA;
if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV))
pinfo.pdi_flags |= PDI_ORIGDEV;
- if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
+ if (READ_ONCE(po->vnet_hdr_sz))
pinfo.pdi_flags |= PDI_VNETHDR;
if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS))
pinfo.pdi_flags |= PDI_LOSS;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 27930f69f368..63f4865202c1 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -118,6 +118,7 @@ struct packet_sock {
struct mutex pg_vec_lock;
unsigned long flags;
int ifindex; /* bound device */
+ u8 vnet_hdr_sz;
__be16 num;
struct packet_rollover *rollover;
struct packet_mclist *mclist;
@@ -139,7 +140,6 @@ enum packet_sock_flags {
PACKET_SOCK_AUXDATA,
PACKET_SOCK_TX_HAS_OFF,
PACKET_SOCK_TP_LOSS,
- PACKET_SOCK_HAS_VNET_HDR,
PACKET_SOCK_RUNNING,
PACKET_SOCK_PRESSURE,
PACKET_SOCK_QDISC_BYPASS,
diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c
index 5c2fb992803b..76f0434d3d06 100644
--- a/net/qrtr/af_qrtr.c
+++ b/net/qrtr/af_qrtr.c
@@ -393,10 +393,12 @@ static struct qrtr_node *qrtr_node_lookup(unsigned int nid)
struct qrtr_node *node;
unsigned long flags;
+ mutex_lock(&qrtr_node_lock);
spin_lock_irqsave(&qrtr_nodes_lock, flags);
node = radix_tree_lookup(&qrtr_nodes, nid);
node = qrtr_node_acquire(node);
spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
+ mutex_unlock(&qrtr_node_lock);
return node;
}
@@ -496,6 +498,11 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
if (!size || len != ALIGN(size, 4) + hdrlen)
goto err;
+ if ((cb->type == QRTR_TYPE_NEW_SERVER ||
+ cb->type == QRTR_TYPE_RESUME_TX) &&
+ size < sizeof(struct qrtr_ctrl_pkt))
+ goto err;
+
if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA &&
cb->type != QRTR_TYPE_RESUME_TX)
goto err;
@@ -508,9 +515,6 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
/* Remote node endpoint can bridge other distant nodes */
const struct qrtr_ctrl_pkt *pkt;
- if (size < sizeof(*pkt))
- goto err;
-
pkt = data + hdrlen;
qrtr_node_assign(node, le32_to_cpu(pkt->server.node));
}
diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c
index 722936f7dd98..0f25a386138c 100644
--- a/net/qrtr/ns.c
+++ b/net/qrtr/ns.c
@@ -274,7 +274,7 @@ err:
return NULL;
}
-static int server_del(struct qrtr_node *node, unsigned int port)
+static int server_del(struct qrtr_node *node, unsigned int port, bool bcast)
{
struct qrtr_lookup *lookup;
struct qrtr_server *srv;
@@ -287,7 +287,7 @@ static int server_del(struct qrtr_node *node, unsigned int port)
radix_tree_delete(&node->servers, port);
/* Broadcast the removal of local servers */
- if (srv->node == qrtr_ns.local_node)
+ if (srv->node == qrtr_ns.local_node && bcast)
service_announce_del(&qrtr_ns.bcast_sq, srv);
/* Announce the service's disappearance to observers */
@@ -373,7 +373,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
}
slot = radix_tree_iter_resume(slot, &iter);
rcu_read_unlock();
- server_del(node, srv->port);
+ server_del(node, srv->port, true);
rcu_read_lock();
}
rcu_read_unlock();
@@ -459,10 +459,13 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
kfree(lookup);
}
- /* Remove the server belonging to this port */
+ /* Remove the server belonging to this port but don't broadcast
+ * DEL_SERVER. Neighbours would've already removed the server belonging
+ * to this port due to the DEL_CLIENT broadcast from qrtr_port_remove().
+ */
node = node_get(node_id);
if (node)
- server_del(node, port);
+ server_del(node, port, false);
/* Advertise the removal of this client to all local servers */
local_node = node_get(qrtr_ns.local_node);
@@ -567,7 +570,7 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from,
if (!node)
return -ENOENT;
- return server_del(node, port);
+ return server_del(node, port, true);
}
static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 95e9304024b7..8ed285023a40 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -376,8 +376,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
sctph->checksum = sctp_compute_cksum(skb,
skb_network_offset(skb) + ihl);
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum_not_inet = 0;
+ skb_reset_csum_not_inet(skb);
return 1;
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2a6b6be0811b..35785a36c802 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3235,6 +3235,9 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
err_miss_alloc:
tcf_exts_destroy(exts);
+#ifdef CONFIG_NET_CLS_ACT
+ exts->actions = NULL;
+#endif
return err;
}
EXPORT_SYMBOL(tcf_exts_init_ex);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index fdd6a6575a54..dc5a0ff50b14 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -5,6 +5,7 @@
* Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
*/
+#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -27,15 +28,19 @@ struct mqprio_sched {
u32 flags;
u64 min_rate[TC_QOPT_MAX_QUEUE];
u64 max_rate[TC_QOPT_MAX_QUEUE];
+ u32 fp[TC_QOPT_MAX_QUEUE];
};
static int mqprio_enable_offload(struct Qdisc *sch,
const struct tc_mqprio_qopt *qopt,
struct netlink_ext_ack *extack)
{
- struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
struct mqprio_sched *priv = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct tc_mqprio_qopt_offload mqprio = {
+ .qopt = *qopt,
+ .extack = extack,
+ };
int err, i;
switch (priv->mode) {
@@ -60,6 +65,8 @@ static int mqprio_enable_offload(struct Qdisc *sch,
return -EINVAL;
}
+ mqprio_fp_to_offload(priv->fp, &mqprio);
+
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
&mqprio);
if (err)
@@ -133,48 +140,131 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
/* If ndo_setup_tc is not present then hardware doesn't support offload
* and we should return an error.
*/
- if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support hardware offload");
return -EINVAL;
+ }
return 0;
}
+static const struct
+nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
+ [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32,
+ TC_QOPT_MAX_QUEUE),
+ [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32,
+ TC_FP_EXPRESS,
+ TC_FP_PREEMPTIBLE),
+};
+
static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
[TCA_MQPRIO_MODE] = { .len = sizeof(u16) },
[TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) },
[TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED },
[TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED },
+ [TCA_MQPRIO_TC_ENTRY] = { .type = NLA_NESTED },
};
-static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
- const struct nla_policy *policy, int len)
+static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct nlattr *opt,
+ unsigned long *seen_tcs,
+ struct netlink_ext_ack *extack)
{
- int nested_len = nla_len(nla) - NLA_ALIGN(len);
+ struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1];
+ int err, tc;
+
+ err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
+ mqprio_tc_entry_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) {
+ NL_SET_ERR_MSG(extack, "TC entry index missing");
+ return -EINVAL;
+ }
+
+ tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
+ if (*seen_tcs & BIT(tc)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX],
+ "Duplicate tc entry");
+ return -EINVAL;
+ }
+
+ *seen_tcs |= BIT(tc);
- if (nested_len >= nla_attr_size(0))
- return nla_parse_deprecated(tb, maxtype,
- nla_data(nla) + NLA_ALIGN(len),
- nested_len, policy, NULL);
+ if (tb[TCA_MQPRIO_TC_ENTRY_FP])
+ fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
- memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
+static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
+ int nlattr_opt_len,
+ struct netlink_ext_ack *extack)
+{
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool have_preemption = false;
+ unsigned long seen_tcs = 0;
+ u32 fp[TC_QOPT_MAX_QUEUE];
+ struct nlattr *n;
+ int tc, rem;
+ int err = 0;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ fp[tc] = priv->fp[tc];
+
+ nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
+ if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
+ continue;
+
+ err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
+ if (err)
+ goto out;
+ }
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ priv->fp[tc] = fp[tc];
+ if (fp[tc] == TC_FP_PREEMPTIBLE)
+ have_preemption = true;
+ }
+
+ if (have_preemption && !ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG(extack, "Device does not support preemption");
+ return -EOPNOTSUPP;
+ }
+out:
+ return err;
+}
+
+/* Parse the other netlink attributes that represent the payload of
+ * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
+ */
static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
- struct nlattr *opt)
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
+ struct nlattr *nlattr_opt = nla_data(opt) + NLA_ALIGN(sizeof(*qopt));
+ int nlattr_opt_len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
struct mqprio_sched *priv = qdisc_priv(sch);
- struct nlattr *tb[TCA_MQPRIO_MAX + 1];
+ struct nlattr *tb[TCA_MQPRIO_MAX + 1] = {};
struct nlattr *attr;
int i, rem, err;
- err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
- sizeof(*qopt));
- if (err < 0)
- return err;
+ if (nlattr_opt_len >= nla_attr_size(0)) {
+ err = nla_parse_deprecated(tb, TCA_MQPRIO_MAX, nlattr_opt,
+ nlattr_opt_len, mqprio_policy,
+ NULL);
+ if (err < 0)
+ return err;
+ }
- if (!qopt->hw)
+ if (!qopt->hw) {
+ NL_SET_ERR_MSG(extack,
+ "mqprio TCA_OPTIONS can only contain netlink attributes in hardware mode");
return -EINVAL;
+ }
if (tb[TCA_MQPRIO_MODE]) {
priv->flags |= TC_MQPRIO_F_MODE;
@@ -187,13 +277,19 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
}
if (tb[TCA_MQPRIO_MIN_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MIN_RATE64],
+ "min_rate accepted only when shaper is in bw_rlimit mode");
return -EINVAL;
+ }
i = 0;
nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
rem) {
- if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
+ if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Attribute type expected to be TCA_MQPRIO_MIN_RATE64");
return -EINVAL;
+ }
if (i >= qopt->num_tc)
break;
priv->min_rate[i] = nla_get_u64(attr);
@@ -203,13 +299,19 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
}
if (tb[TCA_MQPRIO_MAX_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MAX_RATE64],
+ "max_rate accepted only when shaper is in bw_rlimit mode");
return -EINVAL;
+ }
i = 0;
nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
rem) {
- if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
+ if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Attribute type expected to be TCA_MQPRIO_MAX_RATE64");
return -EINVAL;
+ }
if (i >= qopt->num_tc)
break;
priv->max_rate[i] = nla_get_u64(attr);
@@ -218,6 +320,13 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
priv->flags |= TC_MQPRIO_F_MAX_RATE;
}
+ if (tb[TCA_MQPRIO_TC_ENTRY]) {
+ err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
+ extack);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -231,7 +340,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
int i, err = -EOPNOTSUPP;
struct tc_mqprio_qopt *qopt = NULL;
struct tc_mqprio_caps caps;
- int len;
+ int len, tc;
BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
@@ -249,6 +358,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt || nla_len(opt) < sizeof(*qopt))
return -EINVAL;
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ priv->fp[tc] = TC_FP_EXPRESS;
+
qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
&caps, sizeof(caps));
@@ -258,7 +370,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
if (len > 0) {
- err = mqprio_parse_nlattr(sch, qopt, opt);
+ err = mqprio_parse_nlattr(sch, qopt, opt, extack);
if (err)
return err;
}
@@ -399,6 +511,33 @@ nla_put_failure:
return -1;
}
+static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
+ struct sk_buff *skb)
+{
+ struct nlattr *n;
+ int tc;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
+ if (!n)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, n);
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, n);
+ return -EMSGSIZE;
+}
+
static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
@@ -449,6 +588,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
(dump_rates(priv, &opt, skb) != 0))
goto nla_put_failure;
+ if (mqprio_dump_tc_entries(priv, skb))
+ goto nla_put_failure;
+
return nla_nest_end(skb, nla);
nla_put_failure:
nlmsg_trim(skb, nla);
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
index c58a533b8ec5..83b3793c4012 100644
--- a/net/sched/sch_mqprio_lib.c
+++ b/net/sched/sch_mqprio_lib.c
@@ -114,4 +114,18 @@ void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt
}
EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct tc_mqprio_qopt_offload *mqprio)
+{
+ unsigned long preemptible_tcs = 0;
+ int tc;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ if (fp[tc] == TC_FP_PREEMPTIBLE)
+ preemptible_tcs |= BIT(tc);
+
+ mqprio->preemptible_tcs = preemptible_tcs;
+}
+EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
+
MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
index 63f725ab8761..079f597072e3 100644
--- a/net/sched/sch_mqprio_lib.h
+++ b/net/sched/sch_mqprio_lib.h
@@ -14,5 +14,7 @@ int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
struct netlink_ext_ack *extack);
void mqprio_qopt_reconstruct(struct net_device *dev,
struct tc_mqprio_qopt *qopt);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct tc_mqprio_qopt_offload *mqprio);
#endif
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index cf5ebe43b3b4..02098a02943e 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -421,15 +421,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
} else
weight = 1;
- if (tb[TCA_QFQ_LMAX]) {
+ if (tb[TCA_QFQ_LMAX])
lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
- if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
- pr_notice("qfq: invalid max length %u\n", lmax);
- return -EINVAL;
- }
- } else
+ else
lmax = psched_mtu(qdisc_dev(sch));
+ if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
+ pr_notice("qfq: invalid max length %u\n", lmax);
+ return -EINVAL;
+ }
+
inv_w = ONE_FP / weight;
weight = ONE_FP / inv_w;
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 1f469861eae3..76db9a10ef50 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -7,6 +7,7 @@
*/
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -96,6 +97,7 @@ struct taprio_sched {
struct list_head taprio_list;
int cur_txq[TC_MAX_QUEUE];
u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
+ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
u32 txtime_delay;
};
@@ -1002,6 +1004,9 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
[TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 },
[TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 },
+ [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32,
+ TC_FP_EXPRESS,
+ TC_FP_PREEMPTIBLE),
};
static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
@@ -1520,22 +1525,31 @@ static int taprio_enable_offload(struct net_device *dev,
return -ENOMEM;
}
offload->enable = 1;
+ offload->extack = extack;
mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+ offload->mqprio.extack = extack;
taprio_sched_to_offload(dev, sched, offload, &caps);
+ mqprio_fp_to_offload(q->fp, &offload->mqprio);
for (tc = 0; tc < TC_MAX_QUEUE; tc++)
offload->max_sdu[tc] = q->max_sdu[tc];
err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
if (err < 0) {
- NL_SET_ERR_MSG(extack,
- "Device failed to setup taprio offload");
+ NL_SET_ERR_MSG_WEAK(extack,
+ "Device failed to setup taprio offload");
goto done;
}
q->offloaded = true;
done:
+ /* The offload structure may linger around via a reference taken by the
+ * device driver, so clear up the netlink extack pointer so that the
+ * driver isn't tempted to dereference data which stopped being valid
+ */
+ offload->extack = NULL;
+ offload->mqprio.extack = NULL;
taprio_offload_free(offload);
return err;
@@ -1663,13 +1677,14 @@ out:
static int taprio_parse_tc_entry(struct Qdisc *sch,
struct nlattr *opt,
u32 max_sdu[TC_QOPT_MAX_QUEUE],
+ u32 fp[TC_QOPT_MAX_QUEUE],
unsigned long *seen_tcs,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
struct net_device *dev = qdisc_dev(sch);
- u32 val = 0;
int err, tc;
+ u32 val;
err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
taprio_tc_policy, extack);
@@ -1694,15 +1709,18 @@ static int taprio_parse_tc_entry(struct Qdisc *sch,
*seen_tcs |= BIT(tc);
- if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
+ if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) {
val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
+ if (val > dev->max_mtu) {
+ NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
+ return -ERANGE;
+ }
- if (val > dev->max_mtu) {
- NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
- return -ERANGE;
+ max_sdu[tc] = val;
}
- max_sdu[tc] = val;
+ if (tb[TCA_TAPRIO_TC_ENTRY_FP])
+ fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]);
return 0;
}
@@ -1712,29 +1730,51 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
u32 max_sdu[TC_QOPT_MAX_QUEUE];
+ bool have_preemption = false;
unsigned long seen_tcs = 0;
+ u32 fp[TC_QOPT_MAX_QUEUE];
struct nlattr *n;
int tc, rem;
int err = 0;
- for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
max_sdu[tc] = q->max_sdu[tc];
+ fp[tc] = q->fp[tc];
+ }
nla_for_each_nested(n, opt, rem) {
if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
continue;
- err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+ err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
extack);
if (err)
- goto out;
+ return err;
}
- for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
q->max_sdu[tc] = max_sdu[tc];
+ q->fp[tc] = fp[tc];
+ if (fp[tc] != TC_FP_EXPRESS)
+ have_preemption = true;
+ }
+
+ if (have_preemption) {
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ NL_SET_ERR_MSG(extack,
+ "Preemption only supported with full offload");
+ return -EOPNOTSUPP;
+ }
+
+ if (!ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support preemption");
+ return -EOPNOTSUPP;
+ }
+ }
-out:
return err;
}
@@ -2015,7 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- int i;
+ int i, tc;
spin_lock_init(&q->current_entry_lock);
@@ -2072,6 +2112,9 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
q->qdiscs[i] = qdisc;
}
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ q->fp[tc] = TC_FP_EXPRESS;
+
taprio_detect_broken_mqprio(q);
return taprio_change(sch, opt, extack);
@@ -2215,6 +2258,7 @@ error_nest:
}
static int taprio_dump_tc_entries(struct sk_buff *skb,
+ struct taprio_sched *q,
struct sched_gate_list *sched)
{
struct nlattr *n;
@@ -2232,6 +2276,9 @@ static int taprio_dump_tc_entries(struct sk_buff *skb,
sched->max_sdu[tc]))
goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc]))
+ goto nla_put_failure;
+
nla_nest_end(skb, n);
}
@@ -2273,7 +2320,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
goto options_error;
- if (oper && taprio_dump_tc_entries(skb, oper))
+ if (oper && taprio_dump_tc_entries(skb, q, oper))
goto options_error;
if (oper && dump_schedule(skb, oper))
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 63ba5551c13f..796529167e8d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1597,9 +1597,10 @@ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
struct sctp_cookie *cookie,
gfp_t gfp)
{
- int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
+ struct sctp_init_chunk *peer_init = (struct sctp_init_chunk *)(cookie + 1);
+ int var_size2 = ntohs(peer_init->chunk_hdr.length);
int var_size3 = cookie->raw_addr_list_len;
- __u8 *raw = (__u8 *)cookie->peer_init + var_size2;
+ __u8 *raw = (__u8 *)peer_init + var_size2;
return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3,
asoc->ep->base.bind_addr.port, gfp);
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 34964145514e..c58fffc86a0c 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -738,7 +738,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
tfm = asoc->ep->auth_hmacs[hmac_id];
- digest = auth->auth_hdr.hmac;
+ digest = (u8 *)(&auth->auth_hdr + 1);
if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
goto free;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 127bf28a6033..2613c4d74b16 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -1150,7 +1150,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
init = (struct sctp_init_chunk *)skb->data;
/* Walk the parameters looking for embedded addresses. */
- sctp_walk_params(params, init, init_hdr.params) {
+ sctp_walk_params(params, init) {
/* Note: Ignoring hostname addresses. */
af = sctp_get_af_specific(param_type2af(params.p->type));
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 20831079fb09..0dc6b8ab9963 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1231,7 +1231,7 @@ static void sctp_sack_update_unack_data(struct sctp_association *assoc,
unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1;
- frags = sack->variable;
+ frags = (union sctp_sack_variable *)(sack + 1);
for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) {
unack_data -= ((ntohs(frags[i].gab.end) -
ntohs(frags[i].gab.start) + 1));
@@ -1252,7 +1252,6 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
struct sctp_transport *transport;
struct sctp_chunk *tchunk = NULL;
struct list_head *lchunk, *transport_list, *temp;
- union sctp_sack_variable *frags = sack->variable;
__u32 sack_ctsn, ctsn, tsn;
__u32 highest_tsn, highest_new_tsn;
__u32 sack_a_rwnd;
@@ -1313,8 +1312,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
/* Get the highest TSN in the sack. */
highest_tsn = sack_ctsn;
- if (gap_ack_blocks)
+ if (gap_ack_blocks) {
+ union sctp_sack_variable *frags =
+ (union sctp_sack_variable *)(sack + 1);
+
highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
+ }
if (TSN_lt(asoc->highest_sacked, highest_tsn))
asoc->highest_sacked = highest_tsn;
@@ -1789,7 +1792,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
* Block are assumed to have been received correctly.
*/
- frags = sack->variable;
+ frags = (union sctp_sack_variable *)(sack + 1);
blocks = ntohs(sack->num_gap_ack_blocks);
tsn_offset = tsn - ctsn;
for (i = 0; i < blocks; ++i) {
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c7503fd64915..08527d882e56 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1707,11 +1707,11 @@ static struct sctp_cookie_param *sctp_pack_cookie(
ktime_get_real());
/* Copy the peer's init packet. */
- memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
+ memcpy(cookie + 1, init_chunk->chunk_hdr,
ntohs(init_chunk->chunk_hdr->length));
/* Copy the raw local address list of the association. */
- memcpy((__u8 *)&cookie->c.peer_init[0] +
+ memcpy((__u8 *)(cookie + 1) +
ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
if (sctp_sk(ep->base.sk)->hmac) {
@@ -2207,7 +2207,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
break;
case SCTP_PARAM_HOST_NAME_ADDRESS:
- /* Tell the peer, we won't support this param. */
+ /* This param has been Deprecated, send ABORT. */
sctp_process_hn_param(asoc, param, chunk, err_chunk);
retval = SCTP_IERROR_ABORT;
break;
@@ -2306,7 +2306,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
return sctp_process_inv_mandatory(asoc, chunk, errp);
- sctp_walk_params(param, peer_init, init_hdr.params) {
+ sctp_walk_params(param, peer_init) {
if (param.p->type == SCTP_PARAM_STATE_COOKIE)
has_cookie = true;
}
@@ -2329,7 +2329,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
chunk, errp);
/* Verify all the variable length parameters */
- sctp_walk_params(param, peer_init, init_hdr.params) {
+ sctp_walk_params(param, peer_init) {
result = sctp_verify_param(net, ep, asoc, param, cid,
chunk, errp);
switch (result) {
@@ -2381,7 +2381,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
src_match = 1;
/* Process the initialization parameters. */
- sctp_walk_params(param, peer_init, init_hdr.params) {
+ sctp_walk_params(param, peer_init) {
if (!src_match &&
(param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
@@ -2589,10 +2589,6 @@ do_addr_param:
asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale);
break;
- case SCTP_PARAM_HOST_NAME_ADDRESS:
- pr_debug("%s: unimplemented SCTP_HOST_NAME_ADDRESS\n", __func__);
- break;
-
case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
/* Turn off the default values first so we'll know which
* ones are really set by the peer.
@@ -2624,10 +2620,6 @@ do_addr_param:
asoc->peer.ipv6_address = 1;
break;
- case SCTP_PARAM_HOST_NAME_ADDRESS:
- asoc->peer.hostname_address = 1;
- break;
-
default: /* Just ignore anything else. */
break;
}
@@ -3210,7 +3202,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
union sctp_params param;
addip = (struct sctp_addip_chunk *)chunk->chunk_hdr;
- sctp_walk_params(param, addip, addip_hdr.params) {
+ sctp_walk_params(param, addip) {
size_t length = ntohs(param.p->length);
*errp = param.p;
@@ -3223,14 +3215,14 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
/* ensure there is only one addr param and it's in the
* beginning of addip_hdr params, or we reject it.
*/
- if (param.v != addip->addip_hdr.params)
+ if (param.v != (addip + 1))
return false;
addr_param_seen = true;
break;
case SCTP_PARAM_IPV6_ADDRESS:
if (length != sizeof(struct sctp_ipv6addr_param))
return false;
- if (param.v != addip->addip_hdr.params)
+ if (param.v != (addip + 1))
return false;
addr_param_seen = true;
break;
@@ -3310,7 +3302,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
goto done;
/* Process the TLVs contained within the ASCONF chunk. */
- sctp_walk_params(param, addip, addip_hdr.params) {
+ sctp_walk_params(param, addip) {
/* Skip preceeding address parameters. */
if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
param.p->type == SCTP_PARAM_IPV6_ADDRESS)
@@ -3644,7 +3636,7 @@ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc,
return NULL;
reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
- retval->param_hdr.v = reconf->params;
+ retval->param_hdr.v = (u8 *)(reconf + 1);
return retval;
}
@@ -3886,7 +3878,7 @@ bool sctp_verify_reconf(const struct sctp_association *asoc,
__u16 cnt = 0;
hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
- sctp_walk_params(param, hdr, params) {
+ sctp_walk_params(param, hdr) {
__u16 length = ntohs(param.p->length);
*errp = param.p;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 463c4a58d2c3..7fbeb99d8d32 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -984,8 +984,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
{
struct sctp_chunkhdr *unk_chunk_hdr;
- unk_chunk_hdr = (struct sctp_chunkhdr *)
- err_hdr->variable;
+ unk_chunk_hdr = (struct sctp_chunkhdr *)(err_hdr + 1);
switch (unk_chunk_hdr->type) {
/* ADDIP 4.1 A9) If the peer responds to an ASCONF with
* an ERROR chunk reporting that it did not recognized
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index ce5426171206..97f1155a2045 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -794,8 +794,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
/* This is a brand-new association, so these are not yet side
* effects--it is safe to run them here.
*/
- peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
-
+ peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
if (!sctp_process_init(new_asoc, chunk,
&chunk->subh.cookie_hdr->c.peer_addr,
peer_init, GFP_ATOMIC))
@@ -1337,7 +1336,7 @@ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa,
* throughout the code today.
*/
errhdr = (struct sctp_errhdr *)buffer;
- addrparm = (union sctp_addr_param *)errhdr->variable;
+ addrparm = (union sctp_addr_param *)(errhdr + 1);
/* Copy into a parm format. */
len = af->to_addr_param(ssa, addrparm);
@@ -1869,8 +1868,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
/* new_asoc is a brand-new association, so these are not yet
* side effects--it is safe to run them here.
*/
- peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
-
+ peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
GFP_ATOMIC))
goto nomem;
@@ -1990,7 +1988,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
/* new_asoc is a brand-new association, so these are not yet
* side effects--it is safe to run them here.
*/
- peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+ peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
GFP_ATOMIC))
goto nomem;
@@ -4142,7 +4140,7 @@ enum sctp_disposition sctp_sf_do_reconf(struct net *net,
(void *)err_param, commands);
hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
- sctp_walk_params(param, hdr, params) {
+ sctp_walk_params(param, hdr) {
struct sctp_chunk *reply = NULL;
struct sctp_ulpevent *ev = NULL;
@@ -4393,7 +4391,7 @@ static enum sctp_ierror sctp_sf_authenticate(
* 3. Compute the new digest
* 4. Compare saved and new digests.
*/
- digest = auth_hdr->hmac;
+ digest = (u8 *)(auth_hdr + 1);
skb_pull(chunk->skb, sig_len);
save_digest = kmemdup(digest, sig_len, GFP_ATOMIC);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b91616f819de..cda8c2874691 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1830,6 +1830,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
if (err)
goto err;
+ if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) {
+ err = -EINVAL;
+ goto err;
+ }
}
if (sctp_state(asoc, CLOSED)) {
@@ -5188,10 +5192,11 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
info->sctpi_peer_rwnd = asoc->peer.rwnd;
info->sctpi_peer_tag = asoc->c.peer_vtag;
- mask = asoc->peer.ecn_capable << 1;
+ mask = asoc->peer.intl_capable << 1;
+ mask = (mask | asoc->peer.ecn_capable) << 1;
mask = (mask | asoc->peer.ipv4_address) << 1;
mask = (mask | asoc->peer.ipv6_address) << 1;
- mask = (mask | asoc->peer.hostname_address) << 1;
+ mask = (mask | asoc->peer.reconf_capable) << 1;
mask = (mask | asoc->peer.asconf_capable) << 1;
mask = (mask | asoc->peer.prsctp_capable) << 1;
mask = (mask | asoc->peer.auth_capable);
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ee6514af830f..c241cc552e8d 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -491,7 +491,7 @@ static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param(
return NULL;
hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
- sctp_walk_params(param, hdr, params) {
+ sctp_walk_params(param, hdr) {
/* sctp_strreset_tsnreq is actually the basic structure
* of all stream reconf params, so it's safe to use it
* to access request_seq.
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 94727feb07b3..840f24045ae2 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1153,8 +1153,9 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
}
#define _sctp_walk_ifwdtsn(pos, chunk, end) \
- for (pos = chunk->subh.ifwdtsn_hdr->skip; \
- (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++)
+ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \
+ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \
+ sizeof(struct sctp_ifwdtsn_skip); pos++)
#define sctp_walk_ifwdtsn(pos, ch) \
_sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index c6b4a62276f6..50c38b624f77 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -3270,6 +3270,17 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol,
sk_common_release(sk);
goto out;
}
+
+ /* smc_clcsock_release() does not wait smc->clcsock->sk's
+ * destruction; its sk_state might not be TCP_CLOSE after
+ * smc->sk is close()d, and TCP timers can be fired later,
+ * which need net ref.
+ */
+ sk = smc->clcsock->sk;
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
} else {
smc->clcsock = clcsock;
}
diff --git a/net/socket.c b/net/socket.c
index 73e493da4589..a7b4b37d86df 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -957,6 +957,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
+#ifdef CONFIG_WIRELESS
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb)
{
@@ -972,6 +973,7 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
+#endif
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb)
diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c
index ce0541e32fc9..95ca783795c5 100644
--- a/net/sunrpc/auth_gss/gss_krb5_test.c
+++ b/net/sunrpc/auth_gss/gss_krb5_test.c
@@ -73,7 +73,6 @@ static void checksum_case(struct kunit *test)
{
const struct gss_krb5_test_param *param = test->param_value;
struct xdr_buf buf = {
- .head[0].iov_base = param->plaintext->data,
.head[0].iov_len = param->plaintext->len,
.len = param->plaintext->len,
};
@@ -99,6 +98,10 @@ static void checksum_case(struct kunit *test)
err = crypto_ahash_setkey(tfm, Kc.data, Kc.len);
KUNIT_ASSERT_EQ(test, err, 0);
+ buf.head[0].iov_base = kunit_kzalloc(test, buf.head[0].iov_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf.head[0].iov_base);
+ memcpy(buf.head[0].iov_base, param->plaintext->data, buf.head[0].iov_len);
+
checksum.len = gk5e->cksumlength;
checksum.data = kunit_kzalloc(test, checksum.len, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, checksum.data);
@@ -1327,6 +1330,7 @@ static void rfc6803_encrypt_case(struct kunit *test)
if (!gk5e)
kunit_skip(test, "Encryption type is not available");
+ memset(usage_data, 0, sizeof(usage_data));
usage.data[3] = param->constant;
Ke.len = gk5e->Ke_length;
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 983c5891cb56..4246363cb095 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -416,14 +416,23 @@ static int unix_gid_hash(kuid_t uid)
return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS);
}
-static void unix_gid_put(struct kref *kref)
+static void unix_gid_free(struct rcu_head *rcu)
{
- struct cache_head *item = container_of(kref, struct cache_head, ref);
- struct unix_gid *ug = container_of(item, struct unix_gid, h);
+ struct unix_gid *ug = container_of(rcu, struct unix_gid, rcu);
+ struct cache_head *item = &ug->h;
+
if (test_bit(CACHE_VALID, &item->flags) &&
!test_bit(CACHE_NEGATIVE, &item->flags))
put_group_info(ug->gi);
- kfree_rcu(ug, rcu);
+ kfree(ug);
+}
+
+static void unix_gid_put(struct kref *kref)
+{
+ struct cache_head *item = container_of(kref, struct cache_head, ref);
+ struct unix_gid *ug = container_of(item, struct unix_gid, h);
+
+ call_rcu(&ug->rcu, unix_gid_free);
}
static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index adcbedc244d6..6cacd70a15ff 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2158,6 +2158,7 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
switch (skst) {
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
+ case TCP_LAST_ACK:
break;
case TCP_ESTABLISHED:
case TCP_CLOSE_WAIT:
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index dde3c870bddd..e4878551f140 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -94,6 +94,11 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
info->op,
info->flags);
+ if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
+ WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
+ goto out;
+ }
+
return skb;
out:
@@ -1336,6 +1341,11 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
goto free_pkt;
}
+ if (!skb_set_owner_sk_safe(skb, sk)) {
+ WARN_ONCE(1, "receiving vsock socket has sk_refcnt == 0\n");
+ goto free_pkt;
+ }
+
vsk = vsock_sk(sk);
lock_sock(sk);
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index a5375c97f5b0..b370070194fa 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1849,7 +1849,13 @@ static ssize_t vmci_transport_stream_enqueue(
struct msghdr *msg,
size_t len)
{
- return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0);
+ ssize_t err;
+
+ err = vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0);
+ if (err < 0)
+ err = -ENOMEM;
+
+ return err;
}
static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk)
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index e3afc0c866f5..5c6360df1f31 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -31,8 +31,7 @@ static int vsock_loopback_send_pkt(struct sk_buff *skb)
struct vsock_loopback *vsock = &the_vsock_loopback;
int len = skb->len;
- skb_queue_tail(&vsock->pkt_queue, skb);
-
+ virtio_vsock_skb_queue_tail(&vsock->pkt_queue, skb);
queue_work(vsock->workqueue, &vsock->pkt_work);
return len;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2ac58b282b5e..cc1e7f15fa73 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1301,9 +1301,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
unsigned long size = vma->vm_end - vma->vm_start;
struct xdp_sock *xs = xdp_sk(sock->sk);
+ int state = READ_ONCE(xs->state);
struct xsk_queue *q = NULL;
- if (READ_ONCE(xs->state) != XSK_READY)
+ if (state != XSK_READY && state != XSK_BOUND)
return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) {
@@ -1314,9 +1315,11 @@ static int xsk_mmap(struct file *file, struct socket *sock,
/* Matches the smp_wmb() in XDP_UMEM_REG */
smp_rmb();
if (offset == XDP_UMEM_PGOFF_FILL_RING)
- q = READ_ONCE(xs->fq_tmp);
+ q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
+ READ_ONCE(xs->pool->fq);
else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
- q = READ_ONCE(xs->cq_tmp);
+ q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
+ READ_ONCE(xs->pool->cq);
}
if (!q)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bfb2a7e50c26..6d40a77fccbe 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -133,16 +133,12 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 chunk, chunk_end;
+ u64 offset = desc->addr & (pool->chunk_size - 1);
- chunk = xp_aligned_extract_addr(pool, desc->addr);
- if (likely(desc->len)) {
- chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len - 1);
- if (chunk != chunk_end)
- return false;
- }
+ if (offset + desc->len > pool->chunk_size)
+ return false;
- if (chunk >= pool->addrs_cnt)
+ if (desc->addr >= pool->addrs_cnt)
return false;
if (desc->options)
@@ -153,15 +149,12 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 addr, base_addr;
-
- base_addr = xp_unaligned_extract_addr(desc->addr);
- addr = xp_unaligned_add_offset_to_addr(desc->addr);
+ u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
if (desc->len > pool->chunk_size)
return false;
- if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
+ if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
return false;
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 0c38d7175922..2c1427074a3b 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -162,8 +162,8 @@ static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
return ERR_PTR(-EOPNOTSUPP);
}
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock __rcu **map_entry;
@@ -223,7 +223,7 @@ out:
return err;
}
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+static long xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock __rcu **map_entry;
@@ -243,7 +243,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
return __bpf_xdp_redirect_map(map, index, flags, 0,
__xsk_map_lookup_elem);
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 95f1436bf6a2..bef28c6187eb 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -287,7 +287,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
return (is_packet_offload) ? -EINVAL : 0;
}
- if (x->props.flags & XFRM_STATE_ESN &&
+ if (!is_packet_offload && x->props.flags & XFRM_STATE_ESN &&
!dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN");
xso->dev = NULL;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 436d29640ac2..39fb91ff23d9 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -231,9 +231,6 @@ static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb)
{
int err = -EINVAL;
- if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
- goto out;
-
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto out;
@@ -269,8 +266,6 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb)
{
int err = -EINVAL;
- if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6)
- goto out;
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto out;
@@ -331,22 +326,26 @@ out:
*/
static int
xfrm_inner_mode_encap_remove(struct xfrm_state *x,
- const struct xfrm_mode *inner_mode,
struct sk_buff *skb)
{
- switch (inner_mode->encap) {
+ switch (x->props.mode) {
case XFRM_MODE_BEET:
- if (inner_mode->family == AF_INET)
+ switch (XFRM_MODE_SKB_CB(skb)->protocol) {
+ case IPPROTO_IPIP:
+ case IPPROTO_BEETPH:
return xfrm4_remove_beet_encap(x, skb);
- if (inner_mode->family == AF_INET6)
+ case IPPROTO_IPV6:
return xfrm6_remove_beet_encap(x, skb);
+ }
break;
case XFRM_MODE_TUNNEL:
- if (inner_mode->family == AF_INET)
+ switch (XFRM_MODE_SKB_CB(skb)->protocol) {
+ case IPPROTO_IPIP:
return xfrm4_remove_tunnel_encap(x, skb);
- if (inner_mode->family == AF_INET6)
+ case IPPROTO_IPV6:
return xfrm6_remove_tunnel_encap(x, skb);
break;
+ }
}
WARN_ON_ONCE(1);
@@ -355,9 +354,7 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x,
static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
{
- const struct xfrm_mode *inner_mode = &x->inner_mode;
-
- switch (x->outer_mode.family) {
+ switch (x->props.family) {
case AF_INET:
xfrm4_extract_header(skb);
break;
@@ -369,17 +366,12 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
return -EAFNOSUPPORT;
}
- if (x->sel.family == AF_UNSPEC) {
- inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
- if (!inner_mode)
- return -EAFNOSUPPORT;
- }
-
- switch (inner_mode->family) {
- case AF_INET:
+ switch (XFRM_MODE_SKB_CB(skb)->protocol) {
+ case IPPROTO_IPIP:
+ case IPPROTO_BEETPH:
skb->protocol = htons(ETH_P_IP);
break;
- case AF_INET6:
+ case IPPROTO_IPV6:
skb->protocol = htons(ETH_P_IPV6);
break;
default:
@@ -387,7 +379,7 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
break;
}
- return xfrm_inner_mode_encap_remove(x, inner_mode, skb);
+ return xfrm_inner_mode_encap_remove(x, skb);
}
/* Remove encapsulation header.
@@ -433,17 +425,16 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
}
static int xfrm_inner_mode_input(struct xfrm_state *x,
- const struct xfrm_mode *inner_mode,
struct sk_buff *skb)
{
- switch (inner_mode->encap) {
+ switch (x->props.mode) {
case XFRM_MODE_BEET:
case XFRM_MODE_TUNNEL:
return xfrm_prepare_input(x, skb);
case XFRM_MODE_TRANSPORT:
- if (inner_mode->family == AF_INET)
+ if (x->props.family == AF_INET)
return xfrm4_transport_input(x, skb);
- if (inner_mode->family == AF_INET6)
+ if (x->props.family == AF_INET6)
return xfrm6_transport_input(x, skb);
break;
case XFRM_MODE_ROUTEOPTIMIZATION:
@@ -461,7 +452,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
const struct xfrm_state_afinfo *afinfo;
struct net *net = dev_net(skb->dev);
- const struct xfrm_mode *inner_mode;
int err;
__be32 seq;
__be32 seq_hi;
@@ -491,7 +481,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
goto drop;
}
- family = x->outer_mode.family;
+ family = x->props.family;
/* An encap_type of -1 indicates async resumption. */
if (encap_type == -1) {
@@ -676,17 +666,7 @@ resume:
XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
- inner_mode = &x->inner_mode;
-
- if (x->sel.family == AF_UNSPEC) {
- inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
- if (inner_mode == NULL) {
- XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
- goto drop;
- }
- }
-
- if (xfrm_inner_mode_input(x, inner_mode, skb)) {
+ if (xfrm_inner_mode_input(x, skb)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
goto drop;
}
@@ -701,7 +681,7 @@ resume:
* transport mode so the outer address is identical.
*/
daddr = &x->id.daddr;
- family = x->outer_mode.family;
+ family = x->props.family;
err = xfrm_parse_spi(skb, nexthdr, &spi, &seq);
if (err < 0) {
@@ -732,7 +712,7 @@ resume:
err = -EAFNOSUPPORT;
rcu_read_lock();
- afinfo = xfrm_state_afinfo_get_rcu(x->inner_mode.family);
+ afinfo = xfrm_state_afinfo_get_rcu(x->props.family);
if (likely(afinfo))
err = afinfo->transport_finish(skb, xfrm_gro || async);
rcu_read_unlock();
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index ff114d68cc43..369e5de8558f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -412,7 +412,7 @@ static int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
skb->protocol = htons(ETH_P_IP);
- switch (x->outer_mode.encap) {
+ switch (x->props.mode) {
case XFRM_MODE_BEET:
return xfrm4_beet_encap_add(x, skb);
case XFRM_MODE_TUNNEL:
@@ -435,7 +435,7 @@ static int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
skb->ignore_df = 1;
skb->protocol = htons(ETH_P_IPV6);
- switch (x->outer_mode.encap) {
+ switch (x->props.mode) {
case XFRM_MODE_BEET:
return xfrm6_beet_encap_add(x, skb);
case XFRM_MODE_TUNNEL:
@@ -451,22 +451,22 @@ static int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb)
{
- switch (x->outer_mode.encap) {
+ switch (x->props.mode) {
case XFRM_MODE_BEET:
case XFRM_MODE_TUNNEL:
- if (x->outer_mode.family == AF_INET)
+ if (x->props.family == AF_INET)
return xfrm4_prepare_output(x, skb);
- if (x->outer_mode.family == AF_INET6)
+ if (x->props.family == AF_INET6)
return xfrm6_prepare_output(x, skb);
break;
case XFRM_MODE_TRANSPORT:
- if (x->outer_mode.family == AF_INET)
+ if (x->props.family == AF_INET)
return xfrm4_transport_output(x, skb);
- if (x->outer_mode.family == AF_INET6)
+ if (x->props.family == AF_INET6)
return xfrm6_transport_output(x, skb);
break;
case XFRM_MODE_ROUTEOPTIMIZATION:
- if (x->outer_mode.family == AF_INET6)
+ if (x->props.family == AF_INET6)
return xfrm6_ro_output(x, skb);
WARN_ON_ONCE(1);
break;
@@ -875,21 +875,10 @@ static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb)
{
- const struct xfrm_mode *inner_mode;
-
- if (x->sel.family == AF_UNSPEC)
- inner_mode = xfrm_ip2inner_mode(x,
- xfrm_af2proto(skb_dst(skb)->ops->family));
- else
- inner_mode = &x->inner_mode;
-
- if (inner_mode == NULL)
- return -EAFNOSUPPORT;
-
- switch (inner_mode->family) {
- case AF_INET:
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
return xfrm4_extract_output(x, skb);
- case AF_INET6:
+ case htons(ETH_P_IPV6):
return xfrm6_extract_output(x, skb);
}