Diffstat (limited to 'net')
-rw-r--r--  net/appletalk/aarp.c | 2
-rw-r--r--  net/ax25/af_ax25.c | 2
-rw-r--r--  net/bluetooth/l2cap_sock.c | 2
-rw-r--r--  net/can/j1939/socket.c | 2
-rw-r--r--  net/can/raw.c | 2
-rw-r--r--  net/ceph/mon_client.c | 2
-rw-r--r--  net/core/dev.c | 22
-rw-r--r--  net/core/dst.c | 10
-rw-r--r--  net/core/netdev-genl.c | 12
-rw-r--r--  net/core/pktgen.c | 102
-rw-r--r--  net/core/rtnetlink.c | 36
-rw-r--r--  net/core/skbuff.c | 19
-rw-r--r--  net/core/sock.c | 178
-rw-r--r--  net/core/xdp.c | 4
-rw-r--r--  net/dccp/ipv4.c | 2
-rw-r--r--  net/dccp/ipv6.c | 10
-rw-r--r--  net/devlink/core.c | 217
-rw-r--r--  net/devlink/dev.c | 50
-rw-r--r--  net/devlink/devl_internal.h | 34
-rw-r--r--  net/devlink/linecard.c | 80
-rw-r--r--  net/devlink/netlink.c | 26
-rw-r--r--  net/devlink/port.c | 55
-rw-r--r--  net/dsa/port.c | 5
-rw-r--r--  net/dsa/port.h | 3
-rw-r--r--  net/dsa/slave.c | 9
-rw-r--r--  net/dsa/tag_ksz.c | 8
-rw-r--r--  net/handshake/genl.c | 2
-rw-r--r--  net/handshake/netlink.c | 2
-rw-r--r--  net/handshake/tlshd.c | 6
-rw-r--r--  net/ipv4/datagram.c | 6
-rw-r--r--  net/ipv4/igmp.c | 2
-rw-r--r--  net/ipv4/inet_diag.c | 4
-rw-r--r--  net/ipv4/ip_output.c | 15
-rw-r--r--  net/ipv4/ip_sockglue.c | 192
-rw-r--r--  net/ipv4/ping.c | 13
-rw-r--r--  net/ipv4/raw.c | 19
-rw-r--r--  net/ipv4/route.c | 6
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 9
-rw-r--r--  net/ipv4/tcp.c | 12
-rw-r--r--  net/ipv4/tcp_bbr.c | 13
-rw-r--r--  net/ipv4/tcp_input.c | 36
-rw-r--r--  net/ipv4/tcp_ipv4.c | 12
-rw-r--r--  net/ipv4/tcp_metrics.c | 22
-rw-r--r--  net/ipv4/tcp_minisocks.c | 6
-rw-r--r--  net/ipv4/tcp_output.c | 42
-rw-r--r--  net/ipv4/tcp_timer.c | 19
-rw-r--r--  net/ipv4/udp.c | 92
-rw-r--r--  net/ipv4/udp_offload.c | 4
-rw-r--r--  net/ipv4/udp_tunnel_core.c | 2
-rw-r--r--  net/ipv4/udp_tunnel_nic.c | 11
-rw-r--r--  net/ipv4/udplite.c | 1
-rw-r--r--  net/ipv4/xfrm4_input.c | 4
-rw-r--r--  net/ipv6/addrconf.c | 38
-rw-r--r--  net/ipv6/af_inet6.c | 9
-rw-r--r--  net/ipv6/datagram.c | 15
-rw-r--r--  net/ipv6/icmp.c | 4
-rw-r--r--  net/ipv6/inet6_connection_sock.c | 2
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 8
-rw-r--r--  net/ipv6/ip6_output.c | 46
-rw-r--r--  net/ipv6/ip6_udp_tunnel.c | 5
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 242
-rw-r--r--  net/ipv6/mcast.c | 6
-rw-r--r--  net/ipv6/ndisc.c | 4
-rw-r--r--  net/ipv6/ping.c | 4
-rw-r--r--  net/ipv6/raw.c | 16
-rw-r--r--  net/ipv6/route.c | 6
-rw-r--r--  net/ipv6/tcp_ipv6.c | 25
-rw-r--r--  net/ipv6/udp.c | 46
-rw-r--r--  net/ipv6/udplite.c | 1
-rw-r--r--  net/ipv6/xfrm6_input.c | 4
-rw-r--r--  net/l2tp/l2tp_core.c | 6
-rw-r--r--  net/l2tp/l2tp_eth.c | 34
-rw-r--r--  net/l2tp/l2tp_ip6.c | 6
-rw-r--r--  net/mac80211/tdls.c | 2
-rw-r--r--  net/mptcp/sockopt.c | 10
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c | 16
-rw-r--r--  net/netfilter/nf_nat_proto.c | 64
-rw-r--r--  net/netfilter/nf_tables_api.c | 43
-rw-r--r--  net/netrom/af_netrom.c | 2
-rw-r--r--  net/openvswitch/actions.c | 27
-rw-r--r--  net/openvswitch/meter.h | 4
-rw-r--r--  net/rose/af_rose.c | 2
-rw-r--r--  net/sched/cls_route.c | 37
-rw-r--r--  net/sched/em_meta.c | 2
-rw-r--r--  net/sched/sch_fq.c | 385
-rw-r--r--  net/sched/sch_frag.c | 4
-rw-r--r--  net/sched/sch_generic.c | 9
-rw-r--r--  net/sctp/ipv6.c | 9
-rw-r--r--  net/sctp/protocol.c | 4
-rw-r--r--  net/sctp/sm_make_chunk.c | 2
-rw-r--r--  net/smc/af_smc.c | 2
-rw-r--r--  net/tipc/link.c | 4
-rw-r--r--  net/tls/tls_sw.c | 2
-rw-r--r--  net/vmw_vsock/af_vsock.c | 3
-rw-r--r--  net/vmw_vsock/virtio_transport.c | 92
-rw-r--r--  net/vmw_vsock/virtio_transport_common.c | 307
-rw-r--r--  net/x25/af_x25.c | 2
-rw-r--r--  net/xdp/xsk.c | 4
-rw-r--r--  net/xdp/xsk_buff_pool.c | 3
-rw-r--r--  net/xfrm/xfrm_policy.c | 2
100 files changed, 2059 insertions, 957 deletions
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index c7236daa2415..9fa0b246902b 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -664,7 +664,7 @@ out_unlock:
sendit:
if (skb->sk)
- skb->priority = skb->sk->sk_priority;
+ skb->priority = READ_ONCE(skb->sk->sk_priority);
if (dev_queue_xmit(skb))
goto drop;
sent:
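
The sk_priority conversions in this series follow the usual lockless-field pattern: once the writer (sock_set_priority(), changed further down in net/core/sock.c) uses WRITE_ONCE(), every reader that runs without the socket lock needs READ_ONCE(). A minimal sketch of the pairing, not taken from the patch itself:

#include <net/sock.h>

/* Illustrative only: sk_priority may now change without the socket lock
 * held, so lockless readers annotate the load to avoid a data race.
 */
static u32 example_read_priority(const struct sock *sk)
{
	return READ_ONCE(sk->sk_priority); /* pairs with WRITE_ONCE() in the setter */
}
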
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 5db805d5f74d..558e158c98d0 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -939,7 +939,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sock_init_data(NULL, sk);
sk->sk_type = osk->sk_type;
- sk->sk_priority = osk->sk_priority;
+ sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 3bdfc3f1e73d..e50d3d102078 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1615,7 +1615,7 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
return ERR_PTR(-ENOTCONN);
}
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
bt_cb(skb)->l2cap.chan = chan;
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index b28c976f52a0..14c431663233 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -884,7 +884,7 @@ static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
skcb = j1939_skb_to_cb(skb);
memset(skcb, 0, sizeof(*skcb));
skcb->addr = jsk->addr;
- skcb->priority = j1939_prio(sk->sk_priority);
+ skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority));
if (msg->msg_name) {
struct sockaddr_can *addr = msg->msg_name;
diff --git a/net/can/raw.c b/net/can/raw.c
index d50c3f3d892f..73468d2ebd51 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -881,7 +881,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
skb->dev = dev;
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
skb->mark = READ_ONCE(sk->sk_mark);
skb->tstamp = sockc.transmit_time;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index faabad6603db..f263f7e91a21 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1136,6 +1136,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
GFP_KERNEL);
if (!monc->monmap)
return -ENOMEM;
+ monc->monmap->num_mon = num_mon;
for (i = 0; i < num_mon; i++) {
struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
@@ -1147,7 +1148,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
inst->name.type = CEPH_ENTITY_TYPE_MON;
inst->name.num = cpu_to_le64(i);
}
- monc->monmap->num_mon = num_mon;
return 0;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..606a366cc209 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9023,6 +9023,28 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
+static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+ rtnl_lock();
+ dev->dpll_pin = dpll_pin;
+ rtnl_unlock();
+#endif
+}
+
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+ WARN_ON(!dpll_pin);
+ netdev_dpll_pin_assign(dev, dpll_pin);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_set);
+
+void netdev_dpll_pin_clear(struct net_device *dev)
+{
+ netdev_dpll_pin_assign(dev, NULL);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_clear);
+
/**
* dev_change_proto_down - set carrier according to proto_down.
*
diff --git a/net/core/dst.c b/net/core/dst.c
index 980e2fd2f013..6838d3212c37 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,7 +45,7 @@ const struct dst_metrics dst_default_metrics = {
EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- struct net_device *dev, int initial_ref, int initial_obsolete,
+ struct net_device *dev, int initial_obsolete,
unsigned short flags)
{
dst->dev = dev;
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->tclassid = 0;
#endif
dst->lwtstate = NULL;
- rcuref_init(&dst->__rcuref, initial_ref);
+ rcuref_init(&dst->__rcuref, 1);
INIT_LIST_HEAD(&dst->rt_uncached);
dst->__use = 0;
dst->lastuse = jiffies;
@@ -77,7 +77,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
EXPORT_SYMBOL(dst_init);
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+ int initial_obsolete, unsigned short flags)
{
struct dst_entry *dst;
@@ -90,7 +90,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
if (!dst)
return NULL;
- dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+ dst_init(dst, ops, dev, initial_obsolete, flags);
return dst;
}
@@ -270,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst,
struct dst_entry *dst;
dst = &md_dst->dst;
- dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
+ dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
DST_METADATA | DST_NOCOUNT);
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
md_dst->type = type;
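
dst_alloc() and dst_init() lose their initial_ref argument because every caller passed 1; the refcount is now initialised to 1 unconditionally inside dst_init(). A hypothetical caller updated for the new signature (illustrative, not part of this patch):

#include <net/dst.h>

/* Before: dst_alloc(ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags);
 * After:  the initial reference of 1 is implied.
 */
static void *example_alloc_dst(struct dst_ops *ops, struct net_device *dev,
			       unsigned short flags)
{
	return dst_alloc(ops, dev, DST_OBSOLETE_FORCE_CHK, flags);
}
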
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index c1aea8b756b6..fe61f85bcf33 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -5,6 +5,7 @@
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/xdp.h>
#include "netdev-genl-gen.h"
@@ -12,15 +13,24 @@ static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
+ u64 xdp_rx_meta = 0;
void *hdr;
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+ xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
- netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+ netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ xdp_rx_meta, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
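
The XDP_METADATA_KFUNC() x-macro now also carries a feature flag and the matching xdp_metadata_ops member, which is how netdev_nl_dev_fill() derives the RX-metadata feature mask from the ops a driver implements, and why the users in net/core/xdp.c below move to the wider parameter list. A generic sketch of the technique, with hypothetical list and flag names:

#include <linux/bits.h>
#include <net/xdp.h>

/* Hypothetical x-macro list: each entry names an id, a feature bit and an
 * ops member; each user expands only the columns it cares about.
 */
#define EXAMPLE_KFUNC_xxx \
	EXAMPLE_KFUNC(EXAMPLE_RX_TIMESTAMP, BIT(0), xmo_rx_timestamp) \
	EXAMPLE_KFUNC(EXAMPLE_RX_HASH,      BIT(1), xmo_rx_hash)

static u64 example_rx_meta_features(const struct xdp_metadata_ops *ops)
{
	u64 features = 0;

#define EXAMPLE_KFUNC(id, flag, xmo)	\
	if (ops && ops->xmo)		\
		features |= (flag);
	EXAMPLE_KFUNC_xxx
#undef EXAMPLE_KFUNC
	return features;
}
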
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f56b8d697014..5e865af82e5b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -200,6 +200,7 @@
pf(VID_RND) /* Random VLAN ID */ \
pf(SVID_RND) /* Random SVLAN ID */ \
pf(NODE) /* Node memory alloc*/ \
+ pf(SHARED) /* Shared SKB */ \
#define pf(flag) flag##_SHIFT,
enum pkt_flags {
@@ -1198,7 +1199,8 @@ static ssize_t pktgen_if_write(struct file *file,
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
- if (value > 0 && pkt_dev->n_imix_entries > 0)
+ if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ !(pkt_dev->flags & F_SHARED)))
return -EINVAL;
i += len;
@@ -1257,6 +1259,10 @@ static ssize_t pktgen_if_write(struct file *file,
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
+
+ if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ return -EINVAL;
+
pkt_dev->burst = value < 1 ? 1 : value;
sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
@@ -1318,9 +1324,10 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "flag")) {
+ bool disable = false;
__u32 flag;
char f[32];
- bool disable = false;
+ char *end;
memset(f, 0, 32);
len = strn_len(&user_buffer[i], sizeof(f) - 1);
@@ -1332,28 +1339,42 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
flag = pktgen_read_flag(f, &disable);
-
if (flag) {
- if (disable)
+ if (disable) {
+ /* If "clone_skb", or "burst" parameters are
+ * configured, it means that the skb still
+ * needs to be referenced by the pktgen, so
+ * the skb must be shared.
+ */
+ if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ pkt_dev->burst > 1))
+ return -EINVAL;
pkt_dev->flags &= ~flag;
- else
+ } else {
pkt_dev->flags |= flag;
- } else {
- sprintf(pg_result,
- "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
- "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
- "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
- "NO_TIMESTAMP, "
-#ifdef CONFIG_XFRM
- "IPSEC, "
-#endif
- "NODE_ALLOC\n");
+ }
+
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
- sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+
+ /* Unknown flag */
+ end = pkt_dev->result + sizeof(pkt_dev->result);
+ pg_result += sprintf(pg_result,
+ "Flag -:%s:- unknown\n"
+ "Available flags, (prepend ! to un-set flag):\n", f);
+
+ for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ continue;
+ pg_result += snprintf(pg_result, end - pg_result,
+ "%s, ", pkt_flag_names[n]);
+ }
+ if (!WARN_ON_ONCE(pg_result >= end)) {
+ /* Remove the comma and whitespace at the end */
+ *(pg_result - 2) = '\0';
+ }
+
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
@@ -3440,12 +3461,24 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
- unsigned int burst = READ_ONCE(pkt_dev->burst);
+ bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ unsigned int burst = 1;
struct sk_buff *skb;
+ int clone_skb = 0;
int ret;
+ /* If 'skb_shared' is false, skip reading any new values for 'burst' and
+ * 'clone_skb' so that concurrent changes cannot slip in mid-run; the
+ * stabilized config is picked up on the next run of pktgen_xmit.
+ */
+ if (skb_shared) {
+ burst = READ_ONCE(pkt_dev->burst);
+ clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ }
+
/* If device is offline, then don't send */
if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
pktgen_stop_device(pkt_dev);
@@ -3462,7 +3495,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* If no skb or clone count exhausted then get new one */
if (!pkt_dev->skb || (pkt_dev->last_ok &&
- ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ ++pkt_dev->clone_count >= clone_skb)) {
/* build a new pkt */
kfree_skb(pkt_dev->skb);
@@ -3483,7 +3516,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
skb = pkt_dev->skb;
skb->protocol = eth_type_trans(skb, skb->dev);
- refcount_add(burst, &skb->users);
+ if (skb_shared)
+ refcount_add(burst, &skb->users);
local_bh_disable();
do {
ret = netif_receive_skb(skb);
@@ -3491,6 +3525,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
+ if (unlikely(!skb_shared)) {
+ pkt_dev->skb = NULL;
+ break;
+ }
if (refcount_read(&skb->users) != burst) {
/* skb was queued by rps/rfs or taps,
* so cannot reuse this skb
@@ -3509,9 +3547,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
local_bh_disable();
- refcount_inc(&pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_inc(&pkt_dev->skb->users);
ret = dev_queue_xmit(pkt_dev->skb);
+
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NET_XMIT_SUCCESS:
pkt_dev->sofar++;
@@ -3549,11 +3592,15 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
goto unlock;
}
- refcount_add(burst, &pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_add(burst, &pkt_dev->skb->users);
xmit_more:
ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NETDEV_TX_OK:
pkt_dev->last_ok = 1;
@@ -3575,7 +3622,8 @@ xmit_more:
fallthrough;
case NETDEV_TX_BUSY:
/* Retry it next time */
- refcount_dec(&(pkt_dev->skb->users));
+ if (skb_shared)
+ refcount_dec(&pkt_dev->skb->users);
pkt_dev->last_ok = 0;
}
if (unlikely(burst))
@@ -3588,7 +3636,8 @@ out:
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- pktgen_wait_for_skb(pkt_dev);
+ if (pkt_dev->skb)
+ pktgen_wait_for_skb(pkt_dev);
/* Done with this */
pktgen_stop_device(pkt_dev);
@@ -3771,6 +3820,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
pkt_dev->node = NUMA_NO_NODE;
+ pkt_dev->flags = F_SHARED; /* SKB shared by default */
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
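
The new SHARED flag (set by default in pktgen_add_device() above) decides whether pktgen keeps ownership of the skb across transmissions: a shared skb gets extra references and is reused, while a non-shared one is handed to the stack exactly once and rebuilt for the next round. A condensed, illustrative restatement of that rule:

#include <linux/netdevice.h>

/* Illustrative helper only: with a non-shared skb, a completed transmit
 * means the driver/stack now owns the buffer, so pktgen must drop its
 * pointer instead of reusing it.
 */
static bool example_skb_consumed(bool skb_shared, int xmit_ret)
{
	return !skb_shared && dev_xmit_complete(xmit_ret);
}
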
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4a2ec33bfb51..7452a6d190c5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -57,6 +57,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
#endif
+#include <linux/dpll.h>
#include "dev.h"
@@ -1055,6 +1056,15 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev)
return size;
}
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+ size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev));
+
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1111,6 +1121,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_prop_list_size(dev)
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ rtnl_devlink_port_size(dev)
+ + rtnl_dpll_pin_size(dev)
+ 0;
}
@@ -1774,6 +1785,28 @@ nest_cancel:
return ret;
}
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *dpll_pin_nest;
+ int ret;
+
+ dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+ if (!dpll_pin_nest)
+ return -EMSGSIZE;
+
+ ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev));
+ if (ret < 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, dpll_pin_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, dpll_pin_nest);
+ return ret;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
@@ -1916,6 +1949,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_fill_devlink_port(skb, dev))
goto nla_put_failure;
+ if (rtnl_fill_dpll_pin(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
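
rtnl_fill_dpll_pin() reports whatever pin was attached through the helpers added in net/core/dev.c earlier in this diff. A hypothetical driver-side sketch of assigning and clearing the pin (error handling omitted):

#include <linux/netdevice.h>

struct dpll_pin;	/* opaque handle from the DPLL subsystem */

/* Hypothetical probe/remove snippets, for illustration only. */
static void example_attach_pin(struct net_device *netdev, struct dpll_pin *pin)
{
	netdev_dpll_pin_set(netdev, pin);	/* takes rtnl_lock internally */
}

static void example_detach_pin(struct net_device *netdev)
{
	netdev_dpll_pin_clear(netdev);
}
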
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4eaf7ed0d1f4..da3f96bdd6f6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -847,6 +847,8 @@ EXPORT_SYMBOL(__napi_alloc_skb);
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_fill_page_desc(skb, i, page, off, size);
skb->len += size;
skb->data_len += size;
@@ -859,6 +861,8 @@ void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_frag_size_add(frag, size);
skb->len += size;
skb->data_len += size;
@@ -3718,10 +3722,19 @@ EXPORT_SYMBOL(skb_dequeue_tail);
void skb_queue_purge_reason(struct sk_buff_head *list,
enum skb_drop_reason reason)
{
- struct sk_buff *skb;
+ struct sk_buff_head tmp;
+ unsigned long flags;
+
+ if (skb_queue_empty_lockless(list))
+ return;
+
+ __skb_queue_head_init(&tmp);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_splice_init(list, &tmp);
+ spin_unlock_irqrestore(&list->lock, flags);
- while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb_reason(skb, reason);
+ __skb_queue_purge_reason(&tmp, reason);
}
EXPORT_SYMBOL(skb_queue_purge_reason);
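
The rewritten skb_queue_purge_reason() takes the queue lock just once, splices everything onto a private list, and frees the skbs with the lock dropped, instead of locking and unlocking per packet. The same pattern in a standalone, illustrative form:

#include <linux/skbuff.h>
#include <linux/spinlock.h>

/* Generic "splice under the lock, free outside it" sketch mirroring the
 * change above; uses the plain purge helper rather than the _reason variant.
 */
static void example_purge(struct sk_buff_head *list)
{
	struct sk_buff_head tmp;
	unsigned long flags;

	if (skb_queue_empty_lockless(list))
		return;

	__skb_queue_head_init(&tmp);

	spin_lock_irqsave(&list->lock, flags);
	skb_queue_splice_init(list, &tmp);	/* list is now empty */
	spin_unlock_irqrestore(&list->lock, flags);

	__skb_queue_purge(&tmp);		/* no lock needed on the private list */
}
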
diff --git a/net/core/sock.c b/net/core/sock.c
index 16584e2dd648..290165954379 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -600,7 +600,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- sk->sk_dst_pending_confirm = 0;
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -759,7 +759,7 @@ out:
return ret;
}
-bool sk_mc_loop(struct sock *sk)
+bool sk_mc_loop(const struct sock *sk)
{
if (dev_recursion_level())
return false;
@@ -771,7 +771,7 @@ bool sk_mc_loop(struct sock *sk)
return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- return inet6_sk(sk)->mc_loop;
+ return inet6_test_bit(MC6_LOOP, sk);
#endif
}
WARN_ON_ONCE(1);
@@ -806,9 +806,7 @@ EXPORT_SYMBOL(sock_no_linger);
void sock_set_priority(struct sock *sk, u32 priority)
{
- lock_sock(sk);
WRITE_ONCE(sk->sk_priority, priority);
- release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
@@ -1118,6 +1116,83 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
valbool = val ? 1 : 0;
+ /* handle options which do not require locking the socket. */
+ switch (optname) {
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ sock_set_priority(sk, val);
+ return 0;
+ }
+ return -EPERM;
+ case SO_PASSSEC:
+ assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
+ return 0;
+ case SO_PASSCRED:
+ assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
+ return 0;
+ case SO_PASSPIDFD:
+ assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
+ return 0;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ return -ENOPROTOOPT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ if (val < 0)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ return 0;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ return 0;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
+ !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (val < 0 || val > U16_MAX)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
+ return 0;
+#endif
+ case SO_MAX_PACING_RATE:
+ {
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+ unsigned long pacing_rate;
+
+ if (sizeof(ulval) != sizeof(val) &&
+ optlen >= sizeof(ulval) &&
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+ return -EFAULT;
+ }
+ if (ulval != ~0UL)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
+ /* Pairs with READ_ONCE() from sk_getsockopt() */
+ WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+ pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+ if (ulval < pacing_rate)
+ WRITE_ONCE(sk->sk_pacing_rate, ulval);
+ return 0;
+ }
+ case SO_TXREHASH:
+ if (val < -1 || val > 1)
+ return -EINVAL;
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+ /* Paired with READ_ONCE() in tcp_rtx_synack()
+ * and sk_getsockopt().
+ */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ return 0;
+ }
+
sockopt_lock_sock(sk);
switch (optname) {
@@ -1133,12 +1208,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
- case SO_TYPE:
- case SO_PROTOCOL:
- case SO_DOMAIN:
- case SO_ERROR:
- ret = -ENOPROTOOPT;
- break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
sk_dst_reset(sk);
@@ -1213,15 +1282,6 @@ set_sndbuf:
sk->sk_no_check_tx = valbool;
break;
- case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- WRITE_ONCE(sk->sk_priority, val);
- else
- ret = -EPERM;
- break;
-
case SO_LINGER:
if (optlen < sizeof(ling)) {
ret = -EINVAL; /* 1003.1g */
@@ -1247,14 +1307,6 @@ set_sndbuf:
case SO_BSDCOMPAT:
break;
- case SO_PASSCRED:
- assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
- break;
-
- case SO_PASSPIDFD:
- assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
- break;
-
case SO_TIMESTAMP_OLD:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
@@ -1360,9 +1412,6 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
break;
- case SO_PASSSEC:
- assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
- break;
case SO_MARK:
if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
@@ -1404,50 +1453,7 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- case SO_BUSY_POLL:
- if (val < 0)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_ll_usec, val);
- break;
- case SO_PREFER_BUSY_POLL:
- if (valbool && !sockopt_capable(CAP_NET_ADMIN))
- ret = -EPERM;
- else
- WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
- break;
- case SO_BUSY_POLL_BUDGET:
- if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else {
- if (val < 0 || val > U16_MAX)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_busy_poll_budget, val);
- }
- break;
-#endif
-
- case SO_MAX_PACING_RATE:
- {
- unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
- if (sizeof(ulval) != sizeof(val) &&
- optlen >= sizeof(ulval) &&
- copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
- ret = -EFAULT;
- break;
- }
- if (ulval != ~0UL)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- /* Pairs with READ_ONCE() from sk_getsockopt() */
- WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
- sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
- break;
- }
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
break;
@@ -1532,19 +1538,6 @@ set_sndbuf:
break;
}
- case SO_TXREHASH:
- if (val < -1 || val > 1) {
- ret = -EINVAL;
- break;
- }
- if ((u8)val == SOCK_TXREHASH_DEFAULT)
- val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
- /* Paired with READ_ONCE() in tcp_rtx_synack()
- * and sk_getsockopt().
- */
- WRITE_ONCE(sk->sk_txrehash, (u8)val);
- break;
-
default:
ret = -ENOPROTOOPT;
break;
@@ -3001,6 +2994,11 @@ void __sk_flush_backlog(struct sock *sk)
{
spin_lock_bh(&sk->sk_lock.slock);
__release_sock(sk);
+
+ if (sk->sk_prot->release_cb)
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
+
spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
@@ -3519,11 +3517,9 @@ void release_sock(struct sock *sk)
if (sk->sk_backlog.tail)
__release_sock(sk);
- /* Warning : release_cb() might need to release sk ownership,
- * ie call sock_release_ownership(sk) before us.
- */
if (sk->sk_prot->release_cb)
- sk->sk_prot->release_cb(sk);
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a70670fe9a2d..df4789ab512d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
__diag_pop();
BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
};
BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 69453b936bd5..1b8cbfda6e5d 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -511,7 +511,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt),
- inet_sk(sk)->tos);
+ READ_ONCE(inet_sk(sk)->tos));
rcu_read_unlock();
err = net_xmit_eval(err);
}
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c693a570682f..8d344b219f84 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -180,7 +180,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- if (!sock_owned_by_user(sk) && np->recverr) {
+ if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
sk->sk_err = err;
sk_error_report(sk);
} else {
@@ -239,7 +239,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
if (!opt)
opt = rcu_dereference(np->opt);
err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt,
- np->tclass, sk->sk_priority);
+ np->tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -671,10 +671,10 @@ ipv6_pktoptions:
if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
np->mcast_oif = inet6_iif(opt_skb);
if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
- np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+ WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
if (ipv6_opt_accepted(sk, opt_skb,
&DCCP_SKB_CB(opt_skb)->header.h6)) {
@@ -839,7 +839,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
memset(&fl6, 0, sizeof(fl6));
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
IP6_ECN_flow_init(fl6.flowlabel);
if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
diff --git a/net/devlink/core.c b/net/devlink/core.c
index 6cec4afb01fb..bcbbb952569f 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -16,6 +16,219 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
+static struct devlink *devlinks_xa_get(unsigned long index)
+{
+ struct devlink *devlink;
+
+ rcu_read_lock();
+ devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED);
+ if (!devlink || !devlink_try_get(devlink))
+ devlink = NULL;
+ rcu_read_unlock();
+ return devlink;
+}
+
+/* The devlink_rels xarray contains 1:1 relationships between a devlink
+ * object and its related nested devlink instance. The xarray index is
+ * used by the nested-in object's code to look up the nested object.
+ */
+static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1);
+
+#define DEVLINK_REL_IN_USE XA_MARK_0
+
+struct devlink_rel {
+ u32 index;
+ refcount_t refcount;
+ u32 devlink_index;
+ struct {
+ u32 devlink_index;
+ u32 obj_index;
+ devlink_rel_notify_cb_t *notify_cb;
+ devlink_rel_cleanup_cb_t *cleanup_cb;
+ struct work_struct notify_work;
+ } nested_in;
+};
+
+static void devlink_rel_free(struct devlink_rel *rel)
+{
+ xa_erase(&devlink_rels, rel->index);
+ kfree(rel);
+}
+
+static void __devlink_rel_get(struct devlink_rel *rel)
+{
+ refcount_inc(&rel->refcount);
+}
+
+static void __devlink_rel_put(struct devlink_rel *rel)
+{
+ if (refcount_dec_and_test(&rel->refcount))
+ devlink_rel_free(rel);
+}
+
+static void devlink_rel_nested_in_notify_work(struct work_struct *work)
+{
+ struct devlink_rel *rel = container_of(work, struct devlink_rel,
+ nested_in.notify_work);
+ struct devlink *devlink;
+
+ devlink = devlinks_xa_get(rel->nested_in.devlink_index);
+ if (!devlink)
+ goto rel_put;
+ if (!devl_trylock(devlink)) {
+ devlink_put(devlink);
+ goto reschedule_work;
+ }
+ if (!devl_is_registered(devlink)) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ goto rel_put;
+ }
+ if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE))
+ rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index);
+ rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index);
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+rel_put:
+ __devlink_rel_put(rel);
+ return;
+
+reschedule_work:
+ schedule_work(&rel->nested_in.notify_work);
+}
+
+static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel)
+{
+ __devlink_rel_get(rel);
+ schedule_work(&rel->nested_in.notify_work);
+}
+
+static struct devlink_rel *devlink_rel_alloc(void)
+{
+ struct devlink_rel *rel;
+ static u32 next;
+ int err;
+
+ rel = kzalloc(sizeof(*rel), GFP_KERNEL);
+ if (!rel)
+ return ERR_PTR(-ENOMEM);
+
+ err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel,
+ xa_limit_32b, &next, GFP_KERNEL);
+ if (err) {
+ kfree(rel);
+ return ERR_PTR(err);
+ }
+
+ refcount_set(&rel->refcount, 1);
+ INIT_WORK(&rel->nested_in.notify_work,
+ &devlink_rel_nested_in_notify_work);
+ return rel;
+}
+
+static void devlink_rel_put(struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink->rel;
+
+ if (!rel)
+ return;
+ xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE);
+ devlink_rel_nested_in_notify_work_schedule(rel);
+ __devlink_rel_put(rel);
+ devlink->rel = NULL;
+}
+
+void devlink_rel_nested_in_clear(u32 rel_index)
+{
+ xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE);
+}
+
+int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
+ u32 obj_index, devlink_rel_notify_cb_t *notify_cb,
+ devlink_rel_cleanup_cb_t *cleanup_cb,
+ struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink_rel_alloc();
+
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ if (IS_ERR(rel))
+ return PTR_ERR(rel);
+
+ rel->devlink_index = devlink->index;
+ rel->nested_in.devlink_index = devlink_index;
+ rel->nested_in.obj_index = obj_index;
+ rel->nested_in.notify_cb = notify_cb;
+ rel->nested_in.cleanup_cb = cleanup_cb;
+ *rel_index = rel->index;
+ xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE);
+ devlink->rel = rel;
+ return 0;
+}
+
+void devlink_rel_nested_in_notify(struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink->rel;
+
+ if (!rel)
+ return;
+ devlink_rel_nested_in_notify_work_schedule(rel);
+}
+
+static struct devlink_rel *devlink_rel_find(unsigned long rel_index)
+{
+ return xa_find(&devlink_rels, &rel_index, rel_index,
+ DEVLINK_REL_IN_USE);
+}
+
+static struct devlink *devlink_rel_devlink_get_lock(u32 rel_index)
+{
+ struct devlink *devlink;
+ struct devlink_rel *rel;
+ u32 devlink_index;
+
+ if (!rel_index)
+ return NULL;
+ xa_lock(&devlink_rels);
+ rel = devlink_rel_find(rel_index);
+ if (rel)
+ devlink_index = rel->devlink_index;
+ xa_unlock(&devlink_rels);
+ if (!rel)
+ return NULL;
+ devlink = devlinks_xa_get(devlink_index);
+ if (!devlink)
+ return NULL;
+ devl_lock(devlink);
+ if (!devl_is_registered(devlink)) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return NULL;
+ }
+ return devlink;
+}
+
+int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
+ u32 rel_index, int attrtype,
+ bool *msg_updated)
+{
+ struct net *net = devlink_net(devlink);
+ struct devlink *rel_devlink;
+ int err;
+
+ rel_devlink = devlink_rel_devlink_get_lock(rel_index);
+ if (!rel_devlink)
+ return 0;
+ err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype);
+ devl_unlock(rel_devlink);
+ devlink_put(rel_devlink);
+ if (!err && msg_updated)
+ *msg_updated = true;
+ return err;
+}
+
void *devlink_priv(struct devlink *devlink)
{
return &devlink->priv;
@@ -142,6 +355,7 @@ int devl_register(struct devlink *devlink)
xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
devlink_notify_register(devlink);
+ devlink_rel_nested_in_notify(devlink);
return 0;
}
@@ -166,6 +380,7 @@ void devl_unregister(struct devlink *devlink)
devlink_notify_unregister(devlink);
xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ devlink_rel_put(devlink);
}
EXPORT_SYMBOL_GPL(devl_unregister);
@@ -215,6 +430,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC);
xa_init_flags(&devlink->params, XA_FLAGS_ALLOC);
xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC);
write_pnet(&devlink->_net, net);
INIT_LIST_HEAD(&devlink->rate_list);
INIT_LIST_HEAD(&devlink->linecard_list);
@@ -261,6 +477,7 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->linecard_list));
WARN_ON(!xa_empty(&devlink->ports));
+ xa_destroy(&devlink->nested_rels);
xa_destroy(&devlink->snapshot_ids);
xa_destroy(&devlink->params);
xa_destroy(&devlink->ports);
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index bba4ace7d22b..dc8039ca2b38 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -138,6 +138,23 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int devlink_nl_nested_fill(struct sk_buff *msg, struct devlink *devlink)
+{
+ unsigned long rel_index;
+ void *unused;
+ int err;
+
+ xa_for_each(&devlink->nested_rels, rel_index, unused) {
+ err = devlink_rel_devlink_handle_put(msg, devlink,
+ rel_index,
+ DEVLINK_ATTR_NESTED_DEVLINK,
+ NULL);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
@@ -164,6 +181,10 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
goto dev_stats_nest_cancel;
nla_nest_end(msg, dev_stats);
+
+ if (devlink_nl_nested_fill(msg, devlink))
+ goto nla_put_failure;
+
genlmsg_end(msg, hdr);
return 0;
@@ -230,6 +251,34 @@ int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one);
}
+static void devlink_rel_notify_cb(struct devlink *devlink, u32 obj_index)
+{
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void devlink_rel_cleanup_cb(struct devlink *devlink, u32 obj_index,
+ u32 rel_index)
+{
+ xa_erase(&devlink->nested_rels, rel_index);
+}
+
+int devl_nested_devlink_set(struct devlink *devlink,
+ struct devlink *nested_devlink)
+{
+ u32 rel_index;
+ int err;
+
+ err = devlink_rel_nested_in_add(&rel_index, devlink->index, 0,
+ devlink_rel_notify_cb,
+ devlink_rel_cleanup_cb,
+ nested_devlink);
+ if (err)
+ return err;
+ return xa_insert(&devlink->nested_rels, rel_index,
+ xa_mk_value(0), GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(devl_nested_devlink_set);
+
void devlink_notify_register(struct devlink *devlink)
{
devlink_notify(devlink, DEVLINK_CMD_NEW);
@@ -372,6 +421,7 @@ static void devlink_reload_netns_change(struct devlink *devlink,
devlink_notify_unregister(devlink);
write_pnet(&devlink->_net, dest_net);
devlink_notify_register(devlink);
+ devlink_rel_nested_in_notify(devlink);
}
int devlink_reload(struct devlink *devlink, struct net *dest_net,
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index f6b5fea2e13c..741d1bf1bec8 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -17,6 +17,8 @@
#include "netlink_gen.h"
+struct devlink_rel;
+
#define DEVLINK_REGISTERED XA_MARK_1
#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \
@@ -55,6 +57,8 @@ struct devlink {
u8 reload_failed:1;
refcount_t refcount;
struct rcu_work rwork;
+ struct devlink_rel *rel;
+ struct xarray nested_rels;
char priv[] __aligned(NETDEV_ALIGN);
};
@@ -92,6 +96,20 @@ static inline bool devl_is_registered(struct devlink *devlink)
return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
}
+typedef void devlink_rel_notify_cb_t(struct devlink *devlink, u32 obj_index);
+typedef void devlink_rel_cleanup_cb_t(struct devlink *devlink, u32 obj_index,
+ u32 rel_index);
+
+void devlink_rel_nested_in_clear(u32 rel_index);
+int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
+ u32 obj_index, devlink_rel_notify_cb_t *notify_cb,
+ devlink_rel_cleanup_cb_t *cleanup_cb,
+ struct devlink *devlink);
+void devlink_rel_nested_in_notify(struct devlink *devlink);
+int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
+ u32 rel_index, int attrtype,
+ bool *msg_updated);
+
/* Netlink */
#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
@@ -145,6 +163,8 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
return 0;
}
+int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
+ struct devlink *devlink, int attrtype);
int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info);
/* Notify */
@@ -206,19 +226,7 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
struct netlink_ext_ack *extack);
/* Linecards */
-struct devlink_linecard {
- struct list_head list;
- struct devlink *devlink;
- unsigned int index;
- const struct devlink_linecard_ops *ops;
- void *priv;
- enum devlink_linecard_state state;
- struct mutex state_lock; /* Protects state */
- const char *type;
- struct devlink_linecard_type *types;
- unsigned int types_count;
- struct devlink *nested_devlink;
-};
+unsigned int devlink_linecard_index(struct devlink_linecard *linecard);
/* Devlink nl cmds */
int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c
index 85c32c314b0f..9ff1813f88c5 100644
--- a/net/devlink/linecard.c
+++ b/net/devlink/linecard.c
@@ -6,6 +6,25 @@
#include "devl_internal.h"
+struct devlink_linecard {
+ struct list_head list;
+ struct devlink *devlink;
+ unsigned int index;
+ const struct devlink_linecard_ops *ops;
+ void *priv;
+ enum devlink_linecard_state state;
+ struct mutex state_lock; /* Protects state */
+ const char *type;
+ struct devlink_linecard_type *types;
+ unsigned int types_count;
+ u32 rel_index;
+};
+
+unsigned int devlink_linecard_index(struct devlink_linecard *linecard)
+{
+ return linecard->index;
+}
+
static struct devlink_linecard *
devlink_linecard_get_by_index(struct devlink *devlink,
unsigned int linecard_index)
@@ -46,24 +65,6 @@ devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info)
return devlink_linecard_get_from_attrs(devlink, info->attrs);
}
-static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink)
-{
- struct nlattr *nested_attr;
-
- nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK);
- if (!nested_attr)
- return -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
-
- nla_nest_end(msg, nested_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(msg, nested_attr);
- return -EMSGSIZE;
-}
-
struct devlink_linecard_type {
const char *type;
const void *priv;
@@ -111,8 +112,10 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg,
nla_nest_end(msg, attr);
}
- if (linecard->nested_devlink &&
- devlink_nl_put_nested_handle(msg, linecard->nested_devlink))
+ if (devlink_rel_devlink_handle_put(msg, devlink,
+ linecard->rel_index,
+ DEVLINK_ATTR_NESTED_DEVLINK,
+ NULL))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -521,7 +524,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
{
mutex_lock(&linecard->state_lock);
- WARN_ON(linecard->nested_devlink);
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
linecard->type = NULL;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
@@ -540,7 +542,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
{
mutex_lock(&linecard->state_lock);
- WARN_ON(linecard->nested_devlink);
linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
@@ -588,6 +589,27 @@ void devlink_linecard_deactivate(struct devlink_linecard *linecard)
}
EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
+static void devlink_linecard_rel_notify_cb(struct devlink *devlink,
+ u32 linecard_index)
+{
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (!linecard)
+ return;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+}
+
+static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink,
+ u32 linecard_index, u32 rel_index)
+{
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (linecard && linecard->rel_index == rel_index)
+ linecard->rel_index = 0;
+}
+
/**
* devlink_linecard_nested_dl_set - Attach/detach nested devlink
* instance to linecard.
@@ -595,12 +617,14 @@ EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
* @linecard: devlink linecard
* @nested_devlink: devlink instance to attach or NULL to detach
*/
-void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
- struct devlink *nested_devlink)
+int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+ struct devlink *nested_devlink)
{
- mutex_lock(&linecard->state_lock);
- linecard->nested_devlink = nested_devlink;
- devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
- mutex_unlock(&linecard->state_lock);
+ return devlink_rel_nested_in_add(&linecard->rel_index,
+ linecard->devlink->index,
+ linecard->index,
+ devlink_linecard_rel_notify_cb,
+ devlink_linecard_rel_cleanup_cb,
+ nested_devlink);
}
EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
index fc3e7c029a3b..499304d9de49 100644
--- a/net/devlink/netlink.c
+++ b/net/devlink/netlink.c
@@ -82,6 +82,32 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG },
};
+int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
+ struct devlink *devlink, int attrtype)
+{
+ struct nlattr *nested_attr;
+
+ nested_attr = nla_nest_start(msg, attrtype);
+ if (!nested_attr)
+ return -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (!net_eq(net, devlink_net(devlink))) {
+ int id = peernet2id_alloc(net, devlink_net(devlink),
+ GFP_KERNEL);
+
+ if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(msg, nested_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, nested_attr);
+ return -EMSGSIZE;
+}
+
int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info)
{
int err;
diff --git a/net/devlink/port.c b/net/devlink/port.c
index 4763b42885fb..4e9003242448 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -428,6 +428,13 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
if (err)
goto out;
err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_rel_devlink_handle_put(msg, port->devlink,
+ port->rel_index,
+ DEVLINK_PORT_FN_ATTR_DEVLINK,
+ &msg_updated);
+
out:
if (err || !msg_updated)
nla_nest_cancel(msg, function_attr);
@@ -483,7 +490,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg,
goto nla_put_failure;
if (devlink_port->linecard &&
nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX,
- devlink_port->linecard->index))
+ devlink_linecard_index(devlink_port->linecard)))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -1392,6 +1399,50 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
+static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index)
+{
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index,
+ u32 rel_index)
+{
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (devlink_port && devlink_port->rel_index == rel_index)
+ devlink_port->rel_index = 0;
+}
+
+/**
+ * devl_port_fn_devlink_set - Attach peer devlink
+ * instance to port function.
+ * @devlink_port: devlink port
+ * @fn_devlink: devlink instance to attach
+ */
+int devl_port_fn_devlink_set(struct devlink_port *devlink_port,
+ struct devlink *fn_devlink)
+{
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
+ if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF ||
+ devlink_port->attrs.pci_sf.external))
+ return -EINVAL;
+
+ return devlink_rel_nested_in_add(&devlink_port->rel_index,
+ devlink_port->devlink->index,
+ devlink_port->index,
+ devlink_port_rel_notify_cb,
+ devlink_port_rel_cleanup_cb,
+ fn_devlink);
+}
+EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set);
+
/**
* devlink_port_linecard_set - Link port with a linecard
*
@@ -1420,7 +1471,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
if (devlink_port->linecard)
n = snprintf(name, len, "l%u",
- devlink_port->linecard->index);
+ devlink_linecard_index(devlink_port->linecard));
if (n < len)
n += snprintf(name + n, len - n, "p%u",
attrs->phys.port_number);
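
devl_port_fn_devlink_set() hooks an SF port function up to the devlink instance that represents the SF, via the rel machinery introduced in net/devlink/core.c above. A hypothetical call site in an SF driver (names invented for illustration):

#include <net/devlink.h>

/* Hypothetical: once the SF's own devlink instance exists, expose it under
 * the parent port's function attributes. The port must be a registered,
 * non-external PCI_SF port, as enforced by the WARN_ON above.
 */
static int example_sf_expose(struct devlink_port *sf_port, struct devlink *sf_devlink)
{
	return devl_port_fn_devlink_set(sf_port, sf_devlink);
}
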
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 37ab238e8304..5f01bd4f9dec 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -2024,7 +2024,8 @@ void dsa_shared_port_link_unregister_of(struct dsa_port *dp)
dsa_shared_port_setup_phy_of(dp, false);
}
-int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr)
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+ struct netlink_ext_ack *extack)
{
struct dsa_switch *ds = dp->ds;
int err;
@@ -2034,7 +2035,7 @@ int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr)
dp->hsr_dev = hsr;
- err = ds->ops->port_hsr_join(ds, dp->index, hsr);
+ err = ds->ops->port_hsr_join(ds, dp->index, hsr, extack);
if (err)
dp->hsr_dev = NULL;
diff --git a/net/dsa/port.h b/net/dsa/port.h
index dc812512fd0e..334879964e2c 100644
--- a/net/dsa/port.h
+++ b/net/dsa/port.h
@@ -103,7 +103,8 @@ int dsa_port_phylink_create(struct dsa_port *dp);
void dsa_port_phylink_destroy(struct dsa_port *dp);
int dsa_shared_port_link_register_of(struct dsa_port *dp);
void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
-int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr);
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+ struct netlink_ext_ack *extack);
void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 48db91b33390..4c3e502d7e16 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -457,6 +457,13 @@ static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
+ if (ds->ops->port_set_mac_address) {
+ err = ds->ops->port_set_mac_address(ds, dp->index,
+ addr->sa_data);
+ if (err)
+ return err;
+ }
+
/* If the port is down, the address isn't synced yet to hardware or
* to the DSA master, so there is nothing to change.
*/
@@ -2862,7 +2869,7 @@ static int dsa_slave_changeupper(struct net_device *dev,
}
} else if (is_hsr_master(info->upper_dev)) {
if (info->linking) {
- err = dsa_port_hsr_join(dp, info->upper_dev);
+ err = dsa_port_hsr_join(dp, info->upper_dev, extack);
if (err == -EOPNOTSUPP) {
NL_SET_ERR_MSG_WEAK_MOD(extack,
"Offloading not supported");
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index ea100bd25939..3632e47dea9e 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -293,6 +293,14 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
if (is_link_local_ether_addr(hdr->h_dest))
val |= KSZ9477_TAIL_TAG_OVERRIDE;
+ if (dev->features & NETIF_F_HW_HSR_DUP) {
+ struct net_device *hsr_dev = dp->hsr_dev;
+ struct dsa_port *other_dp;
+
+ dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev)
+ val |= BIT(other_dp->index);
+ }
+
*tag = cpu_to_be16(val);
return ksz_defer_xmit(dp, skb);
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
index 233be5cbfec9..f55d14d7b726 100644
--- a/net/handshake/genl.c
+++ b/net/handshake/genl.c
@@ -18,7 +18,7 @@ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HAN
/* HANDSHAKE_CMD_DONE - do */
static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
[HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
- [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_U32, },
+ [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_S32, },
[HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
};
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
index d0bc1dd8e65a..64a0046dd611 100644
--- a/net/handshake/netlink.c
+++ b/net/handshake/netlink.c
@@ -163,7 +163,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD))
return -EINVAL;
- fd = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
+ fd = nla_get_s32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
sock = sockfd_lookup(fd, &err);
if (!sock)
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
index bbfb4095ddd6..d697f68c598c 100644
--- a/net/handshake/tlshd.c
+++ b/net/handshake/tlshd.c
@@ -173,9 +173,9 @@ static int tls_handshake_put_certificate(struct sk_buff *msg,
if (!entry_attr)
return -EMSGSIZE;
- if (nla_put_u32(msg, HANDSHAKE_A_X509_CERT,
+ if (nla_put_s32(msg, HANDSHAKE_A_X509_CERT,
treq->th_certificate) ||
- nla_put_u32(msg, HANDSHAKE_A_X509_PRIVKEY,
+ nla_put_s32(msg, HANDSHAKE_A_X509_PRIVKEY,
treq->th_privkey)) {
nla_nest_cancel(msg, entry_attr);
return -EMSGSIZE;
@@ -214,7 +214,7 @@ static int tls_handshake_accept(struct handshake_req *req,
goto out_cancel;
ret = -EMSGSIZE;
- ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
+ ret = nla_put_s32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
if (ret < 0)
goto out_cancel;
ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type);
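
The u32 to s32 switch for the sockfd, certificate and private-key attributes reflects that these carry file descriptors and key serials, which are signed 32-bit values in userspace; the read side in net/handshake/netlink.c above moves to nla_get_s32() accordingly. A minimal policy/parse pairing with a hypothetical attribute:

#include <net/netlink.h>

enum { EXAMPLE_A_UNSPEC, EXAMPLE_A_FD };	/* hypothetical attribute set */

static const struct nla_policy example_policy[EXAMPLE_A_FD + 1] = {
	[EXAMPLE_A_FD] = { .type = NLA_S32, },
};

/* Signed parsing keeps negative values (e.g. an invalid fd of -1) intact. */
static int example_get_fd(struct nlattr **attrs)
{
	return nla_get_s32(attrs[EXAMPLE_A_FD]);
}
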
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index cb5dbee9e018..2cc50cbfc2a3 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,11 +39,11 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
- oif = inet->mc_index;
+ oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!oif) {
- oif = inet->uc_index;
+ oif = READ_ONCE(inet->uc_index);
}
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 418e5fb58fd3..76c3ea75b8dd 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2944,8 +2944,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
continue;
state->im = rcu_dereference(state->idev->mc_list);
}
- if (!state->im)
- break;
spin_lock_bh(&state->im->lock);
psf = state->im->sources;
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e13a84433413..f01aee832aab 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -134,7 +134,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
* hence this needs to be included regardless of socket family.
*/
if (ext & (1 << (INET_DIAG_TOS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
goto errout;
#if IS_ENABLED(CONFIG_IPV6)
@@ -165,7 +165,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
* For cgroup2 classid is always zero.
*/
if (!classid)
- classid = sk->sk_priority;
+ classid = READ_ONCE(sk->sk_priority);
if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
goto errout;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4ab877cf6d35..89e62ed08dad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -544,7 +544,7 @@ EXPORT_SYMBOL(__ip_queue_xmit);
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
- return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+ return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
}
EXPORT_SYMBOL(ip_queue_xmit);
@@ -1387,8 +1387,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
struct ip_options *opt = NULL;
struct rtable *rt = (struct rtable *)cork->dst;
struct iphdr *iph;
+ u8 pmtudisc, ttl;
__be16 df = 0;
- __u8 ttl;
skb = __skb_dequeue(queue);
if (!skb)
@@ -1418,8 +1418,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
/* DF bit is set when we want to see DF on outgoing frames.
* If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
- inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ pmtudisc = READ_ONCE(inet->pmtudisc);
+ if (pmtudisc == IP_PMTUDISC_DO ||
+ pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1430,14 +1431,14 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (cork->ttl != 0)
ttl = cork->ttl;
else if (rt->rt_type == RTN_MULTICAST)
- ttl = inet->mc_ttl;
+ ttl = READ_ONCE(inet->mc_ttl);
else
ttl = ip_select_ttl(inet, &rt->dst);
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+ iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
iph->frag_off = df;
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
@@ -1449,7 +1450,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_options_build(skb, opt, cork->addr, rt);
}
- skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
+ skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
skb->mark = cork->mark;
skb->tstamp = cork->transmit_time;
/*
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cce9cb25f3b3..0b74ac49d6a6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -585,25 +585,20 @@ out:
return err;
}
-void __ip_sock_set_tos(struct sock *sk, int val)
+void ip_sock_set_tos(struct sock *sk, int val)
{
+ u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
+
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
- val |= inet_sk(sk)->tos & INET_ECN_MASK;
+ val |= old_tos & INET_ECN_MASK;
}
- if (inet_sk(sk)->tos != val) {
- inet_sk(sk)->tos = val;
+ if (old_tos != val) {
+ WRITE_ONCE(inet_sk(sk)->tos, val);
WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
sk_dst_reset(sk);
}
}
-
-void ip_sock_set_tos(struct sock *sk, int val)
-{
- lock_sock(sk);
- __ip_sock_set_tos(sk, val);
- release_sock(sk);
-}
EXPORT_SYMBOL(ip_sock_set_tos);
void ip_sock_set_freebind(struct sock *sk)
@@ -622,9 +617,7 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val)
{
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
return -EINVAL;
- lock_sock(sk);
- inet_sk(sk)->pmtudisc = val;
- release_sock(sk);
+ WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
return 0;
}
EXPORT_SYMBOL(ip_sock_set_mtu_discover);
@@ -1039,6 +1032,22 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(inet->min_ttl, val);
return 0;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ return -EINVAL;
+ if (optlen < 1)
+ return -EINVAL;
+ if (val == -1)
+ val = 1;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ WRITE_ONCE(inet->mc_ttl, val);
+ return 0;
+ case IP_MTU_DISCOVER:
+ return ip_sock_set_mtu_discover(sk, val);
+ case IP_TOS: /* This sets both TOS and Precedence */
+ ip_sock_set_tos(sk, val);
+ return 0;
}
err = 0;
@@ -1093,25 +1102,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
}
}
break;
- case IP_TOS: /* This sets both TOS and Precedence */
- __ip_sock_set_tos(sk, val);
- break;
- case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
- goto e_inval;
- inet->pmtudisc = val;
- break;
- case IP_MULTICAST_TTL:
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- if (optlen < 1)
- goto e_inval;
- if (val == -1)
- val = 1;
- if (val < 0 || val > 255)
- goto e_inval;
- inet->mc_ttl = val;
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -1123,7 +1113,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
ifindex = (__force int)ntohl((__force __be32)val);
if (ifindex == 0) {
- inet->uc_index = 0;
+ WRITE_ONCE(inet->uc_index, 0);
err = 0;
break;
}
@@ -1140,7 +1130,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
break;
- inet->uc_index = ifindex;
+ WRITE_ONCE(inet->uc_index, ifindex);
err = 0;
break;
}
@@ -1178,8 +1168,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (!mreq.imr_ifindex) {
if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
- inet->mc_index = 0;
- inet->mc_addr = 0;
+ WRITE_ONCE(inet->mc_index, 0);
+ WRITE_ONCE(inet->mc_addr, 0);
err = 0;
break;
}
@@ -1204,8 +1194,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
midx != sk->sk_bound_dev_if)
break;
- inet->mc_index = mreq.imr_ifindex;
- inet->mc_addr = mreq.imr_address.s_addr;
+ WRITE_ONCE(inet->mc_index, mreq.imr_ifindex);
+ WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr);
err = 0;
break;
}
@@ -1592,27 +1582,29 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MINTTL:
val = READ_ONCE(inet->min_ttl);
goto copyval;
- }
-
- if (needs_rtnl)
- rtnl_lock();
- sockopt_lock_sock(sk);
-
- switch (optname) {
+ case IP_MULTICAST_TTL:
+ val = READ_ONCE(inet->mc_ttl);
+ goto copyval;
+ case IP_MTU_DISCOVER:
+ val = READ_ONCE(inet->pmtudisc);
+ goto copyval;
+ case IP_TOS:
+ val = READ_ONCE(inet->tos);
+ goto copyval;
case IP_OPTIONS:
{
unsigned char optbuf[sizeof(struct ip_options)+40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct ip_options_rcu *inet_opt;
- inet_opt = rcu_dereference_protected(inet->inet_opt,
- lockdep_sock_is_held(sk));
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
opt->optlen = 0;
if (inet_opt)
memcpy(optbuf, &inet_opt->opt,
sizeof(struct ip_options) +
inet_opt->opt.optlen);
- sockopt_release_sock(sk);
+ rcu_read_unlock();
if (opt->optlen == 0) {
len = 0;
@@ -1628,12 +1620,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
- case IP_TOS:
- val = inet->tos;
- break;
- case IP_MTU_DISCOVER:
- val = inet->pmtudisc;
- break;
case IP_MTU:
{
struct dst_entry *dst;
@@ -1643,24 +1629,55 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = dst_mtu(dst);
dst_release(dst);
}
- if (!val) {
- sockopt_release_sock(sk);
+ if (!val)
return -ENOTCONN;
+ goto copyval;
+ }
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
}
- break;
+ msg.msg_controllen = len;
+ msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
+
+ if (inet_test_bit(PKTINFO, sk)) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_ifindex = READ_ONCE(inet->mc_index);
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet_test_bit(TTL, sk)) {
+ int hlim = READ_ONCE(inet->mc_ttl);
+
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ if (inet_test_bit(TOS, sk)) {
+ int tos = READ_ONCE(inet->rcv_tos);
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
+ len -= msg.msg_controllen;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
- case IP_MULTICAST_TTL:
- val = inet->mc_ttl;
- break;
case IP_UNICAST_IF:
- val = (__force int)htonl((__u32) inet->uc_index);
- break;
+ val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
+ goto copyval;
case IP_MULTICAST_IF:
{
struct in_addr addr;
len = min_t(unsigned int, len, sizeof(struct in_addr));
- addr.s_addr = inet->mc_addr;
- sockopt_release_sock(sk);
+ addr.s_addr = READ_ONCE(inet->mc_addr);
if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
@@ -1668,6 +1685,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
+ }
+
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
case IP_MSFILTER:
{
struct ip_msfilter msf;
@@ -1690,44 +1714,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
else
err = ip_get_mcast_msfilter(sk, optval, optlen, len);
goto out;
- case IP_PKTOPTIONS:
- {
- struct msghdr msg;
-
- sockopt_release_sock(sk);
-
- if (sk->sk_type != SOCK_STREAM)
- return -ENOPROTOOPT;
-
- if (optval.is_kernel) {
- msg.msg_control_is_user = false;
- msg.msg_control = optval.kernel;
- } else {
- msg.msg_control_is_user = true;
- msg.msg_control_user = optval.user;
- }
- msg.msg_controllen = len;
- msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
-
- if (inet_test_bit(PKTINFO, sk)) {
- struct in_pktinfo info;
-
- info.ipi_addr.s_addr = inet->inet_rcv_saddr;
- info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
- info.ipi_ifindex = inet->mc_index;
- put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
- }
- if (inet_test_bit(TTL, sk)) {
- int hlim = inet->mc_ttl;
- put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
- }
- if (inet_test_bit(TOS, sk)) {
- int tos = inet->rcv_tos;
- put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
- }
- len -= msg.msg_controllen;
- return copy_to_sockptr(optlen, &len, sizeof(int));
- }
case IP_LOCAL_PORT_RANGE:
val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
break;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 75e0aee35eb7..2c61f444e1c7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -551,7 +551,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
@@ -581,7 +581,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
* 4.1.3.3.
*/
if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
- (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+ (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else {
@@ -773,11 +773,11 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
- ipc.oif = inet->mc_index;
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!ipc.oif)
- ipc.oif = inet->uc_index;
+ ipc.oif = READ_ONCE(inet->uc_index);
flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
@@ -899,7 +899,6 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
#if IS_ENABLED(CONFIG_IPV6)
} else if (family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *ip6 = ipv6_hdr(skb);
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
@@ -908,7 +907,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
sin6->sin6_port = 0;
sin6->sin6_addr = ip6->saddr;
sin6->sin6_flowinfo = 0;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
sin6->sin6_flowinfo = ip6_flowinfo(ip6);
sin6->sin6_scope_id =
ipv6_iface_scope_id(&sin6->sin6_addr,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b5db5d1edc2..27da9d7294c0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -239,7 +239,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
if (code > NR_ICMP_UNREACH)
break;
if (code == ICMP_FRAG_NEEDED) {
- harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
err = EMSGSIZE;
} else {
err = icmp_err_convert[code].errno;
@@ -482,7 +482,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int free = 0;
__be32 daddr;
__be32 saddr;
- int err;
+ int uc_index, err;
struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
int hdrincl;
@@ -576,24 +576,25 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
tos = get_rttos(&ipc, inet);
scope = ip_sendmsg_scope(inet, &ipc, msg);
+ uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
- ipc.oif = inet->mc_index;
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!ipc.oif) {
- ipc.oif = inet->uc_index;
- } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
/* oif is set, packet is to local broadcast
* and uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
- if (ipc.oif != inet->uc_index &&
+ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
- inet->uc_index)) {
- ipc.oif = inet->uc_index;
+ uc_index)) {
+ ipc.oif = uc_index;
}
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b214b5a2e045..e2bf4602b559 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1632,7 +1632,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
{
struct rtable *rt;
- rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
(noxfrm ? DST_NOXFRM : 0));
if (rt) {
@@ -1660,7 +1660,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
struct rtable *new_rt;
- new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
rt->dst.flags);
if (new_rt) {
@@ -2834,7 +2834,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
struct rtable *ort = (struct rtable *) dst_orig;
struct rtable *rt;
- rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
if (rt) {
struct dst_entry *new = &rt->dst;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6ac890b4073f..e7f024d93572 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1367,6 +1367,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
},
{
+ .procname = "tcp_backlog_ack_defer",
+ .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "tcp_reflect_tos",
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
.maxlen = sizeof(u8),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3f66cdeef7de..9a8b134d8ada 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3756,7 +3756,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
- info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+ info->tcpi_ato = jiffies_to_usecs(min(icsk->icsk_ack.ato,
+ tcp_delack_max(sk)));
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
@@ -3812,6 +3813,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rcv_wnd = tp->rcv_wnd;
info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
+
+ info->tcpi_total_rto = tp->total_rto;
+ info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
+ info->tcpi_total_rto_time = tp->total_rto_time;
+ if (tp->rto_stamp) {
+ info->tcpi_total_rto_time += tcp_time_stamp_raw() -
+ tp->rto_stamp;
+ }
+
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 146792cd26fe..22358032dd48 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -258,7 +258,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
u64 rate = bw;
rate = bbr_rate_bytes_per_sec(sk, rate, gain);
- rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+ rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
return rate;
}
@@ -278,7 +278,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
}
bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
do_div(bw, rtt_us);
- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+ WRITE_ONCE(sk->sk_pacing_rate,
+ bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
}
/* Pace using current bw estimate and a gain factor. */
@@ -290,14 +291,14 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk);
- if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
- sk->sk_pacing_rate = rate;
+ if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate))
+ WRITE_ONCE(sk->sk_pacing_rate, rate);
}
/* override sysctl_tcp_min_tso_segs */
__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
{
- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+ return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
static u32 bbr_tso_segs_goal(struct sock *sk)
@@ -309,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
* driver provided sk_gso_max_size.
*/
bytes = min_t(unsigned long,
- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
+ READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8afb0950a697..4b8f2e74d71d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -940,8 +940,8 @@ static void tcp_update_pacing_rate(struct sock *sk)
* without any lock. We want to make sure compiler wont store
* intermediate values in this location.
*/
- WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
- sk->sk_max_pacing_rate));
+ WRITE_ONCE(sk->sk_pacing_rate,
+ min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -2101,6 +2101,10 @@ void tcp_clear_retrans(struct tcp_sock *tp)
tp->undo_marker = 0;
tp->undo_retrans = -1;
tp->sacked_out = 0;
+ tp->rto_stamp = 0;
+ tp->total_rto = 0;
+ tp->total_rto_recoveries = 0;
+ tp->total_rto_time = 0;
}
static inline void tcp_init_undo(struct tcp_sock *tp)
@@ -2838,6 +2842,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
+static void tcp_update_rto_time(struct tcp_sock *tp)
+{
+ if (tp->rto_stamp) {
+ tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp;
+ tp->rto_stamp = 0;
+ }
+}
+
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
@@ -3042,6 +3054,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, num_dupack, rexmit);
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_rto_time(tp);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
@@ -5566,6 +5580,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+ /* If we are running from __release_sock() in user context,
+		 * defer the ack until tcp_release_cb().
+ */
+ if (sock_owned_by_user_nocheck(sk) &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+ return;
+ }
send_now:
tcp_send_ack(sk);
return;
@@ -6449,22 +6471,24 @@ reset_and_undo:
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct request_sock *req;
/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
* undo. If peer SACKs triggered fast recovery, we can't undo here.
*/
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
- tcp_try_undo_loss(sk, false);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
+ tcp_try_undo_recovery(sk);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
- tcp_sk(sk)->retrans_stamp = 0;
+ tcp_update_rto_time(tp);
+ tp->retrans_stamp = 0;
inet_csk(sk)->icsk_retransmits = 0;
/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
* we no longer need req so release it.
*/
- req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ req = rcu_dereference_protected(tp->fastopen_rsk,
lockdep_sock_is_held(sk));
reqsk_fastopen_remove(sk, req, false);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 27140e5cdc06..a441740616d7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -828,7 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_priority : sk->sk_priority;
+ inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
transmit_time = tcp_transmit_time(sk);
xfrm_sk_clone_policy(ctl_sk, sk);
txhash = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -1024,10 +1024,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
- (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
- (inet_sk(sk)->tos & INET_ECN_MASK) :
- inet_sk(sk)->tos;
+ tos = READ_ONCE(inet_sk(sk)->tos);
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (tos & INET_ECN_MASK);
if (!INET_ECN_is_capable(tos) &&
tcp_bpf_ca_needs_ecn((struct sock *)req))
@@ -3263,6 +3264,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+ net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c196759f1d3b..c2a925538542 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -470,11 +470,15 @@ void tcp_init_metrics(struct sock *sk)
u32 val, crtt = 0; /* cached RTT scaled by 8 */
sk_dst_confirm(sk);
+	/* ssthresh may have been reduced unnecessarily during
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
if (!dst)
goto reset;
rcu_read_lock();
- tm = tcp_get_metrics(sk, dst, true);
+ tm = tcp_get_metrics(sk, dst, false);
if (!tm) {
rcu_read_unlock();
goto reset;
@@ -489,11 +493,6 @@ void tcp_init_metrics(struct sock *sk)
tp->snd_ssthresh = val;
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
tp->snd_ssthresh = tp->snd_cwnd_clamp;
- } else {
- /* ssthresh may have been reduced unnecessarily during.
- * 3WHS. Restore it back to its initial default.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val)
@@ -899,22 +898,25 @@ static void tcp_metrics_flush_all(struct net *net)
unsigned int row;
for (row = 0; row < max_rows; row++, hb++) {
- struct tcp_metrics_block __rcu **pp;
+ struct tcp_metrics_block __rcu **pp = &hb->chain;
bool match;
+ if (!rcu_access_pointer(*pp))
+ continue;
+
spin_lock_bh(&tcp_metrics_lock);
- pp = &hb->chain;
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
match = net ? net_eq(tm_net(tm), net) :
!refcount_read(&tm_net(tm)->ns.count);
if (match) {
- *pp = tm->tcpm_next;
+ rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
} else {
pp = &tm->tcpm_next;
}
}
spin_unlock_bh(&tcp_metrics_lock);
+ cond_resched();
}
}
@@ -949,7 +951,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
if (addr_same(&tm->tcpm_daddr, &daddr) &&
(!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
net_eq(tm_net(tm), net)) {
- *pp = tm->tcpm_next;
+ rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
found = true;
} else {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b98d476f1594..3f87611077ef 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -292,7 +292,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
tw->tw_mark = sk->sk_mark;
- tw->tw_priority = sk->sk_priority;
+ tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -565,6 +565,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->undo_marker = treq->snt_isn;
newtp->retrans_stamp = div_u64(treq->snt_synack,
USEC_PER_SEC / TCP_TS_HZ);
+ newtp->total_rto = req->num_timeout;
+ newtp->total_rto_recoveries = 1;
+ newtp->total_rto_time = tcp_time_stamp_raw() -
+ newtp->retrans_stamp;
}
newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index aa0fc8c766e5..f207712eece1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1076,7 +1076,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
- TCPF_MTU_REDUCED_DEFERRED)
+ TCPF_MTU_REDUCED_DEFERRED | \
+ TCPF_ACK_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -1100,16 +1101,6 @@ void tcp_release_cb(struct sock *sk)
tcp_tsq_write(sk);
__sock_put(sk);
}
- /* Here begins the tricky part :
- * We are called from release_sock() with :
- * 1) BH disabled
- * 2) sk_lock.slock spinlock held
- * 3) socket owned by us (sk->sk_lock.owned == 1)
- *
- * But following code is meant to be called from BH handlers,
- * so we should keep BH disabled, but early release socket ownership
- */
- sock_release_ownership(sk);
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
@@ -1123,6 +1114,8 @@ void tcp_release_cb(struct sock *sk)
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
+ if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+ tcp_send_ack(sk);
}
EXPORT_SYMBOL(tcp_release_cb);
@@ -1207,7 +1200,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
if (sk->sk_pacing_status != SK_PACING_NONE) {
- unsigned long rate = sk->sk_pacing_rate;
+ unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
/* Original sch_fq does not pace first 10 MSS
* Note that tp->data_segs_out overflows after 2^32 packets,
@@ -1331,7 +1324,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+ skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
@@ -1979,7 +1972,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
unsigned long bytes;
u32 r;
- bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
@@ -2559,7 +2552,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
+ READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit,
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
@@ -2567,7 +2560,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
tcp_sk(sk)->tcp_tx_delay) {
- u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+ u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
+ tcp_sk(sk)->tcp_tx_delay;
/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
* approximate our needs assuming an ~100% skb->truesize overhead.
@@ -3983,6 +3977,20 @@ int tcp_connect(struct sock *sk)
}
EXPORT_SYMBOL(tcp_connect);
+u32 tcp_delack_max(const struct sock *sk)
+{
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ u32 delack_max = inet_csk(sk)->icsk_delack_max;
+
+ if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
+ u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
+ u32 delack_from_rto_min = max_t(int, 1, rto_min - 1);
+
+ delack_max = min_t(u32, delack_max, delack_from_rto_min);
+ }
+ return delack_max;
+}
+
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
@@ -4018,7 +4026,7 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
- ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+ ato = min_t(u32, ato, tcp_delack_max(sk));
/* Stay within the limit we were given */
timeout = jiffies + ato;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 984ab4a0421e..3f61c6a70a1f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -394,7 +394,7 @@ static void tcp_probe_timer(struct sock *sk)
if (user_timeout &&
(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
msecs_to_jiffies(user_timeout))
- goto abort;
+ goto abort;
}
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
@@ -415,6 +415,19 @@ abort: tcp_write_err(sk);
}
}
+static void tcp_update_rto_stats(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!icsk->icsk_retransmits) {
+ tp->total_rto_recoveries++;
+ tp->rto_stamp = tcp_time_stamp(tp);
+ }
+ icsk->icsk_retransmits++;
+ tp->total_rto++;
+}
+
/*
* Timer for Fast Open socket to retransmit SYNACK. Note that the
* sk here is the child socket, not the parent (listener) socket.
@@ -447,7 +460,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
- icsk->icsk_retransmits++;
+ tcp_update_rto_stats(sk);
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_time_stamp(tp);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -575,7 +588,7 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_enter_loss(sk);
- icsk->icsk_retransmits++;
+ tcp_update_rto_stats(sk);
if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* Let senders fight for local resources conservatively.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f39b9c844580..7f7724beca33 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -714,7 +714,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex,
inet_sdif(skb), udptable, NULL);
- if (!sk || udp_sk(sk)->encap_type) {
+ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
/* No socket for error: try tunnels before discarding */
if (static_branch_unlikely(&udp_encap_needed_key)) {
sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
@@ -750,7 +750,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
@@ -1051,10 +1051,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
u8 tos, scope;
__be16 dport;
int err, is_udplite = IS_UDPLITE(sk);
- int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
+ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
struct ip_options_data opt_copy;
+ int uc_index;
if (len > 0xFFFF)
return -EMSGSIZE;
@@ -1173,25 +1174,26 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (scope == RT_SCOPE_LINK)
connected = 0;
+ uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
- ipc.oif = inet->mc_index;
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
connected = 0;
} else if (!ipc.oif) {
- ipc.oif = inet->uc_index;
- } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
/* oif is set, packet is to local broadcast and
* uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
- if (ipc.oif != inet->uc_index &&
+ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
- inet->uc_index)) {
- ipc.oif = inet->uc_index;
+ uc_index)) {
+ ipc.oif = uc_index;
}
}
@@ -1315,11 +1317,11 @@ void udp_splice_eof(struct socket *sock)
struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
- if (!up->pending || READ_ONCE(up->corkflag))
+ if (!up->pending || udp_test_bit(CORK, sk))
return;
lock_sock(sk);
- if (up->pending && !READ_ONCE(up->corkflag))
+ if (up->pending && !udp_test_bit(CORK, sk))
udp_push_pending_frames(sk);
release_sock(sk);
}
@@ -1868,7 +1870,7 @@ try_again:
(struct sockaddr *)sin);
}
- if (udp_sk(sk)->gro_enabled)
+ if (udp_test_bit(GRO_ENABLED, sk))
udp_cmsg_recv(msg, sk, skb);
if (inet_cmsg_flags(inet))
@@ -2081,7 +2083,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
}
nf_reset_ct(skb);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
+ if (static_branch_unlikely(&udp_encap_needed_key) &&
+ READ_ONCE(up->encap_type)) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -2119,7 +2122,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ u16 pcrlen = READ_ONCE(up->pcrlen);
/*
* MIB statistics other than incrementing the error count are
@@ -2132,7 +2136,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
* delivery of packets with coverage values less than a value
* provided by the application."
*/
- if (up->pcrlen == 0) { /* full coverage was set */
+ if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
@@ -2143,9 +2147,9 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
* that it wants x while sender emits packets of smaller size y.
* Therefore the above ...()->partial_cov statement is essential.
*/
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ UDP_SKB_CB(skb)->cscov, pcrlen);
goto drop;
}
}
@@ -2618,7 +2622,7 @@ void udp_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
- if (up->encap_enabled)
+ if (udp_test_bit(ENCAP_ENABLED, sk))
static_branch_dec(&udp_encap_needed_key);
}
}
@@ -2658,9 +2662,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
if (val != 0) {
- WRITE_ONCE(up->corkflag, 1);
+ udp_set_bit(CORK, sk);
} else {
- WRITE_ONCE(up->corkflag, 0);
+ udp_clear_bit(CORK, sk);
lock_sock(sk);
push_pending_frames(sk);
release_sock(sk);
@@ -2675,17 +2679,17 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_ENCAP_ESPINUDP_NON_IKE:
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
- up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv;
+ WRITE_ONCE(up->encap_rcv,
+ ipv6_stub->xfrm6_udp_encap_rcv);
else
#endif
- up->encap_rcv = xfrm4_udp_encap_rcv;
+ WRITE_ONCE(up->encap_rcv,
+ xfrm4_udp_encap_rcv);
#endif
fallthrough;
case UDP_ENCAP_L2TPINUDP:
- up->encap_type = val;
- lock_sock(sk);
- udp_tunnel_encap_enable(sk->sk_socket);
- release_sock(sk);
+ WRITE_ONCE(up->encap_type, val);
+ udp_tunnel_encap_enable(sk);
break;
default:
err = -ENOPROTOOPT;
@@ -2694,11 +2698,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_NO_CHECK6_TX:
- up->no_check6_tx = valbool;
+ udp_set_no_check6_tx(sk, valbool);
break;
case UDP_NO_CHECK6_RX:
- up->no_check6_rx = valbool;
+ udp_set_no_check6_rx(sk, valbool);
break;
case UDP_SEGMENT:
@@ -2708,14 +2712,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_GRO:
- lock_sock(sk);
/* when enabling GRO, accept the related GSO packet type */
if (valbool)
- udp_tunnel_encap_enable(sk->sk_socket);
- up->gro_enabled = valbool;
- up->accept_udp_l4 = valbool;
- release_sock(sk);
+ udp_tunnel_encap_enable(sk);
+ udp_assign_bit(GRO_ENABLED, sk, valbool);
+ udp_assign_bit(ACCEPT_L4, sk, valbool);
break;
/*
@@ -2730,8 +2732,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
val = 8;
else if (val > USHRT_MAX)
val = USHRT_MAX;
- up->pcslen = val;
- up->pcflag |= UDPLITE_SEND_CC;
+ WRITE_ONCE(up->pcslen, val);
+ udp_set_bit(UDPLITE_SEND_CC, sk);
break;
/* The receiver specifies a minimum checksum coverage value. To make
@@ -2744,8 +2746,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
val = 8;
else if (val > USHRT_MAX)
val = USHRT_MAX;
- up->pcrlen = val;
- up->pcflag |= UDPLITE_RECV_CC;
+ WRITE_ONCE(up->pcrlen, val);
+ udp_set_bit(UDPLITE_RECV_CC, sk);
break;
default:
@@ -2783,19 +2785,19 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
- val = READ_ONCE(up->corkflag);
+ val = udp_test_bit(CORK, sk);
break;
case UDP_ENCAP:
- val = up->encap_type;
+ val = READ_ONCE(up->encap_type);
break;
case UDP_NO_CHECK6_TX:
- val = up->no_check6_tx;
+ val = udp_get_no_check6_tx(sk);
break;
case UDP_NO_CHECK6_RX:
- val = up->no_check6_rx;
+ val = udp_get_no_check6_rx(sk);
break;
case UDP_SEGMENT:
@@ -2803,17 +2805,17 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
break;
case UDP_GRO:
- val = up->gro_enabled;
+ val = udp_test_bit(GRO_ENABLED, sk);
break;
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
- val = up->pcslen;
+ val = READ_ONCE(up->pcslen);
break;
case UDPLITE_RECV_CSCOV:
- val = up->pcrlen;
+ val = READ_ONCE(up->pcrlen);
break;
default:
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0f46b3c2e4ac..6c95d28d0c4a 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -557,10 +557,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
NAPI_GRO_CB(skb)->is_flist = 0;
if (!sk || !udp_sk(sk)->gro_receive) {
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
- NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
+ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
- (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
+ (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
return call_gro_receive(udp_gro_receive_segment, head, skb);
/* no GRO, be sure flush the current packet */
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 9b18f371af0d..1e7e4aecdc48 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -78,7 +78,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
- udp_tunnel_encap_enable(sock);
+ udp_tunnel_encap_enable(sk);
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index 029219749785..b6d2d16189c0 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -47,7 +47,7 @@ struct udp_tunnel_nic {
unsigned int n_tables;
unsigned long missed;
- struct udp_tunnel_nic_table_entry **entries;
+ struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables);
};
/* We ensure all work structs are done using driver state, but not the code.
@@ -725,16 +725,12 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
struct udp_tunnel_nic *utn;
unsigned int i;
- utn = kzalloc(sizeof(*utn), GFP_KERNEL);
+ utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL);
if (!utn)
return NULL;
utn->n_tables = n_tables;
INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work);
- utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL);
- if (!utn->entries)
- goto err_free_utn;
-
for (i = 0; i < n_tables; i++) {
utn->entries[i] = kcalloc(info->tables[i].n_entries,
sizeof(*utn->entries[i]), GFP_KERNEL);
@@ -747,8 +743,6 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
err_free_prev_entries:
while (i--)
kfree(utn->entries[i]);
- kfree(utn->entries);
-err_free_utn:
kfree(utn);
return NULL;
}
@@ -759,7 +753,6 @@ static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn)
for (i = 0; i < utn->n_tables; i++)
kfree(utn->entries[i]);
- kfree(utn->entries);
kfree(utn);
}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 39ecdad1b50c..af37af3ab727 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -21,7 +21,6 @@ EXPORT_SYMBOL(udplite_table);
static int udplite_sk_init(struct sock *sk)
{
udp_init_sock(sk);
- udp_sk(sk)->pcflag = UDPLITE_BIT;
pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
"please contact the netdev mailing list\n");
return 0;
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index eac206a290d0..183f6dc37242 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -85,11 +85,11 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
struct udphdr *uh;
struct iphdr *iph;
int iphlen, len;
-
__u8 *udpdata;
__be32 *udpdata32;
- __u16 encap_type = up->encap_type;
+ u16 encap_type;
+ encap_type = READ_ONCE(up->encap_type);
/* if this is not encapsulated socket, then just return now */
if (!encap_type)
return 1;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0b6ee962c84e..c2d471ad7922 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.ioam6_id = IOAM6_DEFAULT_IF_ID,
.ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
.ndisc_evict_nocarrier = 1,
+ .ra_honor_pio_life = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -297,6 +298,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.ioam6_id = IOAM6_DEFAULT_IF_ID,
.ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
.ndisc_evict_nocarrier = 1,
+ .ra_honor_pio_life = 0,
};
/* Check if link is ready: is it up and is a valid qdisc available */
@@ -2657,22 +2659,23 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
else
stored_lft = 0;
- if (!create && stored_lft) {
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = !create && stored_lft;
+
+ if (update_lft && !in6_dev->cnf.ra_honor_pio_life) {
const u32 minimum_lft = min_t(u32,
stored_lft, MIN_VALID_LIFETIME);
valid_lft = max(valid_lft, minimum_lft);
-
- /* RFC4862 Section 5.5.3e:
- * "Note that the preferred lifetime of the
- * corresponding address is always reset to
- * the Preferred Lifetime in the received
- * Prefix Information option, regardless of
- * whether the valid lifetime is also reset or
- * ignored."
- *
- * So we should always update prefered_lft here.
- */
- update_lft = 1;
}
if (update_lft) {
@@ -6846,6 +6849,15 @@ static const struct ctl_table addrconf_sysctl[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "ra_honor_pio_life",
+ .data = &ipv6_devconf.ra_honor_pio_life,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
#ifdef CONFIG_IPV6_ROUTER_PREF
{
.procname = "accept_ra_rtr_pref",
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 368824fe9719..c6ad0d6e99b5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -217,10 +217,11 @@ lookup_protocol:
inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
np->hop_limit = -1;
np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
- np->mc_loop = 1;
- np->mc_all = 1;
+ inet6_set_bit(MC6_LOOP, sk);
+ inet6_set_bit(MC6_ALL, sk);
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
+ inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
+ FLOWLABEL_REFLECT_ESTABLISHED);
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
@@ -536,7 +537,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
}
sin->sin6_port = inet->inet_dport;
sin->sin6_addr = sk->sk_v6_daddr;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
sin->sin6_flowinfo = np->flow_label;
BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
CGROUP_INET6_GETPEERNAME);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 41ebc4e57473..cc6a502db39d 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -80,7 +80,8 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
struct flowi6 fl6;
int err = 0;
- if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+ if (inet6_test_bit(SNDFLOW, sk) &&
+ (np->flow_label & IPV6_FLOWLABEL_MASK)) {
flowlabel = fl6_sock_lookup(sk, np->flow_label);
if (IS_ERR(flowlabel))
return -EINVAL;
@@ -163,7 +164,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
if (ipv6_addr_any(&usin->sin6_addr)) {
@@ -305,11 +306,10 @@ static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb,
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
struct icmp6hdr *icmph = icmp6_hdr(skb);
struct sock_exterr_skb *serr;
- if (!np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk))
return;
skb = skb_clone(skb, GFP_ATOMIC);
@@ -332,7 +332,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__skb_pull(skb, payload - skb->data);
- if (inet6_sk(sk)->recverr_rfc4884)
+ if (inet6_test_bit(RECVERR6_RFC4884, sk))
ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
skb_reset_transport_header(skb);
@@ -344,12 +344,11 @@ EXPORT_SYMBOL_GPL(ipv6_icmp_error);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
- const struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
struct ipv6hdr *iph;
struct sk_buff *skb;
- if (!np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk))
return;
skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
@@ -493,7 +492,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
struct ipv6hdr, daddr);
sin->sin6_addr = ip6h->daddr;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
sin->sin6_flowinfo = ip6_flowinfo(ip6h);
sin->sin6_scope_id =
ipv6_iface_scope_id(&sin->sin6_addr,
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 93a594a901d1..8fb4a791881a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -588,7 +588,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- ipcm6_init_sk(&ipc6, np);
+ ipcm6_init_sk(&ipc6, sk);
ipc6.sockc.mark = mark;
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
@@ -791,7 +791,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
msg.offset = 0;
msg.type = type;
- ipcm6_init_sk(&ipc6, np);
+ ipcm6_init_sk(&ipc6, sk);
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
ipc6.sockc.mark = mark;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 0c50dcd35fe8..80043e46117c 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
fl6.daddr = sk->sk_v6_daddr;
res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass, sk->sk_priority);
+ np->tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
return res;
}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b3ca4beb4405..eca07e10e21f 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -513,7 +513,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
return 0;
}
- if (np->repflow) {
+ if (inet6_test_bit(REPFLOW, sk)) {
freq->flr_label = np->flow_label;
return 0;
}
@@ -551,10 +551,10 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
if (freq->flr_flags & IPV6_FL_F_REFLECT) {
if (sk->sk_protocol != IPPROTO_TCP)
return -ENOPROTOOPT;
- if (!np->repflow)
+ if (!inet6_test_bit(REPFLOW, sk))
return -ESRCH;
np->flow_label = 0;
- np->repflow = 0;
+ inet6_clear_bit(REPFLOW, sk);
return 0;
}
@@ -626,7 +626,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
if (sk->sk_protocol != IPPROTO_TCP)
return -ENOPROTOOPT;
- np->repflow = 1;
+ inet6_set_bit(REPFLOW, sk);
return 0;
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 54fc4c711f2c..cdaa9275e990 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -232,12 +232,11 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(ip6_output);
-bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
- if (!np->autoflowlabel_set)
+ if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
return ip6_default_np_autolabel(net);
- else
- return np->autoflowlabel;
+ return inet6_test_bit(AUTOFLOWLABEL, sk);
}
/*
@@ -309,12 +308,12 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
* Fill in the IPv6 header
*/
if (np)
- hlimit = np->hop_limit;
+ hlimit = READ_ONCE(np->hop_limit);
if (hlimit < 0)
hlimit = ip6_dst_hoplimit(dst);
ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
- ip6_autoflowlabel(net, np), fl6));
+ ip6_autoflowlabel(net, sk), fl6));
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
@@ -369,9 +368,8 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
if (sk && ra->sel == sel &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == skb->dev->ifindex)) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- if (np && np->rtalert_isolate &&
+ if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
!net_eq(sock_net(sk), dev_net(skb->dev))) {
continue;
}
@@ -881,9 +879,11 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
mtu = IPV6_MIN_MTU;
}
- if (np && np->frag_size < mtu) {
- if (np->frag_size)
- mtu = np->frag_size;
+ if (np) {
+ u32 frag_size = READ_ONCE(np->frag_size);
+
+ if (frag_size && frag_size < mtu)
+ mtu = frag_size;
}
if (mtu < hlen + sizeof(struct frag_hdr) + 8)
goto fail_toobig;
@@ -1113,7 +1113,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
rcu_read_lock();
from = rt ? rcu_dereference(rt->from) : NULL;
err = ip6_route_get_saddr(net, from, &fl6->daddr,
- sk ? inet6_sk(sk)->srcprefs : 0,
+ sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
&fl6->saddr);
rcu_read_unlock();
@@ -1392,7 +1392,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
struct rt6_info *rt)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- unsigned int mtu;
+ unsigned int mtu, frag_size;
struct ipv6_txoptions *nopt, *opt = ipc6->opt;
/* callers pass dst together with a reference, set it first so
@@ -1436,15 +1436,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
v6_cork->hop_limit = ipc6->hlimit;
v6_cork->tclass = ipc6->tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
- mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
else
- mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
- if (np->frag_size < mtu) {
- if (np->frag_size)
- mtu = np->frag_size;
- }
+
+ frag_size = READ_ONCE(np->frag_size);
+ if (frag_size && frag_size < mtu)
+ mtu = frag_size;
+
cork->base.fragsize = mtu;
cork->base.gso_size = ipc6->gso_size;
cork->base.tx_flags = 0;
@@ -1935,7 +1936,6 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct in6_addr *final_dst;
- struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1978,13 +1978,13 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ip6_flow_hdr(hdr, v6_cork->tclass,
ip6_make_flowlabel(net, skb, fl6->flowlabel,
- ip6_autoflowlabel(net, np), fl6));
+ ip6_autoflowlabel(net, sk), fl6));
hdr->hop_limit = v6_cork->hop_limit;
hdr->nexthdr = proto;
hdr->saddr = fl6->saddr;
hdr->daddr = *final_dst;
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
skb->mark = cork->base.mark;
skb->tstamp = cork->base.transmit_time;
@@ -2091,7 +2091,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
return ERR_PTR(err);
}
if (ipc6->dontfrag < 0)
- ipc6->dontfrag = inet6_sk(sk)->dontfrag;
+ ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
err = __ip6_append_data(sk, &queue, cork, &v6_cork,
&current->task_frag, getfrag, from,
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index cdc4d4ee2420..70d38705c92f 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -75,8 +75,9 @@ EXPORT_SYMBOL_GPL(udp_sock_create6);
int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb,
- struct net_device *dev, struct in6_addr *saddr,
- struct in6_addr *daddr,
+ struct net_device *dev,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
__u8 prio, __u8 ttl, __be32 label,
__be16 src_port, __be16 dst_port, bool nocheck)
{
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 0e2a0847b387..7d661735cb9d 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -415,6 +415,101 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (ip6_mroute_opt(optname))
return ip6_mroute_setsockopt(sk, optname, optval, optlen);
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case IPV6_UNICAST_HOPS:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val > 255 || val < -1)
+ return -EINVAL;
+ WRITE_ONCE(np->hop_limit, val);
+ return 0;
+ case IPV6_MULTICAST_LOOP:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val != valbool)
+ return -EINVAL;
+ inet6_assign_bit(MC6_LOOP, sk, valbool);
+ return 0;
+ case IPV6_MULTICAST_HOPS:
+ if (sk->sk_type == SOCK_STREAM)
+ return retv;
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val > 255 || val < -1)
+ return -EINVAL;
+ WRITE_ONCE(np->mcast_hops,
+ val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
+ return 0;
+ case IPV6_MTU:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val && val < IPV6_MIN_MTU)
+ return -EINVAL;
+ WRITE_ONCE(np->frag_size, val);
+ return 0;
+ case IPV6_MINHOPCOUNT:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&ip6_min_hopcount);
+
+ /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
+ * while we are changing it.
+ */
+ WRITE_ONCE(np->min_hopcount, val);
+ return 0;
+ case IPV6_RECVERR_RFC4884:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+ inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
+ return 0;
+ case IPV6_MULTICAST_ALL:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(MC6_ALL, sk, valbool);
+ return 0;
+ case IPV6_AUTOFLOWLABEL:
+ inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
+ inet6_set_bit(AUTOFLOWLABEL_SET, sk);
+ return 0;
+ case IPV6_DONTFRAG:
+ inet6_assign_bit(DONTFRAG, sk, valbool);
+ return 0;
+ case IPV6_RECVERR:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(RECVERR6, sk, valbool);
+ if (!val)
+ skb_errqueue_purge(&sk->sk_error_queue);
+ return 0;
+ case IPV6_ROUTER_ALERT_ISOLATE:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
+ return 0;
+ case IPV6_MTU_DISCOVER:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
+ return -EINVAL;
+ WRITE_ONCE(np->pmtudisc, val);
+ return 0;
+ case IPV6_FLOWINFO_SEND:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(SNDFLOW, sk, valbool);
+ return 0;
+ case IPV6_ADDR_PREFERENCES:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ return ip6_sock_set_addr_preferences(sk, val);
+ }
if (needs_rtnl)
rtnl_lock();
sockopt_lock_sock(sk);
@@ -733,34 +828,7 @@ done:
}
break;
}
- case IPV6_UNICAST_HOPS:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val > 255 || val < -1)
- goto e_inval;
- np->hop_limit = val;
- retv = 0;
- break;
- case IPV6_MULTICAST_HOPS:
- if (sk->sk_type == SOCK_STREAM)
- break;
- if (optlen < sizeof(int))
- goto e_inval;
- if (val > 255 || val < -1)
- goto e_inval;
- np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
- retv = 0;
- break;
-
- case IPV6_MULTICAST_LOOP:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val != valbool)
- goto e_inval;
- np->mc_loop = valbool;
- retv = 0;
- break;
case IPV6_UNICAST_IF:
{
@@ -862,13 +930,6 @@ done:
retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
break;
}
- case IPV6_MULTICAST_ALL:
- if (optlen < sizeof(int))
- goto e_inval;
- np->mc_all = valbool;
- retv = 0;
- break;
-
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
if (in_compat_syscall())
@@ -896,42 +957,6 @@ done:
goto e_inval;
retv = ip6_ra_control(sk, val);
break;
- case IPV6_ROUTER_ALERT_ISOLATE:
- if (optlen < sizeof(int))
- goto e_inval;
- np->rtalert_isolate = valbool;
- retv = 0;
- break;
- case IPV6_MTU_DISCOVER:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
- goto e_inval;
- np->pmtudisc = val;
- retv = 0;
- break;
- case IPV6_MTU:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val && val < IPV6_MIN_MTU)
- goto e_inval;
- np->frag_size = val;
- retv = 0;
- break;
- case IPV6_RECVERR:
- if (optlen < sizeof(int))
- goto e_inval;
- np->recverr = valbool;
- if (!val)
- skb_errqueue_purge(&sk->sk_error_queue);
- retv = 0;
- break;
- case IPV6_FLOWINFO_SEND:
- if (optlen < sizeof(int))
- goto e_inval;
- np->sndflow = valbool;
- retv = 0;
- break;
case IPV6_FLOWLABEL_MGR:
retv = ipv6_flowlabel_opt(sk, optval, optlen);
break;
@@ -943,47 +968,10 @@ done:
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
- case IPV6_ADDR_PREFERENCES:
- if (optlen < sizeof(int))
- goto e_inval;
- retv = __ip6_sock_set_addr_preferences(sk, val);
- break;
- case IPV6_MINHOPCOUNT:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val < 0 || val > 255)
- goto e_inval;
-
- if (val)
- static_branch_enable(&ip6_min_hopcount);
-
- /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
- * while we are changing it.
- */
- WRITE_ONCE(np->min_hopcount, val);
- retv = 0;
- break;
- case IPV6_DONTFRAG:
- np->dontfrag = valbool;
- retv = 0;
- break;
- case IPV6_AUTOFLOWLABEL:
- np->autoflowlabel = valbool;
- np->autoflowlabel_set = 1;
- retv = 0;
- break;
case IPV6_RECVFRAGSIZE:
np->rxopt.bits.recvfragsize = valbool;
retv = 0;
break;
- case IPV6_RECVERR_RFC4884:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val < 0 || val > 1)
- goto e_inval;
- np->recverr_rfc4884 = valbool;
- retv = 0;
- break;
}
unlock:
@@ -1180,7 +1168,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxhlim) {
- int hlim = np->mcast_hops;
+ int hlim = READ_ONCE(np->mcast_hops);
+
put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.rxtclass) {
@@ -1197,7 +1186,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxohlim) {
- int hlim = np->mcast_hops;
+ int hlim = READ_ONCE(np->mcast_hops);
+
put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.rxflow) {
@@ -1347,9 +1337,9 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
struct dst_entry *dst;
if (optname == IPV6_UNICAST_HOPS)
- val = np->hop_limit;
+ val = READ_ONCE(np->hop_limit);
else
- val = np->mcast_hops;
+ val = READ_ONCE(np->mcast_hops);
if (val < 0) {
rcu_read_lock();
@@ -1365,7 +1355,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
}
case IPV6_MULTICAST_LOOP:
- val = np->mc_loop;
+ val = inet6_test_bit(MC6_LOOP, sk);
break;
case IPV6_MULTICAST_IF:
@@ -1373,7 +1363,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_MULTICAST_ALL:
- val = np->mc_all;
+ val = inet6_test_bit(MC6_ALL, sk);
break;
case IPV6_UNICAST_IF:
@@ -1381,15 +1371,15 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_MTU_DISCOVER:
- val = np->pmtudisc;
+ val = READ_ONCE(np->pmtudisc);
break;
case IPV6_RECVERR:
- val = np->recverr;
+ val = inet6_test_bit(RECVERR6, sk);
break;
case IPV6_FLOWINFO_SEND:
- val = np->sndflow;
+ val = inet6_test_bit(SNDFLOW, sk);
break;
case IPV6_FLOWLABEL_MGR:
@@ -1424,33 +1414,35 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
}
case IPV6_ADDR_PREFERENCES:
+ {
+ u8 srcprefs = READ_ONCE(np->srcprefs);
val = 0;
- if (np->srcprefs & IPV6_PREFER_SRC_TMP)
+ if (srcprefs & IPV6_PREFER_SRC_TMP)
val |= IPV6_PREFER_SRC_TMP;
- else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC)
+ else if (srcprefs & IPV6_PREFER_SRC_PUBLIC)
val |= IPV6_PREFER_SRC_PUBLIC;
else {
/* XXX: should we return system default? */
val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
}
- if (np->srcprefs & IPV6_PREFER_SRC_COA)
+ if (srcprefs & IPV6_PREFER_SRC_COA)
val |= IPV6_PREFER_SRC_COA;
else
val |= IPV6_PREFER_SRC_HOME;
break;
-
+ }
case IPV6_MINHOPCOUNT:
- val = np->min_hopcount;
+ val = READ_ONCE(np->min_hopcount);
break;
case IPV6_DONTFRAG:
- val = np->dontfrag;
+ val = inet6_test_bit(DONTFRAG, sk);
break;
case IPV6_AUTOFLOWLABEL:
- val = ip6_autoflowlabel(sock_net(sk), np);
+ val = ip6_autoflowlabel(sock_net(sk), sk);
break;
case IPV6_RECVFRAGSIZE:
@@ -1458,11 +1450,11 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_ROUTER_ALERT_ISOLATE:
- val = np->rtalert_isolate;
+ val = inet6_test_bit(RTALERT_ISOLATE, sk);
break;
case IPV6_RECVERR_RFC4884:
- val = np->recverr_rfc4884;
+ val = inet6_test_bit(RECVERR6_RFC4884, sk);
break;
default:
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 5ce25bcb9974..99e28b444a4c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -642,7 +642,7 @@ bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
}
if (!mc) {
rcu_read_unlock();
- return np->mc_all;
+ return inet6_test_bit(MC6_ALL, sk);
}
psl = rcu_dereference(mc->sflist);
if (!psl) {
@@ -1716,7 +1716,7 @@ static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb,
hdr->payload_len = htons(len);
hdr->nexthdr = proto;
- hdr->hop_limit = inet6_sk(sk)->hop_limit;
+ hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit);
hdr->saddr = *saddr;
hdr->daddr = *daddr;
@@ -3011,8 +3011,6 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
continue;
state->im = rcu_dereference(state->idev->mc_list);
}
- if (!state->im)
- break;
psf = rcu_dereference(state->im->mca_sources);
}
out:
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 553c8664e0a7..679443d7ecb5 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -500,7 +500,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
csum_partial(icmp6h,
skb->len, 0));
- ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len);
+ ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
rcu_read_lock();
idev = __in6_dev_get(dst->dev);
@@ -1996,7 +1996,7 @@ static int __net_init ndisc_net_init(struct net *net)
np = inet6_sk(sk);
np->hop_limit = 255;
/* Do not loopback ndisc messages */
- np->mc_loop = 0;
+ inet6_clear_bit(MC6_LOOP, sk);
return 0;
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 5831aaa53d75..e8fb0d275cc2 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -89,7 +89,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EAFNOSUPPORT;
}
daddr = &(u->sin6_addr);
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK;
if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
oif = u->sin6_scope_id;
@@ -118,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if))
return -EINVAL;
- ipcm6_init_sk(&ipc6, np);
+ ipcm6_init_sk(&ipc6, sk);
ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 42fcec3ecf5e..a2aa54a2baae 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -291,6 +291,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
+ bool recverr = inet6_test_bit(RECVERR6, sk);
struct ipv6_pinfo *np = inet6_sk(sk);
int err;
int harderr;
@@ -300,26 +301,26 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
2. Socket is connected (otherwise the error indication
is useless without recverr and error is hard.
*/
- if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+ if (!recverr && sk->sk_state != TCP_ESTABLISHED)
return;
harderr = icmpv6_err_convert(type, code, &err);
if (type == ICMPV6_PKT_TOOBIG) {
ip6_sk_update_pmtu(skb, sk, info);
- harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
+ harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO);
}
if (type == NDISC_REDIRECT) {
ip6_sk_redirect(skb, sk);
return;
}
- if (np->recverr) {
+ if (recverr) {
u8 *payload = skb->data;
if (!inet_test_bit(HDRINCL, sk))
payload += offset;
ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
}
- if (np->recverr || harderr) {
+ if (recverr || harderr) {
sk->sk_err = err;
sk_error_report(sk);
}
@@ -587,7 +588,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
struct flowi6 *fl6, struct dst_entry **dstp,
unsigned int flags, const struct sockcm_cookie *sockc)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
struct ipv6hdr *iph;
struct sk_buff *skb;
@@ -668,7 +668,7 @@ out:
error:
IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
error_check:
- if (err == -ENOBUFS && !np->recverr)
+ if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk))
err = 0;
return err;
}
@@ -795,7 +795,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EINVAL;
daddr = &sin6->sin6_addr;
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
@@ -898,7 +898,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
if (ipc6.dontfrag < 0)
- ipc6.dontfrag = np->dontfrag;
+ ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9c687b357e6a..b132feae3393 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -341,7 +341,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
- 1, DST_OBSOLETE_FORCE_CHK, flags);
+ DST_OBSOLETE_FORCE_CHK, flags);
if (rt) {
rt6_info_init(rt);
@@ -2622,7 +2622,7 @@ static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
if (!any_src)
flags |= RT6_LOOKUP_F_HAS_SADDR;
else if (sk)
- flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
+ flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
@@ -2655,7 +2655,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
struct net_device *loopback_dev = net->loopback_dev;
struct dst_entry *new = NULL;
- rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
+ rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
DST_OBSOLETE_DEAD, 0);
if (rt) {
rt6_info_init(rt);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 44b6949d72b2..bfe7d19ff4fd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -163,7 +163,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
memset(&fl6, 0, sizeof(fl6));
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
IP6_ECN_flow_init(fl6.flowlabel);
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
@@ -508,7 +508,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
tcp_ld_RTO_revert(sk, seq);
}
- if (!sock_owned_by_user(sk) && np->recverr) {
+ if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
WRITE_ONCE(sk->sk_err, err);
sk_error_report(sk);
} else {
@@ -548,7 +548,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
&ireq->ir_v6_rmt_addr);
fl6->daddr = ireq->ir_v6_rmt_addr;
- if (np->repflow && ireq->pktopts)
+ if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
@@ -565,7 +565,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!opt)
opt = rcu_dereference(np->opt);
err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark),
- opt, tclass, sk->sk_priority);
+ opt, tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -797,7 +797,7 @@ static void tcp_v6_init_req(struct request_sock *req,
(ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo ||
np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
- np->rxopt.bits.rxohlim || np->repflow)) {
+ np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) {
refcount_inc(&skb->users);
ireq->pktopts = skb;
}
@@ -1055,12 +1055,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk) {
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk)) {
- const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
-
trace_tcp_send_reset(sk, skb);
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
label = ip6_flowlabel(ipv6h);
- priority = sk->sk_priority;
+ priority = READ_ONCE(sk->sk_priority);
txhash = sk->sk_txhash;
}
if (sk->sk_state == TCP_TIME_WAIT) {
@@ -1247,7 +1245,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newnp->mcast_oif = inet_iif(skb);
newnp->mcast_hops = ip_hdr(skb)->ttl;
newnp->rcv_flowinfo = 0;
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
newnp->flow_label = 0;
/*
@@ -1320,7 +1318,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newnp->mcast_oif = tcp_v6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
/* Set ToS of the new socket based upon the value of incoming SYN.
@@ -1542,10 +1540,11 @@ ipv6_pktoptions:
if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
np->mcast_oif = tcp_v6_iif(opt_skb);
if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
- np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+ WRITE_ONCE(np->mcast_hops,
+ ipv6_hdr(opt_skb)->hop_limit);
if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
tcp_v6_restore_cb(opt_skb);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 86b5d509a468..5e9312eefed0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -413,7 +413,7 @@ try_again:
(struct sockaddr *)sin6);
}
- if (udp_sk(sk)->gro_enabled)
+ if (udp_test_bit(GRO_ENABLED, sk))
udp_cmsg_recv(msg, sk, skb);
if (np->rxopt.all)
@@ -571,7 +571,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
- if (!sk || udp_sk(sk)->encap_type) {
+ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
/* No socket for error: try tunnels before discarding */
if (static_branch_unlikely(&udpv6_encap_needed_key)) {
sk = __udp6_lib_err_encap(net, hdr, offset, uh,
@@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!ip6_sk_accept_pmtu(sk))
goto out;
ip6_sk_update_pmtu(skb, sk, info);
- if (np->pmtudisc != IPV6_PMTUDISC_DONT)
+ if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT)
harderr = 1;
}
if (type == NDISC_REDIRECT) {
@@ -619,7 +619,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- if (!np->recverr) {
+ if (!inet6_test_bit(RECVERR6, sk)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else {
@@ -688,7 +688,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
}
nf_reset_ct(skb);
- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
+ if (static_branch_unlikely(&udpv6_encap_needed_key) &&
+ READ_ONCE(up->encap_type)) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -726,16 +727,17 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
*/
- if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ u16 pcrlen = READ_ONCE(up->pcrlen);
- if (up->pcrlen == 0) { /* full coverage was set */
+ if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ UDP_SKB_CB(skb)->cscov, pcrlen);
goto drop;
}
}
@@ -858,7 +860,7 @@ start_lookup:
/* If zero checksum and no_check is not on for
* the socket then skip it.
*/
- if (!uh->check && !udp_sk(sk)->no_check6_rx)
+ if (!uh->check && !udp_get_no_check6_rx(sk))
continue;
if (!first) {
first = sk;
@@ -980,7 +982,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp6_sk_rx_dst_set(sk, dst);
- if (!uh->check && !udp_sk(sk)->no_check6_rx) {
+ if (!uh->check && !udp_get_no_check6_rx(sk)) {
if (refcounted)
sock_put(sk);
goto report_csum_error;
@@ -1002,7 +1004,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
/* Unicast */
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk) {
- if (!uh->check && !udp_sk(sk)->no_check6_rx)
+ if (!uh->check && !udp_get_no_check6_rx(sk))
goto report_csum_error;
return udp6_unicast_rcv_skb(sk, skb, uh);
}
@@ -1241,7 +1243,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
kfree_skb(skb);
return -EINVAL;
}
- if (udp_sk(sk)->no_check6_tx) {
+ if (udp_get_no_check6_tx(sk)) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1262,7 +1264,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
if (is_udplite)
csum = udplite_csum(skb);
- else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
+ else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
@@ -1281,7 +1283,7 @@ csum_partial:
send:
err = ip6_send_skb(skb);
if (err) {
- if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
+ if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
@@ -1332,7 +1334,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int addr_len = msg->msg_namelen;
bool connected = false;
int ulen = len;
- int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
+ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int err;
int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
@@ -1427,7 +1429,7 @@ do_udp_sendmsg:
fl6->fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
@@ -1593,7 +1595,7 @@ back_from_confirm:
do_append_data:
if (ipc6.dontfrag < 0)
- ipc6.dontfrag = np->dontfrag;
+ ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
&ipc6, fl6, (struct rt6_info *)dst,
@@ -1606,7 +1608,7 @@ do_append_data:
up->pending = 0;
if (err > 0)
- err = np->recverr ? net_xmit_errno(err) : 0;
+ err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0;
release_sock(sk);
out:
@@ -1644,11 +1646,11 @@ static void udpv6_splice_eof(struct socket *sock)
struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
- if (!up->pending || READ_ONCE(up->corkflag))
+ if (!up->pending || udp_test_bit(CORK, sk))
return;
lock_sock(sk);
- if (up->pending && !READ_ONCE(up->corkflag))
+ if (up->pending && !udp_test_bit(CORK, sk))
udp_v6_push_pending_frames(sk);
release_sock(sk);
}
@@ -1670,7 +1672,7 @@ void udpv6_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
- if (up->encap_enabled) {
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
static_branch_dec(&udpv6_encap_needed_key);
udp_encap_disable();
}
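Throughout the udp.c hunks above, booleans that used to be separate struct udp_sock members (corkflag, no_check6_tx/no_check6_rx, encap_enabled, the UDP-Lite coverage bits) are now read through udp_test_bit() and the udp_get_no_check6_*() accessors. The underlying idea, sketched here with hypothetical names rather than the real helpers, is to keep independent booleans as bits of a single word so they can be tested and flipped atomically without the socket lock:

	#include <stdbool.h>

	enum { MY_CORK, MY_NO_CHECK6_TX, MY_GRO_ENABLED };

	struct my_udp_state {
		unsigned long flags;	/* the kernel wraps atomic set_bit()/test_bit() on such a word */
	};

	static inline void my_udp_set_bit(struct my_udp_state *s, int nr)
	{
		s->flags |= 1UL << nr;
	}

	static inline bool my_udp_test_bit(const struct my_udp_state *s, int nr)
	{
		return s->flags & (1UL << nr);
	}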
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 267d491e9707..a60bec9b14f1 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -17,7 +17,6 @@
static int udplitev6_sk_init(struct sock *sk)
{
udpv6_init_sock(sk);
- udp_sk(sk)->pcflag = UDPLITE_BIT;
pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
"please contact the netdev mailing list\n");
return 0;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 4907ab241d6b..4156387248e4 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -81,14 +81,14 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
struct ipv6hdr *ip6h;
int len;
int ip6hlen = sizeof(struct ipv6hdr);
-
__u8 *udpdata;
__be32 *udpdata32;
- __u16 encap_type = up->encap_type;
+ u16 encap_type;
if (skb->protocol == htons(ETH_P_IP))
return xfrm4_udp_encap_rcv(sk, skb);
+ encap_type = READ_ONCE(up->encap_type);
/* if this is not encapsulated socket, then just return now */
if (!encap_type)
return 1;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 03608d3ded4b..8d21ff25f160 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1139,9 +1139,9 @@ static void l2tp_tunnel_destruct(struct sock *sk)
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
/* No longer an encapsulation socket. See net/ipv4/udp.c */
- (udp_sk(sk))->encap_type = 0;
- (udp_sk(sk))->encap_rcv = NULL;
- (udp_sk(sk))->encap_destroy = NULL;
+ WRITE_ONCE(udp_sk(sk)->encap_type, 0);
+ udp_sk(sk)->encap_rcv = NULL;
+ udp_sk(sk)->encap_destroy = NULL;
break;
case L2TP_ENCAPTYPE_IP:
break;
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index f2ae03c40473..25ca89f80414 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -37,12 +37,6 @@
/* via netdev_priv() */
struct l2tp_eth {
struct l2tp_session *session;
- atomic_long_t tx_bytes;
- atomic_long_t tx_packets;
- atomic_long_t tx_dropped;
- atomic_long_t rx_bytes;
- atomic_long_t rx_packets;
- atomic_long_t rx_errors;
};
/* via l2tp_session_priv() */
@@ -79,10 +73,10 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev
int ret = l2tp_xmit_skb(session, skb);
if (likely(ret == NET_XMIT_SUCCESS)) {
- atomic_long_add(len, &priv->tx_bytes);
- atomic_long_inc(&priv->tx_packets);
+ DEV_STATS_ADD(dev, tx_bytes, len);
+ DEV_STATS_INC(dev, tx_packets);
} else {
- atomic_long_inc(&priv->tx_dropped);
+ DEV_STATS_INC(dev, tx_dropped);
}
return NETDEV_TX_OK;
}
@@ -90,14 +84,12 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev
static void l2tp_eth_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *stats)
{
- struct l2tp_eth *priv = netdev_priv(dev);
-
- stats->tx_bytes = (unsigned long)atomic_long_read(&priv->tx_bytes);
- stats->tx_packets = (unsigned long)atomic_long_read(&priv->tx_packets);
- stats->tx_dropped = (unsigned long)atomic_long_read(&priv->tx_dropped);
- stats->rx_bytes = (unsigned long)atomic_long_read(&priv->rx_bytes);
- stats->rx_packets = (unsigned long)atomic_long_read(&priv->rx_packets);
- stats->rx_errors = (unsigned long)atomic_long_read(&priv->rx_errors);
+ stats->tx_bytes = DEV_STATS_READ(dev, tx_bytes);
+ stats->tx_packets = DEV_STATS_READ(dev, tx_packets);
+ stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
+ stats->rx_bytes = DEV_STATS_READ(dev, rx_bytes);
+ stats->rx_packets = DEV_STATS_READ(dev, rx_packets);
+ stats->rx_errors = DEV_STATS_READ(dev, rx_errors);
}
static const struct net_device_ops l2tp_eth_netdev_ops = {
@@ -126,7 +118,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
{
struct l2tp_eth_sess *spriv = l2tp_session_priv(session);
struct net_device *dev;
- struct l2tp_eth *priv;
if (!pskb_may_pull(skb, ETH_HLEN))
goto error;
@@ -144,12 +135,11 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
if (!dev)
goto error_rcu;
- priv = netdev_priv(dev);
if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
- atomic_long_inc(&priv->rx_packets);
- atomic_long_add(data_len, &priv->rx_bytes);
+ DEV_STATS_INC(dev, rx_packets);
+ DEV_STATS_ADD(dev, rx_bytes, data_len);
} else {
- atomic_long_inc(&priv->rx_errors);
+ DEV_STATS_INC(dev, rx_errors);
}
rcu_read_unlock();
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 11f3d375cec0..bb373e249237 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -431,7 +431,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
return -ENOTCONN;
lsa->l2tp_conn_id = lsk->peer_conn_id;
lsa->l2tp_addr = sk->sk_v6_daddr;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
lsa->l2tp_flowinfo = np->flow_label;
} else {
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -528,7 +528,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EAFNOSUPPORT;
daddr = &lsa->l2tp_addr;
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
@@ -620,7 +620,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
if (ipc6.dontfrag < 0)
- ipc6.dontfrag = np->dontfrag;
+ ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
if (msg->msg_flags & MSG_CONFIRM)
goto do_confirm;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index a4af3b7675ef..c3a60fd19c37 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -309,7 +309,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta)
{
/* IEEE802.11ac-2013 Table E-4 */
- u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };
+ static const u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };
struct cfg80211_chan_def uc = sta->tdls_chandef;
enum nl80211_chan_width max_width =
ieee80211_sta_cap_chan_bw(&sta->deflink);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 8260202c0066..18ce624bfde2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -89,7 +89,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
break;
case SO_PRIORITY:
- ssk->sk_priority = val;
+ WRITE_ONCE(ssk->sk_priority, val);
break;
case SO_SNDBUF:
case SO_SNDBUFFORCE:
@@ -734,11 +734,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
lock_sock(sk);
sockopt_seq_inc(msk);
- val = inet_sk(sk)->tos;
+ val = READ_ONCE(inet_sk(sk)->tos);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- __ip_sock_set_tos(ssk, val);
+ ip_sock_set_tos(ssk, val);
}
release_sock(sk);
@@ -1343,7 +1343,7 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
switch (optname) {
case IP_TOS:
- return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
+ return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
}
return -EOPNOTSUPP;
@@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
ssk->sk_ipv6only = sk->sk_ipv6only;
- __ip_sock_set_tos(ssk, inet_sk(sk)->tos);
+ ip_sock_set_tos(ssk, inet_sk(sk)->tos);
if (sk->sk_userlocks & tx_rx_locks) {
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 4174076c66fa..eaf9f2ed0067 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1298,17 +1298,13 @@ static void set_sock_size(struct sock *sk, int mode, int val)
static void set_mcast_loop(struct sock *sk, u_char loop)
{
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
- lock_sock(sk);
inet_assign_bit(MC_LOOP, sk, loop);
#ifdef CONFIG_IP_VS_IPV6
- if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
+ if (READ_ONCE(sk->sk_family) == AF_INET6) {
/* IPV6_MULTICAST_LOOP */
- np->mc_loop = loop ? 1 : 0;
+ inet6_assign_bit(MC6_LOOP, sk, loop);
}
#endif
- release_sock(sk);
}
/*
@@ -1320,13 +1316,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
- inet->mc_ttl = ttl;
+ WRITE_ONCE(inet->mc_ttl, ttl);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_HOPS */
- np->mcast_hops = ttl;
+ WRITE_ONCE(np->mcast_hops, ttl);
}
#endif
release_sock(sk);
@@ -1339,13 +1335,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
lock_sock(sk);
- inet->pmtudisc = val;
+ WRITE_ONCE(inet->pmtudisc, val);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MTU_DISCOVER */
- np->pmtudisc = val;
+ WRITE_ONCE(np->pmtudisc, val);
}
#endif
release_sock(sk);
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 48cc60084d28..5a049740758f 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -697,6 +697,31 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int
}
#endif
+static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport)
+{
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ const struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ break;
+ default:
+ return false;
+ }
+
+ dir = CTINFO2DIR(ctinfo);
+ if (dir != IP_CT_DIR_ORIGINAL)
+ return false;
+
+ return ct->tuplehash[!dir].tuple.dst.u.all != sport;
+}
+
static unsigned int
nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -707,8 +732,20 @@ nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
ret = nf_nat_ipv4_fn(priv, skb, state);
- if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr &&
- !inet_sk_transparent(sk))
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* skb has a socket assigned via tcp edemux. We need to check
+ * if nf_nat_ipv4_fn() has mangled the packet in a way that
+ * edemux would not have found this socket.
+ *
+ * This includes both changes to the source address and changes
+ * to the source port, which are both handled by the
+ * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux
+ * might have found a socket for the old (pre-snat) address.
+ */
+ if (saddr != ip_hdr(skb)->saddr ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
skb_orphan(skb); /* TCP edemux obtained wrong socket */
return ret;
@@ -938,6 +975,27 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
}
static unsigned int
+nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct in6_addr saddr = ipv6_hdr(skb)->saddr;
+ struct sock *sk = skb->sk;
+ unsigned int ret;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* see nf_nat_ipv4_local_in */
+ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
+ skb_orphan(skb);
+
+ return ret;
+}
+
+static unsigned int
nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -1051,7 +1109,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv6_fn,
+ .hook = nf_nat_ipv6_local_in,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a72b6aeefb1b..b4405db710b0 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3316,7 +3316,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
[NFTA_RULE_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE] = { .type = NLA_U64 },
- [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
[NFTA_RULE_COMPAT] = { .type = NLA_NESTED },
[NFTA_RULE_POSITION] = { .type = NLA_U64 },
[NFTA_RULE_USERDATA] = { .type = NLA_BINARY,
@@ -4254,12 +4254,16 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
[NFTA_SET_EXPR] = { .type = NLA_NESTED },
- [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
+};
+
+static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
+ [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
- [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED },
+ [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy),
};
static struct nft_set *nft_set_lookup(const struct nft_table *table,
@@ -4695,8 +4699,10 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
return -EINVAL;
set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
@@ -4713,10 +4719,6 @@ err_fill_set_info:
return err;
}
-static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
- [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
-};
-
static int nft_set_desc_concat_parse(const struct nlattr *attr,
struct nft_set_desc *desc)
{
@@ -5498,7 +5500,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
.len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
- [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -5506,7 +5508,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
- [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy),
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
@@ -6025,8 +6027,10 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
}
set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
@@ -6919,8 +6923,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
if (!list_empty(&set->bindings) &&
(set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)))
@@ -7195,8 +7201,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
}
set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
if (nft_set_is_anonymous(set))
return -EOPNOTSUPP;
@@ -8692,6 +8700,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const nla[])
{
+ struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_cur(info->net);
u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
@@ -8717,13 +8726,17 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
genmask, 0);
- if (IS_ERR(table))
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
genmask);
- if (IS_ERR(flowtable))
+ if (IS_ERR(flowtable)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return PTR_ERR(flowtable);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 96e91ab71573..0eed00184adf 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -487,7 +487,7 @@ static struct sock *nr_make_new(struct sock *osk)
sock_init_data(NULL, sk);
sk->sk_type = osk->sk_type;
- sk->sk_priority = osk->sk_priority;
+ sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index fd66014d8a76..6fcd7e2ca81f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -311,11 +311,18 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
return 0;
}
-static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
- const struct nshhdr *nh)
+static noinline_for_stack int push_nsh(struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct nlattr *a)
{
+ u8 buffer[NSH_HDR_MAX_LEN];
+ struct nshhdr *nh = (struct nshhdr *)buffer;
int err;
+ err = nsh_hdr_from_nlattr(a, nh, NSH_HDR_MAX_LEN);
+ if (err)
+ return err;
+
err = nsh_push(skb, nh);
if (err)
return err;
@@ -873,7 +880,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
- dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
+ dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_rt.dst.dev = vport->dev;
@@ -890,7 +897,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
- dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
+ dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_rt.dst.dev = vport->dev;
@@ -1439,17 +1446,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
err = pop_eth(skb, key);
break;
- case OVS_ACTION_ATTR_PUSH_NSH: {
- u8 buffer[NSH_HDR_MAX_LEN];
- struct nshhdr *nh = (struct nshhdr *)buffer;
-
- err = nsh_hdr_from_nlattr(nla_data(a), nh,
- NSH_HDR_MAX_LEN);
- if (unlikely(err))
- break;
- err = push_nsh(skb, key, nh);
+ case OVS_ACTION_ATTR_PUSH_NSH:
+ err = push_nsh(skb, key, nla_data(a));
break;
- }
case OVS_ACTION_ATTR_POP_NSH:
err = pop_nsh(skb, key);
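The push_nsh() rework above is about stack usage: the NSH_HDR_MAX_LEN scratch buffer moves out of do_execute_actions() into a helper marked noinline_for_stack, so the buffer occupies stack space only while an NSH push is actually being executed. A condensed restatement of the pattern the hunk applies (same functions as in the patch, simplified error handling):

	static noinline_for_stack int example_push_nsh(struct sk_buff *skb,
						       const struct nlattr *a)
	{
		u8 buf[NSH_HDR_MAX_LEN];	/* lives only in this helper's frame */
		struct nshhdr *nh = (struct nshhdr *)buf;
		int err;

		err = nsh_hdr_from_nlattr(a, nh, NSH_HDR_MAX_LEN);
		if (err)
			return err;
		return nsh_push(skb, nh);
	}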
diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h
index 0c33889a8515..ed11cd12b512 100644
--- a/net/openvswitch/meter.h
+++ b/net/openvswitch/meter.h
@@ -39,13 +39,13 @@ struct dp_meter {
u32 max_delta_t;
u64 used;
struct ovs_flow_stats stats;
- struct dp_meter_band bands[];
+ struct dp_meter_band bands[] __counted_by(n_bands);
};
struct dp_meter_instance {
struct rcu_head rcu;
u32 n_meters;
- struct dp_meter __rcu *dp_meters[];
+ struct dp_meter __rcu *dp_meters[] __counted_by(n_meters);
};
struct dp_meter_table {
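The __counted_by() annotations added above tie each flexible array to the member holding its element count, so FORTIFY_SOURCE/UBSAN bounds checking (where enabled) can police accesses to bands[] and dp_meters[]. A hypothetical struct in the same style; the one rule is that the counter must be assigned before the array is indexed:

	struct example_table {
		u32 n_entries;
		u64 entries[] __counted_by(n_entries);	/* accesses checked against n_entries */
	};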
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 49dafe9ac72f..0cc5a4e19900 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -583,7 +583,7 @@ static struct sock *rose_make_new(struct sock *osk)
#endif
sk->sk_type = osk->sk_type;
- sk->sk_priority = osk->sk_priority;
+ sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 1e20bbd687f1..1424bfeaca73 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -375,9 +375,9 @@ out:
static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
[TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
- [TCA_ROUTE4_TO] = { .type = NLA_U32 },
- [TCA_ROUTE4_FROM] = { .type = NLA_U32 },
- [TCA_ROUTE4_IIF] = { .type = NLA_U32 },
+ [TCA_ROUTE4_TO] = NLA_POLICY_MAX(NLA_U32, 0xFF),
+ [TCA_ROUTE4_FROM] = NLA_POLICY_MAX(NLA_U32, 0xFF),
+ [TCA_ROUTE4_IIF] = NLA_POLICY_MAX(NLA_U32, 0x7FFF),
};
static int route4_set_parms(struct net *net, struct tcf_proto *tp,
@@ -397,33 +397,37 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
return err;
if (tb[TCA_ROUTE4_TO]) {
- if (new && handle & 0x8000)
+ if (new && handle & 0x8000) {
+ NL_SET_ERR_MSG(extack, "Invalid handle");
return -EINVAL;
+ }
to = nla_get_u32(tb[TCA_ROUTE4_TO]);
- if (to > 0xFF)
- return -EINVAL;
nhandle = to;
}
+ if (tb[TCA_ROUTE4_FROM] && tb[TCA_ROUTE4_IIF]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_ROUTE4_FROM],
+ "'from' and 'fromif' are mutually exclusive");
+ return -EINVAL;
+ }
+
if (tb[TCA_ROUTE4_FROM]) {
- if (tb[TCA_ROUTE4_IIF])
- return -EINVAL;
id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
- if (id > 0xFF)
- return -EINVAL;
nhandle |= id << 16;
} else if (tb[TCA_ROUTE4_IIF]) {
id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
- if (id > 0x7FFF)
- return -EINVAL;
nhandle |= (id | 0x8000) << 16;
} else
nhandle |= 0xFFFF << 16;
if (handle && new) {
nhandle |= handle & 0x7F00;
- if (nhandle != handle)
+ if (nhandle != handle) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Handle mismatch constructed: %x (expected: %x)",
+ handle, nhandle);
return -EINVAL;
+ }
}
if (!nhandle) {
@@ -478,7 +482,6 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
struct route4_filter __rcu **fp;
struct route4_filter *fold, *f1, *pfp, *f = NULL;
struct route4_bucket *b;
- struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
unsigned int h, th;
int err;
@@ -489,10 +492,12 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
return -EINVAL;
}
- if (opt == NULL)
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing options");
return -EINVAL;
+ }
- err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, opt,
+ err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, tca[TCA_OPTIONS],
route4_policy, NULL);
if (err < 0)
return err;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index da34fd4c9269..09d8afd04a2a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -546,7 +546,7 @@ META_COLLECTOR(int_sk_prio)
*err = -1;
return;
}
- dst->value = sk->sk_priority;
+ dst->value = READ_ONCE(sk->sk_priority);
}
META_COLLECTOR(int_sk_rcvlowat)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index f59a2cb2c803..8eacdb54e72f 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -2,7 +2,7 @@
/*
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
*
- * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
+ * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
*
* Meant to be mostly used for locally generated traffic :
* Fast classification depends on skb->sk being set before reaching us.
@@ -51,7 +51,8 @@
#include <net/tcp.h>
struct fq_skb_cb {
- u64 time_to_send;
+ u64 time_to_send;
+ u8 band;
};
static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
@@ -73,37 +74,41 @@ struct fq_flow {
struct sk_buff *tail; /* last skb in the list */
unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
};
- struct rb_node fq_node; /* anchor in fq_root[] trees */
+ union {
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ /* Following field is only used for q->internal,
+ * because q->internal is not hashed in fq_root[]
+ */
+ u64 stat_fastpath_packets;
+ };
struct sock *sk;
u32 socket_hash; /* sk_hash */
int qlen; /* number of packets in flow queue */
-/* Second cache line, used in fq_dequeue() */
+/* Second cache line */
int credit;
- /* 32bit hole on 64bit arches */
-
+ int band;
struct fq_flow *next; /* next pointer in RR lists */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
-} ____cacheline_aligned_in_smp;
+};
struct fq_flow_head {
struct fq_flow *first;
struct fq_flow *last;
};
-struct fq_sched_data {
+struct fq_perband_flows {
struct fq_flow_head new_flows;
-
struct fq_flow_head old_flows;
+ int credit;
+ int quantum; /* based on band nr : 576KB, 192KB, 64KB */
+};
- struct rb_root delayed; /* for rate limited flows */
- u64 time_next_delayed_flow;
- u64 ktime_cache; /* copy of last ktime_get_ns() */
- unsigned long unthrottle_latency_ns;
+struct fq_sched_data {
+/* Read mostly cache line */
- struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
@@ -117,24 +122,46 @@ struct fq_sched_data {
u8 rate_enable;
u8 fq_trees_log;
u8 horizon_drop;
+ u8 prio2band[(TC_PRIO_MAX + 1) >> 2];
+ u32 timer_slack; /* hrtimer slack in ns */
+
+/* Read/Write fields. */
+
+ unsigned int band_nr; /* band being serviced in fq_dequeue() */
+
+ struct fq_perband_flows band_flows[FQ_BANDS];
+
+ struct fq_flow internal; /* fastpath queue. */
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
+
+ u32 band_pkt_count[FQ_BANDS];
u32 flows;
- u32 inactive_flows;
+ u32 inactive_flows; /* Flows with no packet to send. */
u32 throttled_flows;
- u64 stat_gc_flows;
- u64 stat_internal_packets;
u64 stat_throttled;
+ struct qdisc_watchdog watchdog;
+ u64 stat_gc_flows;
+
+/* Seldom used fields. */
+
+ u64 stat_band_drops[FQ_BANDS];
u64 stat_ce_mark;
u64 stat_horizon_drops;
u64 stat_horizon_caps;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
-
- u32 timer_slack; /* hrtimer slack in ns */
- struct qdisc_watchdog watchdog;
};
+/* return the i-th 2-bit value ("crumb") */
+static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
+{
+ return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
+}
+
/*
* f->tail and f->age share the same location.
* We can use the low order bit to differentiate if this location points
@@ -159,8 +186,19 @@ static bool fq_flow_is_throttled(const struct fq_flow *f)
return f->next == &throttled;
}
-static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+enum new_flow {
+ NEW_FLOW,
+ OLD_FLOW
+};
+
+static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
+ enum new_flow list_sel)
{
+ struct fq_perband_flows *pband = &q->band_flows[flow->band];
+ struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
+ &pband->new_flows :
+ &pband->old_flows;
+
if (head->first)
head->last->next = flow;
else
@@ -173,7 +211,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
rb_erase(&f->rate_node, &q->delayed);
q->throttled_flows--;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_add_tail(q, f, OLD_FLOW);
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
@@ -258,17 +296,61 @@ static void fq_gc(struct fq_sched_data *q,
kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+/* Fast path can be used if :
+ * 1) Packet tstamp is in the past.
+ * 2) FQ qlen == 0 OR
+ * (no flow is currently eligible for transmit,
+ * AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc,
+ *
+ * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
+ u64 now)
{
+ const struct fq_sched_data *q = qdisc_priv(sch);
+ const struct sock *sk;
+
+ if (fq_skb_cb(skb)->time_to_send > now)
+ return false;
+
+ if (sch->q.qlen != 0) {
+ /* Even if some packets are stored in this qdisc,
+ * we can still enable fast path if all of them are
+ * scheduled in the future (ie no flows are eligible)
+ * or in the fast path queue.
+ */
+ if (q->flows != q->inactive_flows + q->throttled_flows)
+ return false;
+
+ /* Do not allow fast path queue to explode, we want Fair Queue mode
+ * under pressure.
+ */
+ if (q->internal.qlen >= 8)
+ return false;
+ }
+
+ sk = skb->sk;
+ if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+ sk->sk_max_pacing_rate != ~0UL)
+ return false;
+
+ if (q->flow_max_rate != ~0UL)
+ return false;
+
+ return true;
+}
+
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
+ u64 now)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
struct rb_node **p, *parent;
struct sock *sk = skb->sk;
struct rb_root *root;
struct fq_flow *f;
- /* warning: no starvation prevention... */
- if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
- return &q->internal;
-
/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
* or a listener (SYNCOOKIE mode)
* 1) request sockets are not full blown,
@@ -299,11 +381,14 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
sk = (struct sock *)((hash << 1) | 1UL);
}
+ if (fq_fastpath_check(sch, skb, now)) {
+ q->internal.stat_fastpath_packets++;
+ return &q->internal;
+ }
+
root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
- if (q->flows >= (2U << q->fq_trees_log) &&
- q->inactive_flows > q->flows/2)
- fq_gc(q, root, sk);
+ fq_gc(q, root, sk);
p = &root->rb_node;
parent = NULL;
@@ -396,7 +481,6 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
{
fq_erase_head(sch, flow, skb);
skb_mark_not_on_list(skb);
- flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
}
@@ -434,9 +518,9 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
}
static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
- const struct fq_sched_data *q)
+ const struct fq_sched_data *q, u64 now)
{
- return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
+ return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -444,53 +528,57 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
+ u64 now;
+ u8 band;
- if (unlikely(sch->q.qlen >= sch->limit))
+ band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
+ if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
+ q->stat_band_drops[band]++;
return qdisc_drop(skb, sch, to_free);
+ }
+ now = ktime_get_ns();
if (!skb->tstamp) {
- fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+ fq_skb_cb(skb)->time_to_send = now;
} else {
- /* Check if packet timestamp is too far in the future.
- * Try first if our cached value, to avoid ktime_get_ns()
- * cost in most cases.
- */
- if (fq_packet_beyond_horizon(skb, q)) {
- /* Refresh our cache and check another time */
- q->ktime_cache = ktime_get_ns();
- if (fq_packet_beyond_horizon(skb, q)) {
- if (q->horizon_drop) {
+ /* Check if packet timestamp is too far in the future. */
+ if (fq_packet_beyond_horizon(skb, q, now)) {
+ if (q->horizon_drop) {
q->stat_horizon_drops++;
return qdisc_drop(skb, sch, to_free);
- }
- q->stat_horizon_caps++;
- skb->tstamp = q->ktime_cache + q->horizon;
}
+ q->stat_horizon_caps++;
+ skb->tstamp = now + q->horizon;
}
fq_skb_cb(skb)->time_to_send = skb->tstamp;
}
- f = fq_classify(skb, q);
- if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
- q->stat_flows_plimit++;
- return qdisc_drop(skb, sch, to_free);
- }
+ f = fq_classify(sch, skb, now);
- f->qlen++;
- qdisc_qstats_backlog_inc(sch, skb);
- if (fq_flow_is_detached(f)) {
- fq_flow_add_tail(&q->new_flows, f);
- if (time_after(jiffies, f->age + q->flow_refill_delay))
- f->credit = max_t(u32, f->credit, q->quantum);
- q->inactive_flows--;
+ if (f != &q->internal) {
+ if (unlikely(f->qlen >= q->flow_plimit)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ if (fq_flow_is_detached(f)) {
+ fq_flow_add_tail(q, f, NEW_FLOW);
+ if (time_after(jiffies, f->age + q->flow_refill_delay))
+ f->credit = max_t(u32, f->credit, q->quantum);
+ }
+
+ f->band = band;
+ q->band_pkt_count[band]++;
+ fq_skb_cb(skb)->band = band;
+ if (f->qlen == 0)
+ q->inactive_flows--;
}
+ f->qlen++;
/* Note: this overwrites f->age */
flow_queue_add(f, skb);
- if (unlikely(f == &q->internal)) {
- q->stat_internal_packets++;
- }
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -523,13 +611,26 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
}
}
+static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
+{
+ if (pband->credit <= 0)
+ return NULL;
+
+ if (pband->new_flows.first)
+ return &pband->new_flows;
+
+ return pband->old_flows.first ? &pband->old_flows : NULL;
+}
+
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_perband_flows *pband;
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
unsigned long rate;
+ int retry;
u32 plen;
u64 now;
@@ -538,30 +639,38 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
skb = fq_peek(&q->internal);
if (unlikely(skb)) {
+ q->internal.qlen--;
fq_dequeue_skb(sch, &q->internal, skb);
goto out;
}
- q->ktime_cache = now = ktime_get_ns();
+ now = ktime_get_ns();
fq_check_throttled(q, now);
+ retry = 0;
+ pband = &q->band_flows[q->band_nr];
begin:
- head = &q->new_flows;
- if (!head->first) {
- head = &q->old_flows;
- if (!head->first) {
- if (q->time_next_delayed_flow != ~0ULL)
- qdisc_watchdog_schedule_range_ns(&q->watchdog,
+ head = fq_pband_head_select(pband);
+ if (!head) {
+ while (++retry < FQ_BANDS) {
+ if (++q->band_nr == FQ_BANDS)
+ q->band_nr = 0;
+ pband = &q->band_flows[q->band_nr];
+ pband->credit = min(pband->credit + pband->quantum,
+ pband->quantum);
+ goto begin;
+ }
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_range_ns(&q->watchdog,
q->time_next_delayed_flow,
q->timer_slack);
- return NULL;
- }
+ return NULL;
}
f = head->first;
-
+ retry = 0;
if (f->credit <= 0) {
f->credit += q->quantum;
head->first = f->next;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_add_tail(q, f, OLD_FLOW);
goto begin;
}
@@ -581,20 +690,23 @@ begin:
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
+ if (--f->qlen == 0)
+ q->inactive_flows++;
+ q->band_pkt_count[fq_skb_cb(skb)->band]--;
fq_dequeue_skb(sch, f, skb);
} else {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
- if ((head == &q->new_flows) && q->old_flows.first) {
- fq_flow_add_tail(&q->old_flows, f);
+ if (head == &pband->new_flows) {
+ fq_flow_add_tail(q, f, OLD_FLOW);
} else {
fq_flow_set_detached(f);
- q->inactive_flows++;
}
goto begin;
}
plen = qdisc_pkt_len(skb);
f->credit -= plen;
+ pband->credit -= plen;
if (!q->rate_enable)
goto out;
@@ -607,7 +719,7 @@ begin:
*/
if (!skb->tstamp) {
if (skb->sk)
- rate = min(skb->sk->sk_pacing_rate, rate);
+ rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
if (rate <= q->low_rate_threshold) {
f->credit = 0;
@@ -686,8 +798,10 @@ static void fq_reset(struct Qdisc *sch)
kmem_cache_free(fq_flow_cachep, f);
}
}
- q->new_flows.first = NULL;
- q->old_flows.first = NULL;
+ for (idx = 0; idx < FQ_BANDS; idx++) {
+ q->band_flows[idx].new_flows.first = NULL;
+ q->band_flows[idx].old_flows.first = NULL;
+ }
q->delayed = RB_ROOT;
q->flows = 0;
q->inactive_flows = 0;
@@ -801,8 +915,77 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
[TCA_FQ_HORIZON] = { .type = NLA_U32 },
[TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
+ [TCA_FQ_PRIOMAP] = {
+ .type = NLA_BINARY,
+ .len = sizeof(struct tc_prio_qopt),
+ },
+ [TCA_FQ_WEIGHTS] = {
+ .type = NLA_BINARY,
+ .len = FQ_BANDS * sizeof(s32),
+ },
};
+/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
+static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
+{
+ const int num_elems = TC_PRIO_MAX + 1;
+ int i;
+
+ memset(out, 0, num_elems / 4);
+ for (i = 0; i < num_elems; i++)
+ out[i / 4] |= in[i] << (2 * (i & 0x3));
+}
+
+static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
+{
+ const int num_elems = TC_PRIO_MAX + 1;
+ int i;
+
+ for (i = 0; i < num_elems; i++)
+ out[i] = fq_prio2band(in, i);
+}
+
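
fq_prio2band_compress_crumb() above packs the 16-entry priomap (each value 0..3) into four bytes, two bits per entry, and the decompress helper reads the values back. A standalone round-trip of that packing; TC_PRIO_MAX is assumed to be 15 (as in the pkt_sched uapi) and the sample map is the default prio2band table used elsewhere in this patch:

#include <stdio.h>
#include <string.h>

#define TC_PRIO_MAX 15	/* assumption: matches include/uapi/linux/pkt_sched.h */

/* pack 16 values of 0..3 into 4 bytes, two bits ("one crumb") per value */
static void compress_crumb(const unsigned char *in, unsigned char *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	int i;

	memset(out, 0, num_elems / 4);
	for (i = 0; i < num_elems; i++)
		out[i / 4] |= in[i] << (2 * (i & 0x3));
}

/* read value i back out of the packed array */
static unsigned char crumb_at(const unsigned char *packed, int i)
{
	return (packed[i / 4] >> (2 * (i & 0x3))) & 0x3;
}

int main(void)
{
	const unsigned char prio2band[TC_PRIO_MAX + 1] = {
		1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
	};
	unsigned char packed[(TC_PRIO_MAX + 1) / 4];
	int i;

	compress_crumb(prio2band, packed);
	for (i = 0; i <= TC_PRIO_MAX; i++)
		printf("prio %2d -> band %u\n", i, (unsigned int)crumb_at(packed, i));
	return 0;
}
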
+static int fq_load_weights(struct fq_sched_data *q,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ s32 *weights = nla_data(attr);
+ int i;
+
+ for (i = 0; i < FQ_BANDS; i++) {
+ if (weights[i] < FQ_MIN_WEIGHT) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less than minimum allowed %d",
+ weights[i], FQ_MIN_WEIGHT);
+ return -EINVAL;
+ }
+ }
+ for (i = 0; i < FQ_BANDS; i++)
+ q->band_flows[i].quantum = weights[i];
+ return 0;
+}
+
+static int fq_load_priomap(struct fq_sched_data *q,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const struct tc_prio_qopt *map = nla_data(attr);
+ int i;
+
+ if (map->bands != FQ_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
+ return -EINVAL;
+ }
+ for (i = 0; i < TC_PRIO_MAX + 1; i++) {
+ if (map->priomap[i] >= FQ_BANDS) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to an out-of-range band %d",
+ i, map->priomap[i]);
+ return -EINVAL;
+ }
+ }
+ fq_prio2band_compress_crumb(map->priomap, q->prio2band);
+ return 0;
+}
+
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -877,6 +1060,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
}
+ if (!err && tb[TCA_FQ_PRIOMAP])
+ err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
+
+ if (!err && tb[TCA_FQ_WEIGHTS])
+ err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
+
if (tb[TCA_FQ_ORPHAN_MASK])
q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
@@ -928,7 +1117,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct fq_sched_data *q = qdisc_priv(sch);
- int err;
+ int i, err;
sch->limit = 10000;
q->flow_plimit = 100;
@@ -938,8 +1127,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->flow_max_rate = ~0UL;
q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
- q->new_flows.first = NULL;
- q->old_flows.first = NULL;
+ for (i = 0; i < FQ_BANDS; i++) {
+ q->band_flows[i].new_flows.first = NULL;
+ q->band_flows[i].old_flows.first = NULL;
+ }
+ q->band_flows[0].quantum = 9 << 16;
+ q->band_flows[1].quantum = 3 << 16;
+ q->band_flows[2].quantum = 1 << 16;
q->delayed = RB_ROOT;
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
@@ -954,6 +1148,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+ fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
@@ -968,8 +1163,12 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
u64 ce_threshold = q->ce_threshold;
+ struct tc_prio_qopt prio = {
+ .bands = FQ_BANDS,
+ };
u64 horizon = q->horizon;
struct nlattr *opts;
+ s32 weights[3];
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
@@ -999,6 +1198,16 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
goto nla_put_failure;
+ fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
+ if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
+ goto nla_put_failure;
+
+ weights[0] = q->band_flows[0].quantum;
+ weights[1] = q->band_flows[1].quantum;
+ weights[2] = q->band_flows[2].quantum;
+ if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
+ goto nla_put_failure;
+
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -1009,11 +1218,15 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct tc_fq_qd_stats st;
+ int i;
+
+ st.pad = 0;
sch_tree_lock(sch);
st.gc_flows = q->stat_gc_flows;
- st.highprio_packets = q->stat_internal_packets;
+ st.highprio_packets = 0;
+ st.fastpath_packets = q->internal.stat_fastpath_packets;
st.tcp_retrans = 0;
st.throttled = q->stat_throttled;
st.flows_plimit = q->stat_flows_plimit;
@@ -1029,6 +1242,10 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.ce_mark = q->stat_ce_mark;
st.horizon_drops = q->stat_horizon_drops;
st.horizon_caps = q->stat_horizon_caps;
+ for (i = 0; i < FQ_BANDS; i++) {
+ st.band_drops[i] = q->stat_band_drops[i];
+ st.band_pkt_count[i] = q->band_pkt_count[i];
+ }
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -1056,7 +1273,7 @@ static int __init fq_module_init(void)
fq_flow_cachep = kmem_cache_create("fq_flow_cache",
sizeof(struct fq_flow),
- 0, 0, NULL);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!fq_flow_cachep)
return -ENOMEM;
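
The only change to fq_module_init() is the SLAB_HWCACHE_ALIGN flag, which makes the slab allocator align (and pad) each struct fq_flow to a hardware cache line, helping avoid false sharing between adjacent flow objects. A hedged kernel-style sketch of the pattern with a made-up object type (example_obj and example_cache_init are illustrative names, not part of this patch):

#include <linux/slab.h>
#include <linux/errno.h>

struct example_obj {		/* hypothetical object, stands in for fq_flow */
	unsigned long key;
	int credit;
};

static struct kmem_cache *example_cachep;

static int example_cache_init(void)
{
	example_cachep = kmem_cache_create("example_cache",
					   sizeof(struct example_obj),
					   0,			/* natural alignment */
					   SLAB_HWCACHE_ALIGN,	/* cache-line align objects */
					   NULL);		/* no constructor */
	return example_cachep ? 0 : -ENOMEM;
}
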
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index a9bd0a235890..ce63414185fd 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -96,7 +96,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
unsigned long orig_dst;
sch_frag_prepare_frag(skb, xmit);
- dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
+ dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
sch_frag_rt.dst.dev = skb->dev;
@@ -112,7 +112,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
sch_frag_prepare_frag(skb, xmit);
memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
- dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
+ dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
sch_frag_rt.dst.dev = skb->dev;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5d7e23f4cc0e..4195a4bc26ca 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -694,9 +694,10 @@ struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
-static const u8 prio2band[TC_PRIO_MAX + 1] = {
- 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
+EXPORT_SYMBOL(sch_default_prio2band);
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
@@ -721,7 +722,7 @@ static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- int band = prio2band[skb->priority & TC_PRIO_MAX];
+ int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct skb_array *q = band2list(priv, band);
unsigned int pkt_len = qdisc_pkt_len(skb);
@@ -830,7 +831,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+ memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 43f2731bf590..24368f755ab1 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -128,7 +128,6 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
{
struct sctp_association *asoc = t->asoc;
struct sock *sk = asoc->base.sk;
- struct ipv6_pinfo *np;
int err = 0;
switch (type) {
@@ -149,9 +148,8 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
break;
}
- np = inet6_sk(sk);
icmpv6_err_convert(type, code, &err);
- if (!sock_owned_by_user(sk) && np->recverr) {
+ if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
sk->sk_err = err;
sk_error_report(sk);
} else {
@@ -249,7 +247,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
rcu_read_lock();
res = ip6_xmit(sk, skb, fl6, sk->sk_mark,
rcu_dereference(np->opt),
- tclass, sk->sk_priority);
+ tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
return res;
}
@@ -298,7 +296,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
- if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+ if (inet6_test_bit(SNDFLOW, sk) &&
+ (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2185f44198de..94c6dd53cd62 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,7 +426,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct dst_entry *dst = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
- __u8 tos = inet_sk(sk)->tos;
+ u8 tos = READ_ONCE(inet_sk(sk)->tos);
if (t->dscp & SCTP_DSCP_SET_MASK)
tos = t->dscp & SCTP_DSCP_VAL_MASK;
@@ -1057,7 +1057,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
struct flowi4 *fl4 = &t->fl.u.ip4;
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
- __u8 dscp = inet->tos;
+ __u8 dscp = READ_ONCE(inet->tos);
__be16 df = 0;
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 08527d882e56..f80208edd6a5 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3303,7 +3303,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
/* Process the TLVs contained within the ASCONF chunk. */
sctp_walk_params(param, addip) {
- /* Skip preceeding address parameters. */
+ /* Skip preceding address parameters. */
if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
param.p->type == SCTP_PARAM_IPV6_ADDRESS)
continue;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index bacdd971615e..297681601414 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -493,7 +493,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
nsk->sk_sndtimeo = osk->sk_sndtimeo;
nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
nsk->sk_mark = READ_ONCE(osk->sk_mark);
- nsk->sk_priority = osk->sk_priority;
+ nsk->sk_priority = READ_ONCE(osk->sk_priority);
nsk->sk_rcvlowat = osk->sk_rcvlowat;
nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
nsk->sk_err = osk->sk_err;
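
The sk_priority conversions scattered through this series (smc here, x25 and xsk below) all follow the same data-race annotation pattern: the field may be updated concurrently with these lockless reads, so each read is wrapped in READ_ONCE() to pair with a WRITE_ONCE() on the writer side. A hedged two-line sketch of the pairing (kernel context, fragment only):

/* writer side (e.g. the SO_PRIORITY setsockopt path) */
WRITE_ONCE(sk->sk_priority, val);

/* lockless reader side (e.g. when cloning a socket or filling an skb) */
u32 prio = READ_ONCE(sk->sk_priority);
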
diff --git a/net/tipc/link.c b/net/tipc/link.c
index e33b4f29f77c..d0143823658d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1446,7 +1446,7 @@ u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l,
p = (struct tipc_gap_ack_blks *)msg_data(hdr);
sz = ntohs(p->len);
/* Sanity check */
- if (sz == struct_size(p, gacks, p->ugack_cnt + p->bgack_cnt)) {
+ if (sz == struct_size(p, gacks, size_add(p->ugack_cnt, p->bgack_cnt))) {
/* Good, check if the desired type exists */
if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt))
goto ok;
@@ -1533,7 +1533,7 @@ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr)
__tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0;
/* Total len */
- len = struct_size(ga, gacks, ga->bgack_cnt + ga->ugack_cnt);
+ len = struct_size(ga, gacks, size_add(ga->bgack_cnt, ga->ugack_cnt));
ga->len = htons(len);
return len;
}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d1fc295b83b5..270712b8d391 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1487,7 +1487,7 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov,
*/
aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
aead_size = ALIGN(aead_size, __alignof__(*dctx));
- mem = kmalloc(aead_size + struct_size(dctx, sg, n_sgin + n_sgout),
+ mem = kmalloc(aead_size + struct_size(dctx, sg, size_add(n_sgin, n_sgout)),
sk->sk_allocation);
if (!mem) {
err = -ENOMEM;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 020cf17ab7e4..013b65241b65 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1921,6 +1921,9 @@ out_err:
err = total_written;
}
out:
+ if (sk->sk_type == SOCK_STREAM)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+
release_sock(sk);
return err;
}
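
With sk_stream_error() in the return path, a failed send on a stream vsock now behaves like TCP: the caller gets EPIPE when the peer is gone, and SIGPIPE is raised unless the send opted out with MSG_NOSIGNAL. A hedged userspace sketch (assumes `fd` is an already-connected SOCK_STREAM AF_VSOCK socket):

#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

/* send without risking SIGPIPE; returns bytes sent or -1 with errno set */
static ssize_t send_nosig(int fd, const void *buf, size_t len)
{
	ssize_t n = send(fd, buf, len, MSG_NOSIGNAL);

	if (n < 0 && errno == EPIPE) {
		/* peer closed the connection: handle it here instead of
		 * letting SIGPIPE terminate the process
		 */
	}
	return n;
}
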
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..09ba3128e759 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -63,6 +63,17 @@ struct virtio_vsock {
u32 guest_cid;
bool seqpacket_allow;
+
+ /* These fields are used only in the tx path, in
+ * 'virtio_transport_send_pkt_work()'; they live here to save
+ * stack space in that function. Each pointer in 'out_sgs'
+ * points to the corresponding element of 'out_bufs' - this is
+ * set up in 'virtio_vsock_probe()'. Both fields are protected
+ * by 'tx_lock'. The +1 is for the packet header.
+ */
+ struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1];
+ struct scatterlist out_bufs[MAX_SKB_FRAGS + 1];
};
static u32 virtio_transport_get_local_cid(void)
@@ -100,8 +111,8 @@ virtio_transport_send_pkt_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];
for (;;) {
- struct scatterlist hdr, buf, *sgs[2];
int ret, in_sg = 0, out_sg = 0;
+ struct scatterlist **sgs;
struct sk_buff *skb;
bool reply;
@@ -111,12 +122,43 @@ virtio_transport_send_pkt_work(struct work_struct *work)
virtio_transport_deliver_tap_pkt(skb);
reply = virtio_vsock_skb_reply(skb);
-
- sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb)));
- sgs[out_sg++] = &hdr;
- if (skb->len > 0) {
- sg_init_one(&buf, skb->data, skb->len);
- sgs[out_sg++] = &buf;
+ sgs = vsock->out_sgs;
+ sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
+ sizeof(*virtio_vsock_hdr(skb)));
+ out_sg++;
+
+ if (!skb_is_nonlinear(skb)) {
+ if (skb->len > 0) {
+ sg_init_one(sgs[out_sg], skb->data, skb->len);
+ out_sg++;
+ }
+ } else {
+ struct skb_shared_info *si;
+ int i;
+
+ /* If the skb is nonlinear, its linear buffer must contain
+ * only the header and nothing more; the data lives in
+ * the page fragments.
+ */
+ WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb)));
+
+ si = skb_shinfo(skb);
+
+ for (i = 0; i < si->nr_frags; i++) {
+ skb_frag_t *skb_frag = &si->frags[i];
+ void *va;
+
+ /* We will use 'page_to_virt()' for the userspace page
+ * here, because virtio or dma-mapping layers will call
+ * 'virt_to_phys()' later to fill the buffer descriptor.
+ * We never touch the memory at the "virtual" address of this page.
+ */
+ va = page_to_virt(skb_frag->bv_page);
+ sg_init_one(sgs[out_sg],
+ va + skb_frag->bv_offset,
+ skb_frag->bv_len);
+ out_sg++;
+ }
}
ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
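
The rewritten tx loop above maps the packet header plus every page fragment of a nonlinear skb into its own scatterlist entry before handing them to virtqueue_add_sgs(). The driver addresses the bvec fields directly; a hedged kernel-style sketch of the same walk written with the generic skb_frag_*() accessors and sg_set_page() (the helper name is illustrative):

#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/errno.h>

/* fill one scatterlist entry per fragment of a nonlinear skb;
 * returns the number of entries used, or -ENOSPC if 'max' is too small
 */
static int skb_frags_to_sg(struct sk_buff *skb, struct scatterlist *sg, int max)
{
	struct skb_shared_info *si = skb_shinfo(skb);
	int i;

	if (si->nr_frags > max)
		return -ENOSPC;

	for (i = 0; i < si->nr_frags; i++) {
		skb_frag_t *frag = &si->frags[i];

		sg_init_table(&sg[i], 1);
		sg_set_page(&sg[i], skb_frag_page(frag),
			    skb_frag_size(frag), skb_frag_off(frag));
	}
	return si->nr_frags;
}
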
@@ -413,6 +455,37 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}
+static bool virtio_transport_can_msgzerocopy(int bufs_num)
+{
+ struct virtio_vsock *vsock;
+ bool res = false;
+
+ rcu_read_lock();
+
+ vsock = rcu_dereference(the_virtio_vsock);
+ if (vsock) {
+ struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+
+ /* Check that the tx queue is large enough to hold all the
+ * data being sent. This matters because, when there is not
+ * enough free space in the queue, the current skb is
+ * reinserted at the head of the socket's tx list to retry
+ * later; if the skb is bigger than the whole queue, it would
+ * be reinserted again and again, blocking every other skb
+ * from being sent. Each page of the user-provided buffer is
+ * added as a single buffer to the tx virtqueue, so compare
+ * the number of pages against the maximum capacity of the
+ * queue.
+ */
+ if (bufs_num <= vq->num_max)
+ res = true;
+ }
+
+ rcu_read_unlock();
+
+ return res;
+}
+
static bool virtio_transport_seqpacket_allow(u32 remote_cid);
static struct virtio_transport virtio_transport = {
@@ -462,6 +535,7 @@ static struct virtio_transport virtio_transport = {
},
.send_pkt = virtio_transport_send_pkt,
+ .can_msgzerocopy = virtio_transport_can_msgzerocopy,
};
static bool virtio_transport_seqpacket_allow(u32 remote_cid)
@@ -621,6 +695,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
{
struct virtio_vsock *vsock = NULL;
int ret;
+ int i;
ret = mutex_lock_interruptible(&the_virtio_vsock_mutex);
if (ret)
@@ -663,6 +738,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
if (ret < 0)
goto out;
+ for (i = 0; i < ARRAY_SIZE(vsock->out_sgs); i++)
+ vsock->out_sgs[i] = &vsock->out_bufs[i];
+
rcu_assign_pointer(the_virtio_vsock, vsock);
mutex_unlock(&the_virtio_vsock_mutex);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 352d042b130b..e22c81435ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -37,27 +37,89 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
return container_of(t, struct virtio_transport, transport);
}
-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
- size_t len,
- u32 src_cid,
- u32 src_port,
- u32 dst_cid,
- u32 dst_port)
-{
- const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
- struct virtio_vsock_hdr *hdr;
- struct sk_buff *skb;
- void *payload;
- int err;
+static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
+ struct virtio_vsock_pkt_info *info,
+ size_t pkt_len)
+{
+ struct iov_iter *iov_iter;
- skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
- if (!skb)
- return NULL;
+ if (!info->msg)
+ return false;
+
+ iov_iter = &info->msg->msg_iter;
+
+ if (iov_iter->iov_offset)
+ return false;
+
+ /* We can't use zerocopy unless the whole iov is being sent. */
+ if (iov_iter->count > pkt_len)
+ return false;
+
+ /* Check that transport can send data in zerocopy mode. */
+ t_ops = virtio_transport_get_ops(info->vsk);
+
+ if (t_ops->can_msgzerocopy) {
+ int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
+ int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS);
+
+ /* +1 is for packet header. */
+ return t_ops->can_msgzerocopy(pages_to_send + 1);
+ }
+
+ return true;
+}
+
+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
+ struct sk_buff *skb,
+ struct msghdr *msg,
+ bool zerocopy)
+{
+ struct ubuf_info *uarg;
+
+ if (msg->msg_ubuf) {
+ uarg = msg->msg_ubuf;
+ net_zcopy_get(uarg);
+ } else {
+ struct iov_iter *iter = &msg->msg_iter;
+ struct ubuf_info_msgzc *uarg_zc;
+
+ uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+ iter->count,
+ NULL);
+ if (!uarg)
+ return -1;
+
+ uarg_zc = uarg_to_msgzc(uarg);
+ uarg_zc->zerocopy = zerocopy ? 1 : 0;
+ }
+
+ skb_zcopy_init(skb, uarg);
+
+ return 0;
+}
+
+static int virtio_transport_fill_skb(struct sk_buff *skb,
+ struct virtio_vsock_pkt_info *info,
+ size_t len,
+ bool zcopy)
+{
+ if (zcopy)
+ return __zerocopy_sg_from_iter(info->msg, NULL, skb,
+ &info->msg->msg_iter,
+ len);
+
+ return memcpy_from_msg(skb_put(skb, len), info->msg, len);
+}
+
+static void virtio_transport_init_hdr(struct sk_buff *skb,
+ struct virtio_vsock_pkt_info *info,
+ size_t payload_len,
+ u32 src_cid,
+ u32 src_port,
+ u32 dst_cid,
+ u32 dst_port)
+{
+ struct virtio_vsock_hdr *hdr;
hdr = virtio_vsock_hdr(skb);
hdr->type = cpu_to_le16(info->type);
@@ -67,43 +129,28 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
hdr->src_port = cpu_to_le32(src_port);
hdr->dst_port = cpu_to_le32(dst_port);
hdr->flags = cpu_to_le32(info->flags);
- hdr->len = cpu_to_le32(len);
-
- if (info->msg && len > 0) {
- payload = skb_put(skb, len);
- err = memcpy_from_msg(payload, info->msg, len);
- if (err)
- goto out;
-
- if (msg_data_left(info->msg) == 0 &&
- info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
- hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
-
- if (info->msg->msg_flags & MSG_EOR)
- hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
- }
- }
+ hdr->len = cpu_to_le32(payload_len);
+}
- if (info->reply)
- virtio_vsock_skb_set_reply(skb);
+static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb,
+ void *dst,
+ size_t len)
+{
+ struct iov_iter iov_iter = { 0 };
+ struct kvec kvec;
+ size_t to_copy;
- trace_virtio_transport_alloc_pkt(src_cid, src_port,
- dst_cid, dst_port,
- len,
- info->type,
- info->op,
- info->flags);
+ kvec.iov_base = dst;
+ kvec.iov_len = len;
- if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
- WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
- goto out;
- }
+ iov_iter.iter_type = ITER_KVEC;
+ iov_iter.kvec = &kvec;
+ iov_iter.nr_segs = 1;
- return skb;
+ to_copy = min_t(size_t, len, skb->len);
-out:
- kfree_skb(skb);
- return NULL;
+ skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+ &iov_iter, to_copy);
}
/* Packet capture */
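
virtio_transport_copy_nonlinear_skb() above open-codes an ITER_KVEC by filling the iov_iter fields by hand. The same setup can be expressed with the iov_iter_kvec() helper; a hedged sketch of the equivalent, assuming a kernel where ITER_DEST is available (v6.2 or later):

#include <linux/uio.h>
#include <linux/skbuff.h>

/* copy up to 'len' bytes out of 'skb', starting at 'offset', into 'dst';
 * the caller guarantees offset <= skb->len
 */
static int copy_skb_to_buf(const struct sk_buff *skb, int offset,
			   void *dst, size_t len)
{
	struct kvec kvec = { .iov_base = dst, .iov_len = len };
	struct iov_iter to;

	iov_iter_kvec(&to, ITER_DEST, &kvec, 1, len);
	return skb_copy_datagram_iter(skb, offset, &to,
				      min_t(size_t, len, skb->len - offset));
}
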
@@ -114,7 +161,6 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
struct af_vsockmon_hdr *hdr;
struct sk_buff *skb;
size_t payload_len;
- void *payload_buf;
/* A packet could be split to fit the RX buffer, so we can retrieve
* the payload length from the header and the buffer pointer taking
@@ -122,7 +168,6 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
*/
pkt_hdr = virtio_vsock_hdr(pkt);
payload_len = pkt->len;
- payload_buf = pkt->data;
skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len,
GFP_ATOMIC);
@@ -165,7 +210,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr));
if (payload_len) {
- skb_put_data(skb, payload_buf, payload_len);
+ if (skb_is_nonlinear(pkt)) {
+ void *data = skb_put(skb, payload_len);
+
+ virtio_transport_copy_nonlinear_skb(pkt, data, payload_len);
+ } else {
+ skb_put_data(skb, pkt->data, payload_len);
+ }
}
return skb;
@@ -189,6 +240,82 @@ static u16 virtio_transport_get_type(struct sock *sk)
return VIRTIO_VSOCK_TYPE_SEQPACKET;
}
+/* Returns new sk_buff on success, otherwise returns NULL. */
+static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
+ size_t payload_len,
+ bool zcopy,
+ u32 src_cid,
+ u32 src_port,
+ u32 dst_cid,
+ u32 dst_port)
+{
+ struct vsock_sock *vsk;
+ struct sk_buff *skb;
+ size_t skb_len;
+
+ skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
+
+ if (!zcopy)
+ skb_len += payload_len;
+
+ skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ virtio_transport_init_hdr(skb, info, payload_len, src_cid, src_port,
+ dst_cid, dst_port);
+
+ vsk = info->vsk;
+
+ /* If 'vsk' != NULL then payload is always present, so we
+ * will never call '__zerocopy_sg_from_iter()' below without
+ * setting skb owner in 'skb_set_owner_w()'. The only case
+ * when 'vsk' == NULL is VIRTIO_VSOCK_OP_RST control message
+ * without payload.
+ */
+ WARN_ON_ONCE(!(vsk && (info->msg && payload_len)) && zcopy);
+
+ /* Set owner here, because '__zerocopy_sg_from_iter()' uses
+ * owner of skb without check to update 'sk_wmem_alloc'.
+ */
+ if (vsk)
+ skb_set_owner_w(skb, sk_vsock(vsk));
+
+ if (info->msg && payload_len > 0) {
+ int err;
+
+ err = virtio_transport_fill_skb(skb, info, payload_len, zcopy);
+ if (err)
+ goto out;
+
+ if (msg_data_left(info->msg) == 0 &&
+ info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+
+ hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+
+ if (info->msg->msg_flags & MSG_EOR)
+ hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+ }
+ }
+
+ if (info->reply)
+ virtio_vsock_skb_set_reply(skb);
+
+ trace_virtio_transport_alloc_pkt(src_cid, src_port,
+ dst_cid, dst_port,
+ payload_len,
+ info->type,
+ info->op,
+ info->flags,
+ zcopy);
+
+ return skb;
+out:
+ kfree_skb(skb);
+ return NULL;
+}
+
/* This function can only be used on connecting/connected sockets,
* since a socket assigned to a transport is required.
*
@@ -197,10 +324,12 @@ static u16 virtio_transport_get_type(struct sock *sk)
static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info *info)
{
+ u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
u32 src_cid, src_port, dst_cid, dst_port;
const struct virtio_transport *t_ops;
struct virtio_vsock_sock *vvs;
u32 pkt_len = info->pkt_len;
+ bool can_zcopy = false;
u32 rest_len;
int ret;
@@ -229,15 +358,30 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
return pkt_len;
+ if (info->msg) {
+ /* If zerocopy is not enabled by 'setsockopt()', we behave as
+ * if the MSG_ZEROCOPY flag were not set.
+ */
+ if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
+ info->msg->msg_flags &= ~MSG_ZEROCOPY;
+
+ if (info->msg->msg_flags & MSG_ZEROCOPY)
+ can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len);
+
+ if (can_zcopy)
+ max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
+ (MAX_SKB_FRAGS * PAGE_SIZE));
+ }
+
rest_len = pkt_len;
do {
struct sk_buff *skb;
size_t skb_len;
- skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
+ skb_len = min(max_skb_len, rest_len);
- skb = virtio_transport_alloc_skb(info, skb_len,
+ skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy,
src_cid, src_port,
dst_cid, dst_port);
if (!skb) {
@@ -245,6 +389,21 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
break;
}
+ /* We process the buffer part by part, allocating an skb on
+ * each iteration. If this is the last skb for this buffer
+ * and MSG_ZEROCOPY mode is in use, we must allocate a
+ * completion for the current syscall.
+ */
+ if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
+ skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) {
+ if (virtio_transport_init_zcopy_skb(vsk, skb,
+ info->msg,
+ can_zcopy)) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
virtio_transport_inc_tx_pkt(vvs, skb);
ret = t_ops->send_pkt(skb);
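
The MSG_ZEROCOPY handling above is opt-in twice: the socket must have SO_ZEROCOPY enabled, and each send must pass MSG_ZEROCOPY; the completion is later reaped from the socket error queue, as with the existing MSG_ZEROCOPY interface on TCP/UDP. A hedged userspace sketch of the calling convention (error-queue parsing is elided, `fd` is assumed to be a connected vsock stream socket, and SO_ZEROCOPY/MSG_ZEROCOPY require reasonably recent uapi headers):

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int send_zerocopy(int fd, const void *buf, size_t len)
{
	struct msghdr msg = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	int one = 1;

	/* 1. enable zerocopy on the socket (once per socket) */
	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
		return -1;

	/* 2. request zerocopy for this particular send */
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	if (sendmsg(fd, &msg, MSG_ZEROCOPY) < 0)
		return -1;

	/* 3. later, reap the completion from the error queue; the buffer
	 *    must stay untouched until the completion arrives
	 */
	memset(&msg, 0, sizeof(msg));
	return recvmsg(fd, &msg, MSG_ERRQUEUE);
}
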
@@ -364,9 +523,10 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
spin_unlock_bh(&vvs->rx_lock);
/* sk_lock is held by caller so no one else can dequeue.
- * Unlock rx_lock since memcpy_to_msg() may sleep.
+ * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
*/
- err = memcpy_to_msg(msg, skb->data, bytes);
+ err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+ &msg->msg_iter, bytes);
if (err)
goto out;
@@ -410,25 +570,27 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
while (total < len && !skb_queue_empty(&vvs->rx_queue)) {
skb = skb_peek(&vvs->rx_queue);
- bytes = len - total;
- if (bytes > skb->len)
- bytes = skb->len;
+ bytes = min_t(size_t, len - total,
+ skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset);
/* sk_lock is held by caller so no one else can dequeue.
- * Unlock rx_lock since memcpy_to_msg() may sleep.
+ * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
*/
spin_unlock_bh(&vvs->rx_lock);
- err = memcpy_to_msg(msg, skb->data, bytes);
+ err = skb_copy_datagram_iter(skb,
+ VIRTIO_VSOCK_SKB_CB(skb)->offset,
+ &msg->msg_iter, bytes);
if (err)
goto out;
spin_lock_bh(&vvs->rx_lock);
total += bytes;
- skb_pull(skb, bytes);
- if (skb->len == 0) {
+ VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes;
+
+ if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) {
u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
virtio_transport_dec_rx_pkt(vvs, pkt_len);
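
Rather than skb_pull()ing consumed bytes, the stream receive path now keeps a per-skb read offset in VIRTIO_VSOCK_SKB_CB(skb)->offset and frees the skb only once the offset reaches skb->len, leaving the (possibly nonlinear) skb untouched while it is partially read. A standalone sketch of the same cursor idea:

#include <stdio.h>
#include <string.h>

struct rxbuf {
	const char *data;
	size_t len;
	size_t off;	/* bytes already handed to the reader */
};

/* copy up to 'want' unread bytes; returns how many were actually copied */
static size_t rxbuf_read(struct rxbuf *b, char *dst, size_t want)
{
	size_t avail = b->len - b->off;
	size_t n = want < avail ? want : avail;

	memcpy(dst, b->data + b->off, n);
	b->off += n;
	return n;	/* caller releases the buffer once b->off == b->len */
}

int main(void)
{
	struct rxbuf b = { .data = "hello, world", .len = 12, .off = 0 };
	char out[6];
	size_t n;

	while ((n = rxbuf_read(&b, out, sizeof(out) - 1)) > 0) {
		out[n] = '\0';
		printf("read %zu bytes: \"%s\"\n", n, out);
	}
	return 0;
}
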
@@ -492,9 +654,10 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
spin_unlock_bh(&vvs->rx_lock);
/* sk_lock is held by caller so no one else can dequeue.
- * Unlock rx_lock since memcpy_to_msg() may sleep.
+ * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
*/
- err = memcpy_to_msg(msg, skb->data, bytes);
+ err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+ &msg->msg_iter, bytes);
if (err)
return err;
@@ -553,11 +716,13 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
int err;
/* sk_lock is held by caller so no one else can dequeue.
- * Unlock rx_lock since memcpy_to_msg() may sleep.
+ * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
*/
spin_unlock_bh(&vvs->rx_lock);
- err = memcpy_to_msg(msg, skb->data, bytes_to_copy);
+ err = skb_copy_datagram_iter(skb, 0,
+ &msg->msg_iter,
+ bytes_to_copy);
if (err) {
/* Copy of message failed. Rest of
* fragments will be freed without copy.
@@ -954,7 +1119,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
if (!t)
return -ENOTCONN;
- reply = virtio_transport_alloc_skb(&info, 0,
+ reply = virtio_transport_alloc_skb(&info, 0, false,
le64_to_cpu(hdr->dst_cid),
le32_to_cpu(hdr->dst_port),
le64_to_cpu(hdr->src_cid),
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 0fb5143bec7a..aad8ffeaee04 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -598,7 +598,7 @@ static struct sock *x25_make_new(struct sock *osk)
x25 = x25_sk(sk);
sk->sk_type = osk->sk_type;
- sk->sk_priority = osk->sk_priority;
+ sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 55f8b9b0e06d..f5e96e0d6e01 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -684,7 +684,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
skb->dev = dev;
- skb->priority = xs->sk.sk_priority;
+ skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
xsk_set_destructor_arg(skb);
@@ -1228,7 +1228,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->dev = dev;
xs->zc = xs->umem->zc;
- xs->sg = !!(flags & XDP_USE_SG);
+ xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
xs->queue_id = qid;
xp_add_xsk(xs->pool, xs);
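
The two xsk hunks change how multi-buffer support is tracked: the XDP_USE_SG bind flag is now recorded on the umem as XDP_UMEM_SG_FLAG and read back from there, so the setting follows the umem rather than each individual bind's flag word. A hedged userspace sketch of requesting SG frames at bind time (ifindex and queue id are placeholders; XDP_USE_SG needs uapi headers from v6.6 or later):

#include <sys/socket.h>
#include <linux/if_xdp.h>
#include <string.h>

static int bind_xsk_sg(int xsk_fd, unsigned int ifindex, unsigned int queue_id)
{
	struct sockaddr_xdp sxdp;

	memset(&sxdp, 0, sizeof(sxdp));
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = ifindex;
	sxdp.sxdp_queue_id = queue_id;
	/* request multi-buffer (scatter-gather) frames for this socket */
	sxdp.sxdp_flags = XDP_USE_SG | XDP_USE_NEED_WAKEUP;

	return bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}
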
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index b3f7b310811e..49cb9f9a09be 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -170,6 +170,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
if (err)
return err;
+ if (flags & XDP_USE_SG)
+ pool->umem->flags |= XDP_UMEM_SG_FLAG;
+
if (flags & XDP_USE_NEED_WAKEUP)
pool->uses_need_wakeup = true;
/* Tx needs to be explicitly woken up the first time. Also
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d6b405782b63..c4c4fc29ccf5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2561,7 +2561,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
default:
BUG();
}
- xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+ xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0);
if (likely(xdst)) {
memset_after(xdst, 0, u.dst);