Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile              1
-rw-r--r--  net/core/datagram.c           75
-rw-r--r--  net/core/dev.c               211
-rw-r--r--  net/core/dev.h                 9
-rw-r--r--  net/core/dev_ioctl.c           2
-rw-r--r--  net/core/dst.c                10
-rw-r--r--  net/core/filter.c             83
-rw-r--r--  net/core/flow_dissector.c      2
-rw-r--r--  net/core/gso_test.c          278
-rw-r--r--  net/core/neighbour.c          81
-rw-r--r--  net/core/netclassid_cgroup.c   6
-rw-r--r--  net/core/netdev-genl.c        12
-rw-r--r--  net/core/page_pool.c          31
-rw-r--r--  net/core/pktgen.c            116
-rw-r--r--  net/core/rtnetlink.c         156
-rw-r--r--  net/core/selftests.c           9
-rw-r--r--  net/core/skbuff.c             67
-rw-r--r--  net/core/sock.c              228
-rw-r--r--  net/core/sock_map.c            4
-rw-r--r--  net/core/stream.c             12
-rw-r--r--  net/core/xdp.c                 4
21 files changed, 1082 insertions, 315 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 731db2eaa610..0cb734cbc24b 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
+obj-$(CONFIG_NET_TEST) += gso_test.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 176eb5834746..103d46fa0eeb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -50,7 +50,7 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
-#include <linux/uio.h>
+#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
@@ -61,6 +61,7 @@
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
+#include <crypto/hash.h>
/*
* Is a socket 'connection oriented' ?
@@ -489,6 +490,24 @@ short_copy:
return 0;
}
+static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+ struct iov_iter *i)
+{
+#ifdef CONFIG_CRYPTO_HASH
+ struct ahash_request *hash = hashp;
+ struct scatterlist sg;
+ size_t copied;
+
+ copied = copy_to_iter(addr, bytes, i);
+ sg_init_one(&sg, addr, copied);
+ ahash_request_set_crypt(hash, &sg, NULL, copied);
+ crypto_ahash_update(hash);
+ return copied;
+#else
+ return 0;
+#endif
+}
+
/**
* skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
* and update a hash.
@@ -716,6 +735,60 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
+static __always_inline
+size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+ __wsum next, *csum = priv2;
+
+ next = csum_and_copy_to_user(from + progress, iter_to, len);
+ *csum = csum_block_add(*csum, next, progress);
+ return next ? 0 : len;
+}
+
+static __always_inline
+size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+ __wsum *csum = priv2;
+ __wsum next = csum_partial_copy_nocheck(from, iter_to, len);
+
+ *csum = csum_block_add(*csum, next, progress);
+ return 0;
+}
+
+struct csum_state {
+ __wsum csum;
+ size_t off;
+};
+
+static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
+ struct iov_iter *i)
+{
+ struct csum_state *csstate = _csstate;
+ __wsum sum;
+
+ if (WARN_ON_ONCE(i->data_source))
+ return 0;
+ if (unlikely(iov_iter_is_discard(i))) {
+ // can't use csum_memcpy() for that one - data is not copied
+ csstate->csum = csum_block_add(csstate->csum,
+ csum_partial(addr, bytes, 0),
+ csstate->off);
+ csstate->off += bytes;
+ return bytes;
+ }
+
+ sum = csum_shift(csstate->csum, csstate->off);
+
+ bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
+ copy_to_user_iter_csum,
+ memcpy_to_iter_csum);
+ csstate->csum = csum_shift(sum, csstate->off);
+ csstate->off += bytes;
+ return bytes;
+}
+
/**
* skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
* and update a checksum.
diff --git a/net/core/dev.c b/net/core/dev.c
index ccff2b6ef958..0d548431f3fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -69,7 +69,7 @@
*/
#include <linux/uaccess.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
@@ -345,7 +345,6 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name)
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
list_del(&name_node->list);
- netdev_name_node_del(name_node);
kfree(name_node->name);
netdev_name_node_free(name_node);
}
@@ -364,6 +363,8 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
if (name_node == dev->name_node || name_node->dev != dev)
return -EINVAL;
+ netdev_name_node_del(name_node);
+ synchronize_rcu();
__netdev_name_node_alt_destroy(name_node);
return 0;
@@ -380,6 +381,7 @@ static void netdev_name_node_alt_flush(struct net_device *dev)
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
+ struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
ASSERT_RTNL();
@@ -390,6 +392,10 @@ static void list_netdevice(struct net_device *dev)
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock(&dev_base_lock);
+
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_add(net, name_node);
+
/* We reserved the ifindex, this can't fail */
WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
@@ -401,12 +407,16 @@ static void list_netdevice(struct net_device *dev)
*/
static void unlist_netdevice(struct net_device *dev, bool lock)
{
+ struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
ASSERT_RTNL();
xa_erase(&net->dev_by_index, dev->ifindex);
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_del(name_node);
+
/* Unlink dev from the device chain */
if (lock)
write_lock(&dev_base_lock);
@@ -1047,7 +1057,7 @@ EXPORT_SYMBOL(dev_valid_name);
* __dev_alloc_name - allocate a name for a device
* @net: network namespace to allocate the device name in
* @name: name format string
- * @buf: scratch buffer and result name string
+ * @res: result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
@@ -1058,83 +1068,77 @@ EXPORT_SYMBOL(dev_valid_name);
* Returns the number of the unit assigned or a negative errno code.
*/
-static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
int i = 0;
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
unsigned long *inuse;
struct net_device *d;
+ char buf[IFNAMSIZ];
- if (!dev_valid_name(name))
- return -EINVAL;
-
+ /* Verify the string as this thing may have come from the user.
+ * There must be one "%d" and no other "%" characters.
+ */
p = strchr(name, '%');
- if (p) {
- /*
- * Verify the string as this thing may have come from
- * the user. There must be either one "%d" and no other "%"
- * characters.
- */
- if (p[1] != 'd' || strchr(p + 2, '%'))
- return -EINVAL;
+ if (!p || p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
- /* Use one page as a bit array of possible slots */
- inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
- if (!inuse)
- return -ENOMEM;
+ /* Use one page as a bit array of possible slots */
+ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
- for_each_netdev(net, d) {
- struct netdev_name_node *name_node;
- list_for_each_entry(name_node, &d->name_node->list, list) {
- if (!sscanf(name_node->name, name, &i))
- continue;
- if (i < 0 || i >= max_netdevices)
- continue;
+ for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
- /* avoid cases where sscanf is not exact inverse of printf */
- snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, name_node->name, IFNAMSIZ))
- __set_bit(i, inuse);
- }
- if (!sscanf(d->name, name, &i))
+ netdev_for_each_altname(d, name_node) {
+ if (!sscanf(name_node->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
- /* avoid cases where sscanf is not exact inverse of printf */
+ /* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, d->name, IFNAMSIZ))
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
__set_bit(i, inuse);
}
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
- i = find_first_zero_bit(inuse, max_netdevices);
- free_page((unsigned long) inuse);
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
- snprintf(buf, IFNAMSIZ, name, i);
- if (!netdev_name_in_use(net, buf))
- return i;
+ i = find_first_zero_bit(inuse, max_netdevices);
+ bitmap_free(inuse);
+ if (i == max_netdevices)
+ return -ENFILE;
- /* It is possible to run out of possible slots
- * when the name is long and there isn't enough space left
- * for the digits, or if all bits are used.
- */
- return -ENFILE;
+ snprintf(res, IFNAMSIZ, name, i);
+ return i;
}
-static int dev_alloc_name_ns(struct net *net,
- struct net_device *dev,
- const char *name)
+/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
+static int dev_prep_valid_name(struct net *net, struct net_device *dev,
+ const char *want_name, char *out_name,
+ int dup_errno)
{
- char buf[IFNAMSIZ];
- int ret;
+ if (!dev_valid_name(want_name))
+ return -EINVAL;
- BUG_ON(!net);
- ret = __dev_alloc_name(net, name, buf);
- if (ret >= 0)
- strscpy(dev->name, buf, IFNAMSIZ);
- return ret;
+ if (strchr(want_name, '%'))
+ return __dev_alloc_name(net, want_name, out_name);
+
+ if (netdev_name_in_use(net, want_name))
+ return -dup_errno;
+ if (out_name != want_name)
+ strscpy(out_name, want_name, IFNAMSIZ);
+ return 0;
}
/**
@@ -1153,26 +1157,17 @@ static int dev_alloc_name_ns(struct net *net,
int dev_alloc_name(struct net_device *dev, const char *name)
{
- return dev_alloc_name_ns(dev_net(dev), dev, name);
+ return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);
static int dev_get_valid_name(struct net *net, struct net_device *dev,
const char *name)
{
- BUG_ON(!net);
-
- if (!dev_valid_name(name))
- return -EINVAL;
-
- if (strchr(name, '%'))
- return dev_alloc_name_ns(net, dev, name);
- else if (netdev_name_in_use(net, name))
- return -EEXIST;
- else if (dev->name != name)
- strscpy(dev->name, name, IFNAMSIZ);
+ int ret;
- return 0;
+ ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
+ return ret < 0 ? ret : 0;
}
/**
@@ -3292,15 +3287,19 @@ int skb_checksum_help(struct sk_buff *skb)
offset = skb_checksum_start_offset(skb);
ret = -EINVAL;
- if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
+ if (unlikely(offset >= skb_headlen(skb))) {
DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
+ offset, skb_headlen(skb));
goto out;
}
csum = skb_checksum(skb, offset, skb->len - offset, 0);
offset += skb->csum_offset;
- if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) {
+ if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
+ offset + sizeof(__sum16), skb_headlen(skb));
goto out;
}
ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
@@ -3910,7 +3909,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+ enum skb_drop_reason *drop_reason)
{
int ret = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
@@ -3922,12 +3922,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
+ res.drop_reason = *drop_reason;
mini_qdisc_bstats_cpu_update(miniq, skb);
ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
/* Only tcf related quirks below. */
switch (ret) {
case TC_ACT_SHOT:
+ *drop_reason = res.drop_reason;
mini_qdisc_qstats_cpu_drop(miniq);
break;
case TC_ACT_OK:
@@ -3977,6 +3979,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev, bool *another)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
int sch_ret;
if (!entry)
@@ -3994,7 +3997,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
if (sch_ret != TC_ACT_UNSPEC)
goto ingress_verdict;
}
- sch_ret = tc_run(tcx_entry(entry), skb);
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
switch (sch_ret) {
case TC_ACT_REDIRECT:
@@ -4011,7 +4014,7 @@ ingress_verdict:
*ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
return NULL;
/* used by tc_run */
@@ -4032,6 +4035,7 @@ static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
int sch_ret;
if (!entry)
@@ -4045,7 +4049,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
if (sch_ret != TC_ACT_UNSPEC)
goto egress_verdict;
}
- sch_ret = tc_run(tcx_entry(entry), skb);
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
switch (sch_ret) {
case TC_ACT_REDIRECT:
@@ -4054,7 +4058,7 @@ egress_verdict:
*ret = NET_XMIT_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
return NULL;
/* used by tc_run */
@@ -6523,9 +6527,11 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ if (napi_is_scheduled(n)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
+
+ xdp_do_check_flushed(n);
}
if (unlikely(work > weight))
@@ -9023,6 +9029,28 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
+static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+ rtnl_lock();
+ dev->dpll_pin = dpll_pin;
+ rtnl_unlock();
+#endif
+}
+
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+ WARN_ON(!dpll_pin);
+ netdev_dpll_pin_assign(dev, dpll_pin);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_set);
+
+void netdev_dpll_pin_clear(struct net_device *dev)
+{
+ netdev_dpll_pin_assign(dev, NULL);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_clear);
+
/**
* dev_change_proto_down - set carrier according to proto_down.
*
@@ -10475,7 +10503,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+ struct net_device *dev)
{
struct net_device_core_stats __percpu *p;
@@ -10488,7 +10517,23 @@ struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device
/* This READ_ONCE() pairs with the cmpxchg() above */
return READ_ONCE(dev->core_stats);
}
-EXPORT_SYMBOL(netdev_core_stats_alloc);
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+ unsigned long __percpu *field;
+
+ if (unlikely(!p)) {
+ p = netdev_core_stats_alloc(dev);
+ if (!p)
+ return;
+ }
+
+ field = (__force unsigned long __percpu *)((__force void *)p + offset);
+ this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
/**
* dev_get_stats - get network device statistics
@@ -11033,7 +11078,9 @@ EXPORT_SYMBOL(unregister_netdev);
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
const char *pat, int new_ifindex)
{
+ struct netdev_name_node *name_node;
struct net *net_old = dev_net(dev);
+ char new_name[IFNAMSIZ] = {};
int err, new_nsid;
ASSERT_RTNL();
@@ -11060,10 +11107,15 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* We get here if we can't use the current device name */
if (!pat)
goto out;
- err = dev_get_valid_name(net, dev, pat);
+ err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
if (err < 0)
goto out;
}
+ /* Check that none of the altnames conflicts. */
+ err = -EEXIST;
+ netdev_for_each_altname(dev, name_node)
+ if (netdev_name_in_use(net, name_node->name))
+ goto out;
/* Check that new_ifindex isn't used yet. */
if (new_ifindex) {
@@ -11131,6 +11183,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
+ if (new_name[0]) /* Rename the netdev to prepared name */
+ strscpy(dev->name, new_name, IFNAMSIZ);
+
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
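
The altname handling above pairs with the user-visible interface exposed through rtnetlink. A hedged illustration using iproute2; the device and namespace names are made up for the example and are not part of this patch:

    # altnames now move (and are conflict-checked) with the device across namespaces
    ip link property add dev eth0 altname uplink0
    ip netns add blue
    ip link set dev eth0 netns blue   # fails with EEXIST if 'blue' already uses 'uplink0'
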
diff --git a/net/core/dev.h b/net/core/dev.h
index e075e198092c..5aa45f0fd4ae 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -62,6 +62,9 @@ struct netdev_name_node {
int netdev_get_name(struct net *net, char *name, int ifindex);
int dev_change_name(struct net_device *dev, const char *newname);
+#define netdev_for_each_altname(dev, namenode) \
+ list_for_each_entry((namenode), &(dev)->name_node->list, list)
+
int netdev_name_node_alt_create(struct net_device *dev, const char *name);
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
@@ -136,4 +139,10 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
}
int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
#endif
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b46aedc36939..feeddf95f450 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -382,7 +382,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
if (err)
return err;
- err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
+ err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack);
if (err) {
if (extack._msg)
netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/core/dst.c b/net/core/dst.c
index 980e2fd2f013..6838d3212c37 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,7 +45,7 @@ const struct dst_metrics dst_default_metrics = {
EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- struct net_device *dev, int initial_ref, int initial_obsolete,
+ struct net_device *dev, int initial_obsolete,
unsigned short flags)
{
dst->dev = dev;
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->tclassid = 0;
#endif
dst->lwtstate = NULL;
- rcuref_init(&dst->__rcuref, initial_ref);
+ rcuref_init(&dst->__rcuref, 1);
INIT_LIST_HEAD(&dst->rt_uncached);
dst->__use = 0;
dst->lastuse = jiffies;
@@ -77,7 +77,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
EXPORT_SYMBOL(dst_init);
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+ int initial_obsolete, unsigned short flags)
{
struct dst_entry *dst;
@@ -90,7 +90,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
if (!dst)
return NULL;
- dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+ dst_init(dst, ops, dev, initial_obsolete, flags);
return dst;
}
@@ -270,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst,
struct dst_entry *dst;
dst = &md_dst->dst;
- dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
+ dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
DST_METADATA | DST_NOCOUNT);
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
md_dst->type = type;
diff --git a/net/core/filter.c b/net/core/filter.c
index a094694899c9..21d75108c2e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -81,6 +81,9 @@
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
+#include <linux/un.h>
+
+#include "dev.h"
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -4207,6 +4210,20 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
+{
+ bool ret;
+
+ ret = dev_check_flush();
+ ret |= cpu_map_check_flush();
+ ret |= xsk_map_check_flush();
+
+ WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+ napi->poll);
+}
+#endif
+
void bpf_clear_redirect_map(struct bpf_map *map)
{
struct bpf_redirect_info *ri;
@@ -5850,6 +5867,9 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
params->rt_metric = res.fi->fib_priority;
params->ifindex = dev->ifindex;
+ if (flags & BPF_FIB_LOOKUP_SRC)
+ params->ipv4_src = fib_result_prefsrc(net, &res);
+
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
*/
@@ -5992,6 +6012,18 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
params->rt_metric = res.f6i->fib6_metric;
params->ifindex = dev->ifindex;
+ if (flags & BPF_FIB_LOOKUP_SRC) {
+ if (res.f6i->fib6_prefsrc.plen) {
+ *src = res.f6i->fib6_prefsrc.addr;
+ } else {
+ err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
+ &fl6.daddr, 0,
+ src);
+ if (err)
+ return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
+ }
+ }
+
if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
goto set_fwd_params;
@@ -6010,7 +6042,8 @@ set_fwd_params:
#endif
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
- BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID)
+ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
+ BPF_FIB_LOOKUP_SRC)
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
@@ -7858,14 +7891,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return &bpf_sock_addr_setsockopt_proto;
default:
return NULL;
@@ -7876,14 +7914,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return &bpf_sock_addr_getsockopt_proto;
default:
return NULL;
@@ -8931,8 +8974,8 @@ static bool sock_addr_is_valid_access(int off, int size,
if (off % size != 0)
return false;
- /* Disallow access to IPv6 fields from IPv4 contex and vise
- * versa.
+ /* Disallow access to fields not belonging to the attach type's address
+ * family.
*/
switch (off) {
case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
@@ -11752,6 +11795,27 @@ __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
return 0;
}
+
+__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
+ const u8 *sun_path, u32 sun_path__sz)
+{
+ struct sockaddr_un *un;
+
+ if (sa_kern->sk->sk_family != AF_UNIX)
+ return -EINVAL;
+
+ /* We do not allow changing the address to unnamed or larger than the
+ * maximum allowed address size for a unix sockaddr.
+ */
+ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
+ return -EINVAL;
+
+ un = (struct sockaddr_un *)sa_kern->uaddr;
+ memcpy(un->sun_path, sun_path, sun_path__sz);
+ sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
+
+ return 0;
+}
__diag_pop();
int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11776,6 +11840,10 @@ BTF_SET8_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
BTF_SET8_END(bpf_kfunc_check_set_xdp)
+BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
+BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
+BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
+
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
@@ -11786,6 +11854,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
.set = &bpf_kfunc_check_set_xdp,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_addr,
+};
+
static int __init bpf_kfunc_init(void)
{
int ret;
@@ -11800,7 +11873,9 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ &bpf_kfunc_set_sock_addr);
}
late_initcall(bpf_kfunc_init);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b3b3af0e7844..272f09251343 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1446,7 +1446,7 @@ proto_again:
break;
}
- nhoff += ntohs(hdr->message_length);
+ nhoff += sizeof(struct ptp_header);
fdret = FLOW_DISSECT_RET_OUT_GOOD;
break;
}
diff --git a/net/core/gso_test.c b/net/core/gso_test.c
new file mode 100644
index 000000000000..ceb684be4cbf
--- /dev/null
+++ b/net/core/gso_test.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+#include <linux/skbuff.h>
+
+static const char hdr[] = "abcdefgh";
+#define GSO_TEST_SIZE 1000
+
+static void __init_skb(struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ memcpy(skb_mac_header(skb), hdr, sizeof(hdr));
+
+ /* skb_segment expects skb->data at start of payload */
+ skb_pull(skb, sizeof(hdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ /* proto is arbitrary, as long as not ETH_P_TEB or vlan */
+ skb->protocol = htons(ETH_P_ATALK);
+ skb_shinfo(skb)->gso_size = GSO_TEST_SIZE;
+}
+
+enum gso_test_nr {
+ GSO_TEST_LINEAR,
+ GSO_TEST_NO_GSO,
+ GSO_TEST_FRAGS,
+ GSO_TEST_FRAGS_PURE,
+ GSO_TEST_GSO_PARTIAL,
+ GSO_TEST_FRAG_LIST,
+ GSO_TEST_FRAG_LIST_PURE,
+ GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ GSO_TEST_GSO_BY_FRAGS,
+};
+
+struct gso_test_case {
+ enum gso_test_nr id;
+ const char *name;
+
+ /* input */
+ unsigned int linear_len;
+ unsigned int nr_frags;
+ const unsigned int *frags;
+ unsigned int nr_frag_skbs;
+ const unsigned int *frag_skbs;
+
+ /* output as expected */
+ unsigned int nr_segs;
+ const unsigned int *segs;
+};
+
+static struct gso_test_case cases[] = {
+ {
+ .id = GSO_TEST_NO_GSO,
+ .name = "no_gso",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_segs = 1,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_LINEAR,
+ .name = "linear",
+ .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1,
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS,
+ .name = "frags",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS_PURE,
+ .name = "frags_pure",
+ .nr_frags = 3,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ },
+ {
+ .id = GSO_TEST_GSO_PARTIAL,
+ .name = "gso_partial",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 89319d3801d1: frag_list on mss boundaries */
+ .id = GSO_TEST_FRAG_LIST,
+ .name = "frag_list",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_FRAG_LIST_PURE,
+ .name = "frag_list_pure",
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ /* commit 43170c4e0ba7: GRO of frag_list trains */
+ .id = GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ .name = "frag_list_non_uniform",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and
+ * commit 90017accff61 ("sctp: Add GSO support")
+ *
+ * "there will be a cover skb with protocol headers and
+ * children ones containing the actual segments"
+ */
+ .id = GSO_TEST_GSO_BY_FRAGS,
+ .name = "gso_by_frags",
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { 100, 200, 300, 400 },
+ },
+};
+
+static void gso_test_case_to_desc(struct gso_test_case *t, char *desc)
+{
+ sprintf(desc, "%s", t->name);
+}
+
+KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc);
+
+static void gso_test_func(struct kunit *test)
+{
+ const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct sk_buff *skb, *segs, *cur, *next, *last;
+ const struct gso_test_case *tcase;
+ netdev_features_t features;
+ struct page *page;
+ int i;
+
+ tcase = test->param_value;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, skb);
+ __skb_put(skb, sizeof(hdr) + tcase->linear_len);
+
+ __init_skb(skb);
+
+ if (tcase->nr_frags) {
+ unsigned int pg_off = 0;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ page_ref_add(page, tcase->nr_frags - 1);
+
+ for (i = 0; i < tcase->nr_frags; i++) {
+ skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]);
+ pg_off += tcase->frags[i];
+ }
+
+ KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE);
+
+ skb->data_len = pg_off;
+ skb->len += skb->data_len;
+ skb->truesize += skb->data_len;
+ }
+
+ if (tcase->frag_skbs) {
+ unsigned int total_size = 0, total_true_size = 0, alloc_size = 0;
+ struct sk_buff *frag_skb, *prev = NULL;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ page_ref_add(page, tcase->nr_frag_skbs - 1);
+
+ for (i = 0; i < tcase->nr_frag_skbs; i++) {
+ unsigned int frag_size;
+
+ frag_size = tcase->frag_skbs[i];
+ frag_skb = build_skb(page_address(page) + alloc_size,
+ frag_size + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, frag_skb);
+ __skb_put(frag_skb, frag_size);
+
+ if (prev)
+ prev->next = frag_skb;
+ else
+ skb_shinfo(skb)->frag_list = frag_skb;
+ prev = frag_skb;
+
+ total_size += frag_size;
+ total_true_size += frag_skb->truesize;
+ alloc_size += frag_size + shinfo_size;
+ }
+
+ KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE);
+
+ skb->len += total_size;
+ skb->data_len += total_size;
+ skb->truesize += total_true_size;
+
+ if (tcase->id == GSO_TEST_GSO_BY_FRAGS)
+ skb_shinfo(skb)->gso_size = GSO_BY_FRAGS;
+ }
+
+ features = NETIF_F_SG | NETIF_F_HW_CSUM;
+ if (tcase->id == GSO_TEST_GSO_PARTIAL)
+ features |= NETIF_F_GSO_PARTIAL;
+
+ /* TODO: this should also work with SG,
+ * rather than hit BUG_ON(i >= nfrags)
+ */
+ if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM)
+ features &= ~NETIF_F_SG;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs)) {
+ KUNIT_FAIL(test, "segs error %lld", PTR_ERR(segs));
+ goto free_gso_skb;
+ } else if (!segs) {
+ KUNIT_FAIL(test, "no segments");
+ goto free_gso_skb;
+ }
+
+ last = segs->prev;
+ for (cur = segs, i = 0; cur; cur = next, i++) {
+ next = cur->next;
+
+ KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]);
+
+ /* segs have skb->data pointing to the mac header */
+ KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data);
+ KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr));
+
+ /* header was copied to all segs */
+ KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0);
+
+ /* last seg can be found through segs->prev pointer */
+ if (!next)
+ KUNIT_ASSERT_PTR_EQ(test, cur, last);
+
+ consume_skb(cur);
+ }
+
+ KUNIT_ASSERT_EQ(test, i, tcase->nr_segs);
+
+free_gso_skb:
+ consume_skb(skb);
+}
+
+static struct kunit_case gso_test_cases[] = {
+ KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
+ {}
+};
+
+static struct kunit_suite gso_test_suite = {
+ .name = "net_core_gso",
+ .test_cases = gso_test_cases,
+};
+
+kunit_test_suite(gso_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KUnit tests for segmentation offload");
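
The new suite is gated on CONFIG_NET_TEST (added to the Makefile above) and registers itself as "net_core_gso". A hedged sketch of one way to run it with the in-tree KUnit wrapper; the kunitconfig contents below are assumptions, not part of this patch:

    # .kunitconfig (assumed minimal options)
    CONFIG_KUNIT=y
    CONFIG_NET=y
    CONFIG_NET_TEST=y

    ./tools/testing/kunit/kunit.py run --kunitconfig=.kunitconfig 'net_core_gso.*'
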
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6b76cd103195..df81c1f0a570 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -251,7 +251,8 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
static int neigh_forced_gc(struct neigh_table *tbl)
{
- int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
+ int max_clean = atomic_read(&tbl->gc_entries) -
+ READ_ONCE(tbl->gc_thresh2);
unsigned long tref = jiffies - 5 * HZ;
struct neighbour *n, *tmp;
int shrunk = 0;
@@ -280,7 +281,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
}
}
- tbl->last_flush = jiffies;
+ WRITE_ONCE(tbl->last_flush, jiffies);
write_unlock_bh(&tbl->lock);
@@ -410,7 +411,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
*/
__skb_queue_purge(&n->arp_queue);
n->arp_queue_len_bytes = 0;
- n->output = neigh_blackhole;
+ WRITE_ONCE(n->output, neigh_blackhole);
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
@@ -464,17 +465,17 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl,
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
- int entries;
+ int entries, gc_thresh3;
if (exempt_from_gc)
goto do_alloc;
entries = atomic_inc_return(&tbl->gc_entries) - 1;
- if (entries >= tbl->gc_thresh3 ||
- (entries >= tbl->gc_thresh2 &&
- time_after(now, tbl->last_flush + 5 * HZ))) {
- if (!neigh_forced_gc(tbl) &&
- entries >= tbl->gc_thresh3) {
+ gc_thresh3 = READ_ONCE(tbl->gc_thresh3);
+ if (entries >= gc_thresh3 ||
+ (entries >= READ_ONCE(tbl->gc_thresh2) &&
+ time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) {
+ if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) {
net_info_ratelimited("%s: neighbor table overflow!\n",
tbl->id);
NEIGH_CACHE_STAT_INC(tbl, table_fulls);
@@ -920,7 +921,7 @@ static void neigh_suspect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is suspected\n", neigh);
- neigh->output = neigh->ops->output;
+ WRITE_ONCE(neigh->output, neigh->ops->output);
}
/* Neighbour state is OK;
@@ -932,7 +933,7 @@ static void neigh_connect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is connected\n", neigh);
- neigh->output = neigh->ops->connected_output;
+ WRITE_ONCE(neigh->output, neigh->ops->connected_output);
}
static void neigh_periodic_work(struct work_struct *work)
@@ -955,13 +956,14 @@ static void neigh_periodic_work(struct work_struct *work)
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
- tbl->last_rand = jiffies;
+
+ WRITE_ONCE(tbl->last_rand, jiffies);
list_for_each_entry(p, &tbl->parms_list, list)
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
- if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
@@ -988,7 +990,9 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED ||
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
@@ -1447,7 +1451,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
if (n2)
n1 = n2;
}
- n1->output(n1, skb);
+ READ_ONCE(n1->output)(n1, skb);
if (n2)
neigh_release(n2);
rcu_read_unlock();
@@ -2165,15 +2169,16 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
- nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) ||
- nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
- nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
- nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval),
+ NDTA_PAD) ||
+ nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) ||
+ nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) ||
+ nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3)))
goto nla_put_failure;
{
unsigned long now = jiffies;
- long flush_delta = now - tbl->last_flush;
- long rand_delta = now - tbl->last_rand;
+ long flush_delta = now - READ_ONCE(tbl->last_flush);
+ long rand_delta = now - READ_ONCE(tbl->last_rand);
struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
@@ -2181,7 +2186,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
- .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen),
};
rcu_read_lock();
@@ -2204,17 +2209,17 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
struct neigh_statistics *st;
st = per_cpu_ptr(tbl->stats, cpu);
- ndst.ndts_allocs += st->allocs;
- ndst.ndts_destroys += st->destroys;
- ndst.ndts_hash_grows += st->hash_grows;
- ndst.ndts_res_failed += st->res_failed;
- ndst.ndts_lookups += st->lookups;
- ndst.ndts_hits += st->hits;
- ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
- ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
- ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
- ndst.ndts_forced_gc_runs += st->forced_gc_runs;
- ndst.ndts_table_fulls += st->table_fulls;
+ ndst.ndts_allocs += READ_ONCE(st->allocs);
+ ndst.ndts_destroys += READ_ONCE(st->destroys);
+ ndst.ndts_hash_grows += READ_ONCE(st->hash_grows);
+ ndst.ndts_res_failed += READ_ONCE(st->res_failed);
+ ndst.ndts_lookups += READ_ONCE(st->lookups);
+ ndst.ndts_hits += READ_ONCE(st->hits);
+ ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast);
+ ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast);
+ ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs);
+ ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs);
+ ndst.ndts_table_fulls += READ_ONCE(st->table_fulls);
}
if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
@@ -2443,16 +2448,16 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout_tbl_lock;
if (tb[NDTA_THRESH1])
- tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+ WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1]));
if (tb[NDTA_THRESH2])
- tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+ WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2]));
if (tb[NDTA_THRESH3])
- tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+ WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3]));
if (tb[NDTA_GC_INTERVAL])
- tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+ WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL]));
err = 0;
@@ -3153,7 +3158,7 @@ int neigh_xmit(int index, struct net_device *dev,
rcu_read_unlock();
goto out_kfree_skb;
}
- err = neigh->output(neigh, skb);
+ err = READ_ONCE(neigh->output)(neigh, skb);
rcu_read_unlock();
}
else if (index == NEIGH_LINK_TABLE) {
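
The READ_ONCE()/WRITE_ONCE() annotations above exist because these fields can be rewritten at any time from user space while the GC paths read them locklessly. A hedged example of such a writer, using the long-standing neighbour sysctl knobs:

    # writers race with neigh_alloc()/neigh_forced_gc() readers, hence the annotations
    sysctl -w net.ipv4.neigh.default.gc_thresh2=1024
    sysctl -w net.ipv4.neigh.default.gc_thresh3=2048
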
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index d6a70aeaa503..d22f0919821e 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -88,6 +88,12 @@ static void update_classid_task(struct task_struct *p, u32 classid)
};
unsigned int fd = 0;
+ /* Only update the leader task, when many threads in this task,
+ * so it can avoid the useless traversal.
+ */
+ if (p != p->group_leader)
+ return;
+
do {
task_lock(p);
fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index c1aea8b756b6..fe61f85bcf33 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -5,6 +5,7 @@
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/xdp.h>
#include "netdev-genl-gen.h"
@@ -12,15 +13,24 @@ static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
+ u64 xdp_rx_meta = 0;
void *hdr;
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+ xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
- netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+ netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ xdp_rx_meta, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 77cb75e63aca..5e409b98aba0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -211,10 +211,6 @@ static int page_pool_init(struct page_pool *pool,
*/
}
- if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
- pool->p.flags & PP_FLAG_PAGE_FRAG)
- return -EINVAL;
-
#ifdef CONFIG_PAGE_POOL_STATS
pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
if (!pool->recycle_stats)
@@ -359,12 +355,20 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
if (dma_mapping_error(pool->p.dev, dma))
return false;
- page_pool_set_dma_addr(page, dma);
+ if (page_pool_set_dma_addr(page, dma))
+ goto unmap_failed;
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
return true;
+
+unmap_failed:
+ WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ return false;
}
static void page_pool_set_pp_info(struct page_pool *pool,
@@ -372,6 +376,14 @@ static void page_pool_set_pp_info(struct page_pool *pool,
{
page->pp = pool;
page->pp_magic |= PP_SIGNATURE;
+
+ /* Ensuring all pages have been split into one fragment initially:
+ * page_pool_set_pp_info() is only called once for every page when it
+ * is allocated from the page allocator and page_pool_fragment_page()
+ * is dirtying the same cache line as the page->pp_magic above, so
+ * the overhead is negligible.
+ */
+ page_pool_fragment_page(page, 1);
if (pool->p.init_callback)
pool->p.init_callback(page, pool->p.init_arg);
}
@@ -668,7 +680,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
struct page *page = virt_to_head_page(data[i]);
/* It is not the last user for the page frag case */
- if (!page_pool_is_last_frag(pool, page))
+ if (!page_pool_is_last_frag(page))
continue;
page = __page_pool_put_page(pool, page, -1, false);
@@ -744,8 +756,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
unsigned int max_size = PAGE_SIZE << pool->p.order;
struct page *page = pool->frag_page;
- if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
- size > max_size))
+ if (WARN_ON(size > max_size))
return NULL;
size = ALIGN(size, dma_get_cache_alignment());
@@ -798,7 +809,7 @@ static void page_pool_empty_ring(struct page_pool *pool)
}
}
-static void page_pool_free(struct page_pool *pool)
+static void __page_pool_destroy(struct page_pool *pool)
{
if (pool->disconnect)
pool->disconnect(pool);
@@ -849,7 +860,7 @@ static int page_pool_release(struct page_pool *pool)
page_pool_scrub(pool);
inflight = page_pool_inflight(pool);
if (!inflight)
- page_pool_free(pool);
+ __page_pool_destroy(pool);
return inflight;
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f56b8d697014..8afcfadf8d5a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -200,6 +200,7 @@
pf(VID_RND) /* Random VLAN ID */ \
pf(SVID_RND) /* Random SVLAN ID */ \
pf(NODE) /* Node memory alloc*/ \
+ pf(SHARED) /* Shared SKB */ \
#define pf(flag) flag##_SHIFT,
enum pkt_flags {
@@ -669,19 +670,19 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
seq_puts(seq, " Flags: ");
for (i = 0; i < NR_PKT_FLAGS; i++) {
- if (i == F_FLOW_SEQ)
+ if (i == FLOW_SEQ_SHIFT)
if (!pkt_dev->cflows)
continue;
- if (pkt_dev->flags & (1 << i))
+ if (pkt_dev->flags & (1 << i)) {
seq_printf(seq, "%s ", pkt_flag_names[i]);
- else if (i == F_FLOW_SEQ)
- seq_puts(seq, "FLOW_RND ");
-
#ifdef CONFIG_XFRM
- if (i == F_IPSEC && pkt_dev->spi)
- seq_printf(seq, "spi:%u", pkt_dev->spi);
+ if (i == IPSEC_SHIFT && pkt_dev->spi)
+ seq_printf(seq, "spi:%u ", pkt_dev->spi);
#endif
+ } else if (i == FLOW_SEQ_SHIFT) {
+ seq_puts(seq, "FLOW_RND ");
+ }
}
seq_puts(seq, "\n");
@@ -1198,7 +1199,8 @@ static ssize_t pktgen_if_write(struct file *file,
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
- if (value > 0 && pkt_dev->n_imix_entries > 0)
+ if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ !(pkt_dev->flags & F_SHARED)))
return -EINVAL;
i += len;
@@ -1257,6 +1259,10 @@ static ssize_t pktgen_if_write(struct file *file,
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
+
+ if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ return -EINVAL;
+
pkt_dev->burst = value < 1 ? 1 : value;
sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
@@ -1318,9 +1324,10 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "flag")) {
+ bool disable = false;
__u32 flag;
char f[32];
- bool disable = false;
+ char *end;
memset(f, 0, 32);
len = strn_len(&user_buffer[i], sizeof(f) - 1);
@@ -1332,28 +1339,42 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
flag = pktgen_read_flag(f, &disable);
-
if (flag) {
- if (disable)
+ if (disable) {
+ /* If "clone_skb", or "burst" parameters are
+ * configured, it means that the skb still
+ * needs to be referenced by the pktgen, so
+ * the skb must be shared.
+ */
+ if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ pkt_dev->burst > 1))
+ return -EINVAL;
pkt_dev->flags &= ~flag;
- else
+ } else {
pkt_dev->flags |= flag;
- } else {
- sprintf(pg_result,
- "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
- "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
- "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
- "NO_TIMESTAMP, "
-#ifdef CONFIG_XFRM
- "IPSEC, "
-#endif
- "NODE_ALLOC\n");
+ }
+
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
- sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+
+ /* Unknown flag */
+ end = pkt_dev->result + sizeof(pkt_dev->result);
+ pg_result += sprintf(pg_result,
+ "Flag -:%s:- unknown\n"
+ "Available flags, (prepend ! to un-set flag):\n", f);
+
+ for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ continue;
+ pg_result += snprintf(pg_result, end - pg_result,
+ "%s, ", pkt_flag_names[n]);
+ }
+ if (!WARN_ON_ONCE(pg_result >= end)) {
+ /* Remove the comma and whitespace at the end */
+ *(pg_result - 2) = '\0';
+ }
+
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
@@ -3440,12 +3461,24 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
- unsigned int burst = READ_ONCE(pkt_dev->burst);
+ bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ unsigned int burst = 1;
struct sk_buff *skb;
+ int clone_skb = 0;
int ret;
+ /* If 'skb_shared' is false, the read of possible
+ * new values (if any) for 'burst' and 'clone_skb' will be skipped to
+ * prevent some concurrent changes from slipping in. And the stabilized
+ * config will be read in during the next run of pktgen_xmit.
+ */
+ if (skb_shared) {
+ burst = READ_ONCE(pkt_dev->burst);
+ clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ }
+
/* If device is offline, then don't send */
if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
pktgen_stop_device(pkt_dev);
@@ -3462,7 +3495,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* If no skb or clone count exhausted then get new one */
if (!pkt_dev->skb || (pkt_dev->last_ok &&
- ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ ++pkt_dev->clone_count >= clone_skb)) {
/* build a new pkt */
kfree_skb(pkt_dev->skb);
@@ -3483,7 +3516,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
skb = pkt_dev->skb;
skb->protocol = eth_type_trans(skb, skb->dev);
- refcount_add(burst, &skb->users);
+ if (skb_shared)
+ refcount_add(burst, &skb->users);
local_bh_disable();
do {
ret = netif_receive_skb(skb);
@@ -3491,6 +3525,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
+ if (unlikely(!skb_shared)) {
+ pkt_dev->skb = NULL;
+ break;
+ }
if (refcount_read(&skb->users) != burst) {
/* skb was queued by rps/rfs or taps,
* so cannot reuse this skb
@@ -3509,9 +3547,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
local_bh_disable();
- refcount_inc(&pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_inc(&pkt_dev->skb->users);
ret = dev_queue_xmit(pkt_dev->skb);
+
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NET_XMIT_SUCCESS:
pkt_dev->sofar++;
@@ -3549,11 +3592,15 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
goto unlock;
}
- refcount_add(burst, &pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_add(burst, &pkt_dev->skb->users);
xmit_more:
ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NETDEV_TX_OK:
pkt_dev->last_ok = 1;
@@ -3575,7 +3622,8 @@ xmit_more:
fallthrough;
case NETDEV_TX_BUSY:
/* Retry it next time */
- refcount_dec(&(pkt_dev->skb->users));
+ if (skb_shared)
+ refcount_dec(&pkt_dev->skb->users);
pkt_dev->last_ok = 0;
}
if (unlikely(burst))
@@ -3588,7 +3636,8 @@ out:
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- pktgen_wait_for_skb(pkt_dev);
+ if (pkt_dev->skb)
+ pktgen_wait_for_skb(pkt_dev);
/* Done with this */
pktgen_stop_device(pkt_dev);
@@ -3771,6 +3820,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
pkt_dev->node = NUMA_NO_NODE;
+ pkt_dev->flags = F_SHARED; /* SKB shared by default */
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
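
The new SHARED flag is on by default (pktgen_add_device() above initializes flags to F_SHARED) and is toggled through the usual procfs interface. A hedged usage sketch, assuming an already-bound device entry at /proc/net/pktgen/eth0:

    # disabling sharing is refused (-EINVAL) while clone_skb or burst > 1 is configured
    echo "clone_skb 0"  > /proc/net/pktgen/eth0
    echo "burst 1"      > /proc/net/pktgen/eth0
    echo "flag !SHARED" > /proc/net/pktgen/eth0
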
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4a2ec33bfb51..e8431c6c8490 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -57,6 +57,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
#endif
+#include <linux/dpll.h>
#include "dev.h"
@@ -1055,6 +1056,15 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev)
return size;
}
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+ size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev));
+
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1111,6 +1121,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_prop_list_size(dev)
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ rtnl_devlink_port_size(dev)
+ + rtnl_dpll_pin_size(dev)
+ 0;
}
@@ -1774,6 +1785,28 @@ nest_cancel:
return ret;
}
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *dpll_pin_nest;
+ int ret;
+
+ dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+ if (!dpll_pin_nest)
+ return -EMSGSIZE;
+
+ ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev));
+ if (ret < 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, dpll_pin_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, dpll_pin_nest);
+ return ret;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
@@ -1916,6 +1949,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_fill_devlink_port(skb, dev))
goto nla_put_failure;
+ if (rtnl_fill_dpll_pin(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -4331,13 +4367,6 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
}
EXPORT_SYMBOL(ndo_dflt_fdb_del);
-static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = {
- [NDA_VLAN] = { .type = NLA_U16 },
- [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
- [NDA_NDM_STATE_MASK] = { .type = NLA_U16 },
- [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 },
-};
-
static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -4358,8 +4387,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
NULL, extack);
} else {
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
- fdb_del_bulk_policy, extack);
+ /* For bulk delete, the drivers will parse the message with
+ * policy.
+ */
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
}
if (err < 0)
return err;
@@ -4382,6 +4413,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
}
if (dev->type != ARPHRD_ETHER) {
@@ -4389,10 +4424,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
- err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
- if (err)
- return err;
-
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
@@ -4406,8 +4437,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
} else {
if (ops->ndo_fdb_del_bulk)
- err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
- extack);
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
}
if (err)
@@ -4428,8 +4458,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* in case err was cleared by NTF_MASTER call */
err = -EOPNOTSUPP;
if (ops->ndo_fdb_del_bulk)
- err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
- extack);
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
}
if (!err) {
@@ -5503,13 +5532,11 @@ static unsigned int
rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
enum netdev_offload_xstats_type type)
{
- bool enabled = netdev_offload_xstats_enabled(dev, type);
-
return nla_total_size(0) +
/* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
nla_total_size(sizeof(u8)) +
/* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
- (enabled ? nla_total_size(sizeof(u8)) : 0) +
+ nla_total_size(sizeof(u8)) +
0;
}
@@ -6192,6 +6219,93 @@ out:
return skb->len;
}
+static int rtnl_validate_mdb_entry_get(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->ifindex) {
+ NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->state) {
+ NL_SET_ERR_MSG(extack, "Entry state cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (entry->addr.proto != htons(ETH_P_IP) &&
+ entry->addr.proto != htons(ETH_P_IPV6) &&
+ entry->addr.proto != 0) {
+ NL_SET_ERR_MSG(extack, "Unknown entry protocol");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = {
+ [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_get,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+
+static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1];
+ struct net *net = sock_net(in_skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb,
+ MDBA_GET_ENTRY_MAX, mdba_get_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_get) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+
+ return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+}
+
static int rtnl_validate_mdb_entry(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -6568,7 +6682,7 @@ void __init rtnetlink_init(void)
0);
rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, rtnl_mdb_dump, 0);
+ rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0);
rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0);
}
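
For reference, a minimal userspace sketch (not part of this patch; the field values are
only illustrative) of a struct br_mdb_entry that would pass the rtnl_validate_mdb_entry_get()
checks above when carried as MDBA_GET_ENTRY in an RTM_GETMDB request. The netlink plumbing
and the reply handling are omitted; only the validator's constraints are mirrored here.

#include <stdio.h>
#include <arpa/inet.h>
#include <linux/if_bridge.h>
#include <linux/if_ether.h>

int main(void)
{
    struct br_mdb_entry e = {
        .ifindex = 0,   /* must be 0: a port cannot be specified for a get */
        .state   = 0,   /* must be 0 */
        .flags   = 0,   /* must be 0 */
        .vid     = 10,  /* any value below VLAN_VID_MASK (4096) */
    };

    e.addr.proto = htons(ETH_P_IP);          /* ETH_P_IPV6 or 0 are also accepted */
    e.addr.u.ip4 = inet_addr("239.1.1.1");   /* example group to look up */

    /* the attribute payload must be exactly sizeof(struct br_mdb_entry) */
    printf("MDBA_GET_ENTRY payload: %zu bytes\n", sizeof(e));
    return 0;
}
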
diff --git a/net/core/selftests.c b/net/core/selftests.c
index acb1ee97bbd3..94fe3146a959 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -397,14 +397,11 @@ EXPORT_SYMBOL_GPL(net_selftest_get_count);
void net_selftest_get_strings(u8 *data)
{
- u8 *p = data;
int i;
- for (i = 0; i < net_selftest_get_count(); i++) {
- snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1,
- net_selftests[i].name);
- p += ETH_GSTRING_LEN;
- }
+ for (i = 0; i < net_selftest_get_count(); i++)
+ ethtool_sprintf(&data, "%2d. %s", i + 1,
+ net_selftests[i].name);
}
EXPORT_SYMBOL_GPL(net_selftest_get_strings);
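
The hunk above leans on the kernel's ethtool_sprintf() helper (declared in
include/linux/ethtool.h), which formats into the current ETH_GSTRING_LEN slot and advances
the caller's pointer, removing the manual pointer bumping. A small userspace model of that
behaviour, assuming a stand-in GSTRING_LEN and made-up test names:

#include <stdio.h>
#include <stdarg.h>

#define GSTRING_LEN 32  /* stand-in for ETH_GSTRING_LEN */

/* Format into the current fixed-size slot, then advance to the next one. */
static void gstring_sprintf(char **data, const char *fmt, ...)
{
    va_list args;

    va_start(args, fmt);
    vsnprintf(*data, GSTRING_LEN, fmt, args);
    va_end(args);
    *data += GSTRING_LEN;
}

int main(void)
{
    const char *names[] = { "Carrier", "Link", "MAC Loopback" };
    char strings[3 * GSTRING_LEN] = { 0 };
    char *p = strings;
    int i;

    for (i = 0; i < 3; i++)
        gstring_sprintf(&p, "%2d. %s", i + 1, names[i]);

    for (i = 0; i < 3; i++)
        printf("%s\n", strings + i * GSTRING_LEN);
    return 0;
}
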
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4eaf7ed0d1f4..b157efea5dea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
+#include <linux/iov_iter.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -847,6 +848,8 @@ EXPORT_SYMBOL(__napi_alloc_skb);
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_fill_page_desc(skb, i, page, off, size);
skb->len += size;
skb->data_len += size;
@@ -859,6 +862,8 @@ void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_frag_size_add(frag, size);
skb->len += size;
skb->data_len += size;
@@ -3718,10 +3723,19 @@ EXPORT_SYMBOL(skb_dequeue_tail);
void skb_queue_purge_reason(struct sk_buff_head *list,
enum skb_drop_reason reason)
{
- struct sk_buff *skb;
+ struct sk_buff_head tmp;
+ unsigned long flags;
- while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb_reason(skb, reason);
+ if (skb_queue_empty_lockless(list))
+ return;
+
+ __skb_queue_head_init(&tmp);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_splice_init(list, &tmp);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ __skb_queue_purge_reason(&tmp, reason);
}
EXPORT_SYMBOL(skb_queue_purge_reason);
@@ -4254,6 +4268,7 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
unsigned int to, struct ts_config *config)
{
+ unsigned int patlen = config->ops->get_pattern_len(config);
struct ts_state state;
unsigned int ret;
@@ -4265,7 +4280,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
ret = textsearch_find(config, &state);
- return (ret <= to - from ? ret : UINT_MAX);
+ return (ret + patlen <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);
@@ -5149,6 +5164,9 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
bool icmp_next = false;
unsigned long flags;
+ if (skb_queue_empty_lockless(q))
+ return NULL;
+
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q))) {
@@ -5748,7 +5766,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
/* In general, avoid mixing page_pool and non-page_pool allocated
* pages within the same SKB. Additionally avoid dealing with clones
* with page_pool pages, in case the SKB is using page_pool fragment
- * references (PP_FLAG_PAGE_FRAG). Since we only take full page
+ * references (page_pool_alloc_frag()). Since we only take full page
* references for cloned SKBs at the moment that would result in
* inconsistent reference counts.
* In theory we could take full references if @from is cloned and
@@ -6931,3 +6949,42 @@ out:
return spliced ?: ret;
}
EXPORT_SYMBOL(skb_splice_from_iter);
+
+static __always_inline
+size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
+ size_t len, void *to, void *priv2)
+{
+ __wsum *csum = priv2;
+ __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
+
+ *csum = csum_block_add(*csum, next, progress);
+ return 0;
+}
+
+static __always_inline
+size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
+ size_t len, void *to, void *priv2)
+{
+ __wsum next, *csum = priv2;
+
+ next = csum_and_copy_from_user(iter_from, to + progress, len);
+ *csum = csum_block_add(*csum, next, progress);
+ return next ? 0 : len;
+}
+
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+ __wsum *csum, struct iov_iter *i)
+{
+ size_t copied;
+
+ if (WARN_ON_ONCE(!i->data_source))
+ return false;
+ copied = iterate_and_advance2(i, bytes, addr, csum,
+ copy_from_user_iter_csum,
+ memcpy_from_iter_csum);
+ if (likely(copied == bytes))
+ return true;
+ iov_iter_revert(i, copied);
+ return false;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter_full);
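
The new iterator callbacks above fold per-chunk checksums into a running sum with
csum_block_add(), which must account for the byte offset at which each chunk starts. A
standalone userspace sketch of that offset rule follows; fold32(), partial() and block_add()
are stand-ins written for this example (big-endian word summation assumed), not kernel
functions, and the payload and split point are arbitrary.

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator into a 16-bit ones'-complement sum. */
static uint16_t fold32(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)sum;
}

/* Ones'-complement sum of a byte range, words taken big-endian. */
static uint32_t partial(const uint8_t *p, size_t len)
{
    uint32_t sum = 0;
    size_t i;

    for (i = 0; i + 1 < len; i += 2)
        sum += (uint32_t)p[i] << 8 | p[i + 1];
    if (len & 1)
        sum += (uint32_t)p[len - 1] << 8;
    return sum;
}

/* Combine a chunk sum computed at byte offset 'off' into 'acc': an odd offset
 * puts the chunk's bytes on swapped lanes, so rotate the sum by 8 bits first.
 * This mirrors the role of the offset argument to csum_block_add(). */
static uint32_t block_add(uint32_t acc, uint32_t part, size_t off)
{
    uint16_t p16 = fold32(part);

    if (off & 1)
        p16 = (uint16_t)((p16 << 8) | (p16 >> 8));
    return acc + p16;
}

int main(void)
{
    const uint8_t data[] = "example payload for chunked checksumming";
    size_t len = sizeof(data) - 1, split = 7;   /* odd split on purpose */
    uint32_t acc = 0;

    acc = block_add(acc, partial(data, split), 0);
    acc = block_add(acc, partial(data + split, len - split), split);

    printf("whole: %04x  chunked: %04x\n",
           fold32(partial(data, len)), fold32(acc));
    return 0;
}
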
diff --git a/net/core/sock.c b/net/core/sock.c
index 16584e2dd648..1d28e3e87970 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -600,7 +600,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- sk->sk_dst_pending_confirm = 0;
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -759,7 +759,7 @@ out:
return ret;
}
-bool sk_mc_loop(struct sock *sk)
+bool sk_mc_loop(const struct sock *sk)
{
if (dev_recursion_level())
return false;
@@ -771,7 +771,7 @@ bool sk_mc_loop(struct sock *sk)
return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- return inet6_sk(sk)->mc_loop;
+ return inet6_test_bit(MC6_LOOP, sk);
#endif
}
WARN_ON_ONCE(1);
@@ -806,9 +806,7 @@ EXPORT_SYMBOL(sock_no_linger);
void sock_set_priority(struct sock *sk, u32 priority)
{
- lock_sock(sk);
WRITE_ONCE(sk->sk_priority, priority);
- release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
@@ -1118,6 +1116,83 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
valbool = val ? 1 : 0;
+ /* handle options which do not require locking the socket. */
+ switch (optname) {
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ sock_set_priority(sk, val);
+ return 0;
+ }
+ return -EPERM;
+ case SO_PASSSEC:
+ assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
+ return 0;
+ case SO_PASSCRED:
+ assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
+ return 0;
+ case SO_PASSPIDFD:
+ assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
+ return 0;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ return -ENOPROTOOPT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ if (val < 0)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ return 0;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ return 0;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
+ !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (val < 0 || val > U16_MAX)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
+ return 0;
+#endif
+ case SO_MAX_PACING_RATE:
+ {
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+ unsigned long pacing_rate;
+
+ if (sizeof(ulval) != sizeof(val) &&
+ optlen >= sizeof(ulval) &&
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+ return -EFAULT;
+ }
+ if (ulval != ~0UL)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
+ /* Pairs with READ_ONCE() from sk_getsockopt() */
+ WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+ pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+ if (ulval < pacing_rate)
+ WRITE_ONCE(sk->sk_pacing_rate, ulval);
+ return 0;
+ }
+ case SO_TXREHASH:
+ if (val < -1 || val > 1)
+ return -EINVAL;
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+ /* Paired with READ_ONCE() in tcp_rtx_synack()
+ * and sk_getsockopt().
+ */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ return 0;
+ }
+
sockopt_lock_sock(sk);
switch (optname) {
@@ -1133,12 +1208,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
- case SO_TYPE:
- case SO_PROTOCOL:
- case SO_DOMAIN:
- case SO_ERROR:
- ret = -ENOPROTOOPT;
- break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
sk_dst_reset(sk);
@@ -1213,15 +1282,6 @@ set_sndbuf:
sk->sk_no_check_tx = valbool;
break;
- case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- WRITE_ONCE(sk->sk_priority, val);
- else
- ret = -EPERM;
- break;
-
case SO_LINGER:
if (optlen < sizeof(ling)) {
ret = -EINVAL; /* 1003.1g */
@@ -1247,14 +1307,6 @@ set_sndbuf:
case SO_BSDCOMPAT:
break;
- case SO_PASSCRED:
- assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
- break;
-
- case SO_PASSPIDFD:
- assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
- break;
-
case SO_TIMESTAMP_OLD:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
@@ -1360,9 +1412,6 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
break;
- case SO_PASSSEC:
- assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
- break;
case SO_MARK:
if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
@@ -1404,50 +1453,7 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- case SO_BUSY_POLL:
- if (val < 0)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_ll_usec, val);
- break;
- case SO_PREFER_BUSY_POLL:
- if (valbool && !sockopt_capable(CAP_NET_ADMIN))
- ret = -EPERM;
- else
- WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
- break;
- case SO_BUSY_POLL_BUDGET:
- if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else {
- if (val < 0 || val > U16_MAX)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_busy_poll_budget, val);
- }
- break;
-#endif
- case SO_MAX_PACING_RATE:
- {
- unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
-
- if (sizeof(ulval) != sizeof(val) &&
- optlen >= sizeof(ulval) &&
- copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
- ret = -EFAULT;
- break;
- }
- if (ulval != ~0UL)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- /* Pairs with READ_ONCE() from sk_getsockopt() */
- WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
- sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
- break;
- }
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
break;
@@ -1532,19 +1538,6 @@ set_sndbuf:
break;
}
- case SO_TXREHASH:
- if (val < -1 || val > 1) {
- ret = -EINVAL;
- break;
- }
- if ((u8)val == SOCK_TXREHASH_DEFAULT)
- val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
- /* Paired with READ_ONCE() in tcp_rtx_synack()
- * and sk_getsockopt().
- */
- WRITE_ONCE(sk->sk_txrehash, (u8)val);
- break;
-
default:
ret = -ENOPROTOOPT;
break;
@@ -3001,6 +2994,11 @@ void __sk_flush_backlog(struct sock *sk)
{
spin_lock_bh(&sk->sk_lock.slock);
__release_sock(sk);
+
+ if (sk->sk_prot->release_cb)
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
+
spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
@@ -3037,21 +3035,29 @@ EXPORT_SYMBOL(sk_wait_data);
* @amt: pages to allocate
* @kind: allocation type
*
- * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
+ *
+ * Unlike the globally shared limits among sockets of the same protocol,
+ * consuming one memcg's budget has no direct effect on the others. So be
+ * optimistic about the memcg's tolerance, and leave it to the callers to
+ * decide whether or not to raise allocated, via sk_under_memory_pressure()
+ * or its variants.
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
- bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
+ struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
struct proto *prot = sk->sk_prot;
- bool charged = true;
+ bool charged = false;
long allocated;
sk_memory_allocated_add(sk, amt);
allocated = sk_memory_allocated(sk);
- if (memcg_charge &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
- gfp_memcg_charge())))
- goto suppress_allocation;
+
+ if (memcg) {
+ if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
+ goto suppress_allocation;
+ charged = true;
+ }
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
@@ -3067,7 +3073,14 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (allocated > sk_prot_mem_limits(sk, 2))
goto suppress_allocation;
- /* guarantee minimum buffer size under pressure */
+ /* Guarantee minimum buffer size under pressure (either global
+ * or memcg) to make sure features described in RFC 7323 (TCP
+ * Extensions for High Performance) work properly.
+ *
+	 * This rule does NOT stand when the usage exceeds the global or memcg
+	 * hard limit; otherwise a DoS attack could be mounted by spawning lots
+	 * of sockets whose usage stays under the minimum buffer size.
+ */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
return 1;
@@ -3086,8 +3099,17 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (sk_has_memory_pressure(sk)) {
u64 alloc;
- if (!sk_under_memory_pressure(sk))
+ /* The following 'average' heuristic is within the
+ * scope of global accounting, so it only makes
+ * sense for global memory pressure.
+ */
+ if (!sk_under_global_memory_pressure(sk))
return 1;
+
+ /* Try to be fair among all the sockets under global
+			 * pressure by only allowing the ones whose usage is
+			 * below average to raise their allocation.
+ */
alloc = sk_sockets_allocated_read_positive(sk);
if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
@@ -3106,8 +3128,8 @@ suppress_allocation:
*/
if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
/* Force charge with __GFP_NOFAIL */
- if (memcg_charge && !charged) {
- mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ if (memcg && !charged) {
+ mem_cgroup_charge_skmem(memcg, amt,
gfp_memcg_charge() | __GFP_NOFAIL);
}
return 1;
@@ -3119,8 +3141,8 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (memcg_charge && charged)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
+ if (charged)
+ mem_cgroup_uncharge_skmem(memcg, amt);
return 0;
}
@@ -3519,11 +3541,9 @@ void release_sock(struct sock *sk)
if (sk->sk_backlog.tail)
__release_sock(sk);
- /* Warning : release_cb() might need to release sk ownership,
- * ie call sock_release_ownership(sk) before us.
- */
if (sk->sk_prot->release_cb)
- sk->sk_prot->release_cb(sk);
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
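
With the sk_setsockopt() fast path above, options such as SO_PRIORITY and SO_BUSY_POLL are
now handled without taking the socket lock; the userspace calls themselves are unchanged.
A trivial sketch (the option values are arbitrary examples):

#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int prio = 3;        /* 0..6 is allowed without CAP_NET_ADMIN/CAP_NET_RAW */
    int busy_usec = 50;  /* negative values still return -EINVAL */

    if (fd < 0)
        return 1;
    if (setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)))
        perror("SO_PRIORITY");
    if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &busy_usec, sizeof(busy_usec)))
        perror("SO_BUSY_POLL");  /* requires CONFIG_NET_RX_BUSY_POLL */
    return 0;
}
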
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index cb11750b1df5..4292c2ed1828 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -668,6 +668,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
sk = __sock_map_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+ if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
+ return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
@@ -1267,6 +1269,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
sk = __sock_hash_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+ if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
+ return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
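
The sock_map checks above tighten the egress path of sk_msg redirection: with flags == 0 the
target socket must now be TCP, otherwise the verdict is SK_DROP. A hedged sketch of such a
program, loosely following the usual sockmap selftest pattern; the map name, key and sizes
are illustrative only.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_SOCKMAP);
    __uint(max_entries, 16);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(__u64));
} redir_map SEC(".maps");

SEC("sk_msg")
int msg_redir(struct sk_msg_md *msg)
{
    __u32 key = 0;

    /* flags == 0 selects egress; after this change the socket stored at
     * 'key' must be TCP or the helper's verdict becomes SK_DROP. */
    return bpf_msg_redirect_map(msg, &redir_map, key, 0);
}

char _license[] SEC("license") = "GPL";
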
diff --git a/net/core/stream.c b/net/core/stream.c
index f5c4e47df165..96fbcb9bbb30 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(sk_stream_wait_close);
*/
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
- int err = 0;
+ int ret, err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -142,11 +142,13 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) ||
- (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
- (sk_stream_memory_free(sk) &&
- !vm_wait), &wait);
+ ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) ||
+ (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) && !vm_wait),
+ &wait);
sk->sk_write_pending--;
+ if (ret < 0)
+ goto do_error;
if (vm_wait) {
vm_wait -= current_timeo;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a70670fe9a2d..df4789ab512d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
__diag_pop();
BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
};
BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
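
The widened XDP_METADATA_KFUNC() macro now carries four columns, and each expansion site picks
only the column it needs. A self-contained illustration of that multi-column X-macro pattern
with stand-in names (the real XDP_METADATA_KFUNC_xxx list lives in include/net/xdp.h):

#include <stdio.h>

#define DEMO_KFUNC_xxx \
    DEMO_KFUNC(DEMO_RX_TIMESTAMP, ts_id,   demo_rx_timestamp, xmo_rx_timestamp) \
    DEMO_KFUNC(DEMO_RX_HASH,      hash_id, demo_rx_hash,      xmo_rx_hash)

/* Expansion 1: build an enum from column 1, ignoring the rest. */
#define DEMO_KFUNC(name, _, __, ___) name,
enum { DEMO_KFUNC_xxx DEMO_KFUNC_COUNT };
#undef DEMO_KFUNC

/* Expansion 2: build a string table from column 3, as the BTF_ID_FLAGS()
 * and BTF_ID() users in the hunks above do with their chosen columns. */
#define DEMO_KFUNC(_, __, str, ___) #str,
static const char * const demo_names[] = { DEMO_KFUNC_xxx };
#undef DEMO_KFUNC

int main(void)
{
    int i;

    for (i = 0; i < DEMO_KFUNC_COUNT; i++)
        printf("%d: %s\n", i, demo_names[i]);
    return 0;
}
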