Diffstat (limited to 'net')
-rw-r--r--  net/can/j1939/address-claim.c   |  40
-rw-r--r--  net/can/raw.c                   |  11
-rw-r--r--  net/core/dev.h                  |   2
-rw-r--r--  net/core/neighbour.c            |  18
-rw-r--r--  net/core/net-sysfs.c            |  79
-rw-r--r--  net/core/skbuff.c               | 121
-rw-r--r--  net/core/sock.c                 |   3
-rw-r--r--  net/core/sysctl_net_core.c      |  81
-rw-r--r--  net/devlink/core.c              |   5
-rw-r--r--  net/devlink/leftover.c          |  18
-rw-r--r--  net/ipv4/af_inet.c              |   1
-rw-r--r--  net/ipv4/inet_connection_sock.c |   3
-rw-r--r--  net/ipv6/af_inet6.c             |   1
-rw-r--r--  net/mptcp/pm_netlink.c          |  10
-rw-r--r--  net/mptcp/protocol.c            |   9
-rw-r--r--  net/mptcp/sockopt.c             |  11
-rw-r--r--  net/mptcp/subflow.c             |  12
-rw-r--r--  net/netlink/genetlink.c         |   4
-rw-r--r--  net/openvswitch/conntrack.c     |   5
-rw-r--r--  net/rds/message.c               |   6
-rw-r--r--  net/sched/sch_api.c             |  29
-rw-r--r--  net/sched/sch_htb.c             |   2
-rw-r--r--  net/sched/sch_taprio.c          | 639
-rw-r--r--  net/tipc/netlink_compat.c       |  16
-rw-r--r--  net/xfrm/xfrm_compat.c          |   4
-rw-r--r--  net/xfrm/xfrm_input.c           |   3
-rw-r--r--  net/xfrm/xfrm_interface_core.c  |  54
-rw-r--r--  net/xfrm/xfrm_policy.c          |  14
-rw-r--r--  net/xfrm/xfrm_state.c           |  18
29 files changed, 878 insertions, 341 deletions
diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c
index f33c47327927..ca4ad6cdd5cb 100644
--- a/net/can/j1939/address-claim.c
+++ b/net/can/j1939/address-claim.c
@@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb)
* leaving this function.
*/
ecu = j1939_ecu_get_by_name_locked(priv, name);
+
+ if (ecu && ecu->addr == skcb->addr.sa) {
+ /* The ISO 11783-5 standard, in "4.5.2 - Address claim
+ * requirements", states:
+ * d) No CF shall begin, or resume, transmission on the
+ * network until 250 ms after it has successfully claimed
+ * an address except when responding to a request for
+ * address-claimed.
+ *
+ * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim
+ * prioritization" show that the CF begins the transmission
+ * after 250 ms from the first AC (address-claimed) message
+ * even if it sends another AC message during that time window
+ * to resolve the address contention with another CF.
+ *
+ * As stated in "4.4.2.3 - Address-claimed message":
+ * In order to successfully claim an address, the CF sending
+ * an address claimed message shall not receive a contending
+ * claim from another CF for at least 250 ms.
+ *
+ * As stated in "4.4.3.2 - NAME management (NM) message":
+ * 1) A commanding CF can
+ * d) request that a CF with a specified NAME transmit
+ * the address-claimed message with its current NAME.
+ * 2) A target CF shall
+ * d) send an address-claimed message in response to a
+ * request for a matching NAME
+ *
+ * Taking the above arguments into account, the 250 ms wait is
+ * requested only during network initialization.
+ *
+ * Do not restart the timer on AC message if both the NAME and
+ * the address match and so if the address has already been
+ * claimed (timer has expired) or the AC message has been sent
+ * to resolve the contention with another CF (timer is still
+ * running).
+ */
+ goto out_ecu_put;
+ }
+
if (!ecu && j1939_address_is_unicast(skcb->addr.sa))
ecu = j1939_ecu_create_locked(priv, name);
diff --git a/net/can/raw.c b/net/can/raw.c
index ba86782ba8bb..f64469b98260 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -523,6 +523,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
struct can_filter sfilter; /* single filter */
struct net_device *dev = NULL;
can_err_mask_t err_mask = 0;
+ int fd_frames;
int count = 0;
int err = 0;
@@ -664,17 +665,17 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
break;
case CAN_RAW_FD_FRAMES:
- if (optlen != sizeof(ro->fd_frames))
+ if (optlen != sizeof(fd_frames))
return -EINVAL;
- if (copy_from_sockptr(&ro->fd_frames, optval, optlen))
+ if (copy_from_sockptr(&fd_frames, optval, optlen))
return -EFAULT;
/* Enabling CAN XL includes CAN FD */
- if (ro->xl_frames && !ro->fd_frames) {
- ro->fd_frames = ro->xl_frames;
+ if (ro->xl_frames && !fd_frames)
return -EINVAL;
- }
+
+ ro->fd_frames = fd_frames;
break;
case CAN_RAW_XL_FRAMES:
diff --git a/net/core/dev.h b/net/core/dev.h
index a065b7571441..e075e198092c 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -9,6 +9,7 @@ struct net_device;
struct netdev_bpf;
struct netdev_phys_item_id;
struct netlink_ext_ack;
+struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
@@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+int rps_cpumask_housekeeping(struct cpumask *mask);
#endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 57258110bccd..6798f6d2423b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -269,7 +269,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
(n->nud_state == NUD_NOARP) ||
(tbl->is_multicast &&
tbl->is_multicast(n->primary_key)) ||
- time_after(tref, n->updated))
+ !time_in_range(n->updated, tref, jiffies))
remove = true;
write_unlock(&n->lock);
@@ -289,7 +289,17 @@ static int neigh_forced_gc(struct neigh_table *tbl)
static void neigh_add_timer(struct neighbour *n, unsigned long when)
{
+ /* Use safe distance from the jiffies - LONG_MAX point while timer
+ * is running in DELAY/PROBE state but still show to user space
+ * large times in the past.
+ */
+ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ);
+
neigh_hold(n);
+ if (!time_in_range(n->confirmed, mint, jiffies))
+ n->confirmed = mint;
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
if (unlikely(mod_timer(&n->timer, when))) {
printk("NEIGH: BUG, double timer add, state is %x\n",
n->nud_state);
@@ -1001,12 +1011,14 @@ static void neigh_periodic_work(struct work_struct *work)
goto next_elt;
}
- if (time_before(n->used, n->confirmed))
+ if (time_before(n->used, n->confirmed) &&
+ time_is_before_eq_jiffies(n->confirmed))
n->used = n->confirmed;
if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
- time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ !time_in_range_open(jiffies, n->used,
+ n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
neigh_mark_dead(n);
write_unlock(&n->lock);
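
The neighbour.c hunks above stop trusting plain time_after()/time_before() comparisons on n->updated and n->confirmed: once a timestamp drifts more than LONG_MAX jiffies into the past, signed wraparound makes it look like a time in the future, so forced GC and the timer logic misjudge it. Checking membership in [tref, jiffies] (plus the clamp in neigh_add_timer()) sidesteps that. A minimal userspace sketch of the wraparound arithmetic, with macro bodies simplified from include/linux/jiffies.h and purely illustrative numbers:

#include <stdio.h>
#include <limits.h>

/* Simplified jiffies helpers (typechecking from the kernel headers omitted) */
#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)
#define time_in_range(a, b, c)	(!time_before(a, b) && !time_after(a, c))

int main(void)
{
	unsigned long now   = 1000000UL;
	unsigned long tref  = now - 300;	/* "recently updated" cutoff */
	/* timestamp more than LONG_MAX ticks behind "now" */
	unsigned long stale = now - (unsigned long)LONG_MAX - 1000;

	/* Old check: the wrapped difference is positive, the ancient value
	 * looks like it lies in the future, and the entry is never picked
	 * for forced GC.  Prints 0.
	 */
	printf("time_after(tref, stale)          = %d\n", time_after(tref, stale));

	/* Range check: anything outside [tref, now] is flagged as expired,
	 * including wrapped timestamps.  Prints 1.
	 */
	printf("!time_in_range(stale, tref, now) = %d\n", !time_in_range(stale, tref, now));
	return 0;
}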
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ca55dd747d6c..4b361ac6a252 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t store_rps_map(struct netdev_rx_queue *queue,
- const char *buf, size_t len)
+static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
+ cpumask_var_t mask)
{
- struct rps_map *old_map, *map;
- cpumask_var_t mask;
- int err, cpu, i;
static DEFINE_MUTEX(rps_map_mutex);
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
- if (err) {
- free_cpumask_var(mask);
- return err;
- }
-
- if (!cpumask_empty(mask)) {
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
- if (cpumask_empty(mask)) {
- free_cpumask_var(mask);
- return -EINVAL;
- }
- }
+ struct rps_map *old_map, *map;
+ int cpu, i;
map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
- if (!map) {
- free_cpumask_var(mask);
+ if (!map)
return -ENOMEM;
- }
i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask)
@@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
if (old_map)
kfree_rcu(old_map, rcu);
+ return 0;
+}
+int rps_cpumask_housekeeping(struct cpumask *mask)
+{
+ if (!cpumask_empty(mask)) {
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err)
+ goto out;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto out;
+
+ err = netdev_rx_queue_set_rps_mask(queue, mask);
+
+out:
free_cpumask_var(mask);
- return len;
+ return err ? : len;
}
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
@@ -1071,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
goto err;
}
+#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
+ if (!cpumask_empty(&rps_default_mask)) {
+ error = netdev_rx_queue_set_rps_mask(queue, &rps_default_mask);
+ if (error)
+ goto err;
+ }
+#endif
kobject_uevent(kobj, KOBJ_ADD);
return error;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 624e9e4ec116..70a6088e8326 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -89,6 +89,34 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
+
+/* skb_small_head_cache and related code is only supported
+ * for CONFIG_SLAB and CONFIG_SLUB.
+ * As soon as SLOB is removed from the kernel, we can clean up this.
+ */
+#if !defined(CONFIG_SLOB)
+# define HAVE_SKB_SMALL_HEAD_CACHE 1
+#endif
+
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+static struct kmem_cache *skb_small_head_cache __ro_after_init;
+
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE \
+ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
+ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
+ SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM \
+ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+#endif /* HAVE_SKB_SMALL_HEAD_CACHE */
+
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -478,17 +506,37 @@ EXPORT_SYMBOL(napi_build_skb);
* may be used. Otherwise, the packet data may be discarded until enough
* memory is free
*/
-static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
bool *pfmemalloc)
{
- void *obj;
bool ret_pfmemalloc = false;
+ unsigned int obj_size;
+ void *obj;
+
+ obj_size = SKB_HEAD_ALIGN(*size);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+ !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+ /* skb_small_head_cache has non power of two size,
+ * likely forcing SLUB to use order-3 pages.
+ * We deliberately attempt a NOMEMALLOC allocation only.
+ */
+ obj = kmem_cache_alloc_node(skb_small_head_cache,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj) {
+ *size = SKB_SMALL_HEAD_CACHE_SIZE;
+ goto out;
+ }
+ }
+#endif
+ *size = obj_size = kmalloc_size_roundup(obj_size);
/*
* Try a regular allocation, when that fails and we're not entitled
* to the reserves, fail.
*/
- obj = kmalloc_node_track_caller(size,
+ obj = kmalloc_node_track_caller(obj_size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
if (obj || !(gfp_pfmemalloc_allowed(flags)))
@@ -496,7 +544,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
/* Try again but now we are using pfmemalloc reserves */
ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+ obj = kmalloc_node_track_caller(obj_size, flags, node);
out:
if (pfmemalloc)
@@ -533,7 +581,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
struct kmem_cache *cache;
struct sk_buff *skb;
- unsigned int osize;
bool pfmemalloc;
u8 *data;
@@ -558,18 +605,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- osize = kmalloc_size_roundup(size);
- data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
+ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
if (unlikely(!data))
goto nodata;
/* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- size = SKB_WITH_OVERHEAD(osize);
- prefetchw(data + size);
+ prefetchw(data + SKB_WITH_OVERHEAD(size));
/*
* Only clear those fields we need to clear, not those that we will
@@ -577,7 +620,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
- __build_skb_around(skb, data, osize);
+ __build_skb_around(skb, data, size);
skb->pfmemalloc = pfmemalloc;
if (flags & SKB_ALLOC_FCLONE) {
@@ -632,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
goto skb_success;
}
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
@@ -732,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
} else {
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
data = page_frag_alloc(&nc->page, len, gfp_mask);
pfmemalloc = nc->page.pfmemalloc;
@@ -809,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
return page_pool_return_skb_page(virt_to_page(data));
}
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+ kmem_cache_free(skb_small_head_cache, head);
+ else
+#endif
+ kfree(head);
+}
+
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
@@ -818,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb)
return;
skb_free_frag(head);
} else {
- kfree(head);
+ skb_kfree_head(head, skb_end_offset(skb));
}
}
@@ -1938,10 +1989,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
size = SKB_WITH_OVERHEAD(size);
@@ -2004,7 +2052,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
return 0;
nofrags:
- kfree(data);
+ skb_kfree_head(data, size);
nodata:
return -ENOMEM;
}
@@ -4641,6 +4689,19 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
+ * struct skb_shared_info is located at the end of skb->head,
+ * and should not be copied to/from user.
+ */
+ skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+ SKB_SMALL_HEAD_CACHE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ 0,
+ SKB_SMALL_HEAD_HEADROOM,
+ NULL);
+#endif
skb_extensions_init();
}
@@ -6289,10 +6350,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
@@ -6308,7 +6366,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_cloned(skb)) {
/* drop the old head gracefully */
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -6408,10 +6466,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
@@ -6419,7 +6474,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
shinfo = (struct skb_shared_info *)(data + size);
@@ -6455,7 +6510,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
if (skb_has_frag_list(skb))
kfree_skb_list(skb_shinfo(skb)->frag_list);
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
skb_release_data(skb, SKB_CONSUMED);
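
The skbuff.c changes above add a dedicated skb_small_head_cache and free heads through skb_kfree_head(), which decides where a head came from purely by comparing skb_end_offset() against SKB_SMALL_HEAD_HEADROOM; that only works because SKB_SMALL_HEAD_CACHE_SIZE is forced away from a power of two, keeping it distinct from the sizes kmalloc-backed heads end up with. A standalone sketch of that sizing rule; L1_CACHE_BYTES and the input sizes are made up for the example and are not the kernel's MAX_TCP_HEADER:

#include <stdio.h>
#include <stdbool.h>

#define L1_CACHE_BYTES 64	/* illustrative cache line size */

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* Mirror of the SKB_SMALL_HEAD_CACHE_SIZE macro: pad by one cache line
 * when the aligned head size would otherwise be a power of two.
 */
static unsigned long small_head_cache_size(unsigned long aligned_head_size)
{
	return is_power_of_2(aligned_head_size) ?
	       aligned_head_size + L1_CACHE_BYTES : aligned_head_size;
}

int main(void)
{
	printf("704  -> %lu\n", small_head_cache_size(704));	/* unchanged */
	printf("1024 -> %lu\n", small_head_cache_size(1024));	/* padded to 1088 */
	return 0;
}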
diff --git a/net/core/sock.c b/net/core/sock.c
index 208634b01df5..afbb02984d5f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1531,6 +1531,8 @@ set_sndbuf:
ret = -EINVAL;
break;
}
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
/* Paired with READ_ONCE() in tcp_rtx_synack() */
WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;
@@ -3454,7 +3456,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
- sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index e7b98162c632..7130e6d9e263 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -16,6 +16,7 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/sched/isolation.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -45,7 +46,59 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
+#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
+static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
+{
+ char kbuf[128];
+ int len;
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return;
+ }
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ return;
+ }
+
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ memcpy(buffer, kbuf, len);
+ *lenp = len;
+ *ppos += len;
+}
+#endif
+
#ifdef CONFIG_RPS
+struct cpumask rps_default_mask;
+
+static int rps_default_mask_sysctl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+
+ rtnl_lock();
+ if (write) {
+ err = cpumask_parse(buffer, &rps_default_mask);
+ if (err)
+ goto done;
+
+ err = rps_cpumask_housekeeping(&rps_default_mask);
+ if (err)
+ goto done;
+ } else {
+ dump_cpumask(buffer, lenp, ppos, &rps_default_mask);
+ }
+
+done:
+ rtnl_unlock();
+ return err;
+}
+
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -155,13 +208,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
write_unlock:
mutex_unlock(&flow_limit_update_mutex);
} else {
- char kbuf[128];
-
- if (*ppos || !*lenp) {
- *lenp = 0;
- goto done;
- }
-
cpumask_clear(mask);
rcu_read_lock();
for_each_possible_cpu(i) {
@@ -171,17 +217,7 @@ write_unlock:
}
rcu_read_unlock();
- len = min(sizeof(kbuf) - 1, *lenp);
- len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
- if (!len) {
- *lenp = 0;
- goto done;
- }
- if (len < *lenp)
- kbuf[len++] = '\n';
- memcpy(buffer, kbuf, len);
- *lenp = len;
- *ppos += len;
+ dump_cpumask(buffer, lenp, ppos, mask);
}
done:
@@ -472,6 +508,11 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = rps_sock_flow_sysctl
},
+ {
+ .procname = "rps_default_mask",
+ .mode = 0644,
+ .proc_handler = rps_default_mask_sysctl
+ },
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
{
@@ -675,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = {
static __init int sysctl_core_init(void)
{
+#if IS_ENABLED(CONFIG_RPS)
+ cpumask_copy(&rps_default_mask, cpu_none_mask);
+#endif
+
register_net_sysctl(&init_net, "net/core", net_core_table);
return register_pernet_subsys(&sysctl_core_ops);
}
diff --git a/net/devlink/core.c b/net/devlink/core.c
index aeffd1b8206d..a4f47dafb864 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -205,7 +205,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
goto err_xa_alloc;
devlink->netdevice_nb.notifier_call = devlink_port_netdevice_event;
- ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb);
+ ret = register_netdevice_notifier(&devlink->netdevice_nb);
if (ret)
goto err_register_netdevice_notifier;
@@ -266,8 +266,7 @@ void devlink_free(struct devlink *devlink)
xa_destroy(&devlink->snapshot_ids);
xa_destroy(&devlink->ports);
- WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink),
- &devlink->netdevice_nb));
+ WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb));
xa_erase(&devlinks, devlink->index);
diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c
index 97d30ea98b00..f05ab093d231 100644
--- a/net/devlink/leftover.c
+++ b/net/devlink/leftover.c
@@ -6626,18 +6626,22 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
err = devlink_fmsg_obj_nest_start(fmsg);
if (err)
- return err;
+ goto out;
err = reporter->ops->diagnose(reporter, fmsg, info->extack);
if (err)
- return err;
+ goto out;
err = devlink_fmsg_obj_nest_end(fmsg);
if (err)
- return err;
+ goto out;
+
+ err = devlink_fmsg_snd(fmsg, info,
+ DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
- return devlink_fmsg_snd(fmsg, info,
- DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
+out:
+ devlink_fmsg_free(fmsg);
+ return err;
}
static int
@@ -8426,6 +8430,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb,
break;
case NETDEV_REGISTER:
case NETDEV_CHANGENAME:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
/* Set the netdev on top of previously set type. Note this
* event happens also during net namespace change so here
* we take into account netdev pointer appearing in this
@@ -8435,6 +8441,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb,
netdev);
break;
case NETDEV_UNREGISTER:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
/* Clear netdev pointer, but not the type. This event happens
* also during net namespace change so we need to clear
* pointer to netdev that is going to another net namespace.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2f992a323b95..2c778b013cb0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -347,6 +347,7 @@ lookup_protocol:
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
inet->uc_ttl = -1;
inet->mc_loop = 1;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6ed7e65de494..7d206a10ad14 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1246,9 +1246,6 @@ int inet_csk_listen_start(struct sock *sk)
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
- if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
- sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
-
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fee9163382c2..847934763868 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -222,6 +222,7 @@ lookup_protocol:
np->pmtudisc = IPV6_PMTUDISC_WANT;
np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
/* Init the ipv4 part of the socket since we can have sockets
* using v6 API for ipv4.
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index db07cc5b4fcb..56628b52d100 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1002,8 +1002,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
{
int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
- struct mptcp_sock *msk;
struct socket *ssock;
+ struct sock *newsk;
int backlog = 1024;
int err;
@@ -1012,11 +1012,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
if (err)
return err;
- msk = mptcp_sk(entry->lsk->sk);
- if (!msk)
+ newsk = entry->lsk->sk;
+ if (!newsk)
return -EINVAL;
- ssock = __mptcp_nmpc_socket(msk);
+ lock_sock(newsk);
+ ssock = __mptcp_nmpc_socket(mptcp_sk(newsk));
+ release_sock(newsk);
if (!ssock)
return -EINVAL;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1ff2efbab29e..c9817aa0f413 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2902,6 +2902,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
+ int subflows_alive = 0;
sk->sk_shutdown = SHUTDOWN_MASK;
@@ -2928,6 +2929,8 @@ cleanup:
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ subflows_alive += ssk->sk_state != TCP_CLOSE;
+
/* since the close timeout takes precedence on the fail one,
* cancel the latter
*/
@@ -2943,6 +2946,12 @@ cleanup:
}
sock_orphan(sk);
+ /* all the subflows are closed, only timeout can change the msk
+ * state, let's not keep resources busy for no reasons
+ */
+ if (subflows_alive == 0)
+ inet_sk_state_store(sk, TCP_CLOSE);
+
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (msk->token)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 9986681aaf40..8a9656248b0f 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
+ struct sock *sk = (struct sock *)msk;
struct socket *sock;
+ int ret = -EINVAL;
/* Limit to first subflow, before the connection establishment */
+ lock_sock(sk);
sock = __mptcp_nmpc_socket(msk);
if (!sock)
- return -EINVAL;
+ goto unlock;
- return tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+ ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+
+unlock:
+ release_sock(sk);
+ return ret;
}
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index beaec843f5ca..4ae1a7304cf0 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1400,6 +1400,7 @@ void __mptcp_error_report(struct sock *sk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int err = sock_error(ssk);
+ int ssk_state;
if (!err)
continue;
@@ -1410,7 +1411,14 @@ void __mptcp_error_report(struct sock *sk)
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
continue;
- inet_sk_state_store(sk, inet_sk_state_load(ssk));
+ /* We need to propagate only transition to CLOSE state.
+ * Orphaned socket will see such state change via
+ * subflow_sched_work_if_closed() and that path will properly
+ * destroy the msk as needed.
+ */
+ ssk_state = inet_sk_state_load(ssk);
+ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+ inet_sk_state_store(sk, ssk_state);
sk->sk_err = -err;
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
@@ -1682,7 +1690,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
if (err)
return err;
- lock_sock(sf->sk);
+ lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);
/* the newly created socket has to be in the same cgroup as its parent */
mptcp_attach_cgroup(sk, sf->sk);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 600993c80050..04c4036bf406 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -13,7 +13,7 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
-#include <linux/string.h>
+#include <linux/string_helpers.h>
#include <linux/skbuff.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
@@ -457,7 +457,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
if (WARN_ON(grp->name[0] == '\0'))
return -EINVAL;
- if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL))
+ if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ)))
return -EINVAL;
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 2172930b1f17..f95272ebfa08 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -9,6 +9,7 @@
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/static_key.h>
+#include <linux/string_helpers.h>
#include <net/ip.h>
#include <net/genetlink.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -1383,7 +1384,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
#endif
case OVS_CT_ATTR_HELPER:
*helper = nla_data(a);
- if (!memchr(*helper, '\0', nla_len(a))) {
+ if (!string_is_terminated(*helper, nla_len(a))) {
OVS_NLERR(log, "Invalid conntrack helper");
return -EINVAL;
}
@@ -1404,7 +1405,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
case OVS_CT_ATTR_TIMEOUT:
memcpy(info->timeout, nla_data(a), nla_len(a));
- if (!memchr(info->timeout, '\0', nla_len(a))) {
+ if (!string_is_terminated(info->timeout, nla_len(a))) {
OVS_NLERR(log, "Invalid conntrack timeout");
return -EINVAL;
}
diff --git a/net/rds/message.c b/net/rds/message.c
index b47e4f0a1639..c19c93561227 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
spin_lock_irqsave(&q->lock, flags);
head = &q->zcookie_head;
if (!list_empty(head)) {
- info = list_entry(head, struct rds_msg_zcopy_info,
- rs_zcookie_next);
- if (info && rds_zcookie_add(info, cookie)) {
+ info = list_first_entry(head, struct rds_msg_zcopy_info,
+ rs_zcookie_next);
+ if (rds_zcookie_add(info, cookie)) {
spin_unlock_irqrestore(&q->lock, flags);
kfree(rds_info_from_znotifier(znotif));
/* caller invokes rds_wake_sk_sleep() */
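
The rds_rm_zerocopy_callback() hunk above replaces list_entry() on the list head with list_first_entry() and drops the info NULL test: list_entry() applied to the head itself just subtracts an offset from &head, so it can never return NULL and the old check was dead code. A self-contained sketch of that pitfall, with minimal list macros re-declared locally for the example:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member)	container_of(ptr, type, member)
#define list_first_entry(ptr, type, member) \
	list_entry((ptr)->next, type, member)

struct item {
	int cookie;
	struct list_head node;
};

int main(void)
{
	struct item first = { .cookie = 42 };
	struct list_head head = { &first.node, &first.node };

	first.node.next = first.node.prev = &head;

	/* list_entry() on the head yields a pointer derived from &head,
	 * never NULL, so "if (info && ...)" could not catch anything.
	 */
	struct item *wrong = list_entry(&head, struct item, node);
	printf("list_entry(&head) == NULL?  %d\n", wrong == NULL);	/* 0 */

	/* list_first_entry() on a non-empty list reaches the real element */
	struct item *right = list_first_entry(&head, struct item, node);
	printf("list_first_entry(&head)->cookie = %d\n", right->cookie);	/* 42 */
	return 0;
}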
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c14018a8052c..e9780631b5b5 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1282,12 +1282,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (err)
goto err_out3;
- if (ops->init) {
- err = ops->init(sch, tca[TCA_OPTIONS], extack);
- if (err != 0)
- goto err_out5;
- }
-
if (tca[TCA_STAB]) {
stab = qdisc_get_stab(tca[TCA_STAB], extack);
if (IS_ERR(stab)) {
@@ -1296,11 +1290,18 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
}
rcu_assign_pointer(sch->stab, stab);
}
+
+ if (ops->init) {
+ err = ops->init(sch, tca[TCA_OPTIONS], extack);
+ if (err != 0)
+ goto err_out5;
+ }
+
if (tca[TCA_RATE]) {
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT) {
NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
- goto err_out4;
+ goto err_out5;
}
err = gen_new_estimator(&sch->bstats,
@@ -1311,7 +1312,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
- goto err_out4;
+ goto err_out5;
}
}
@@ -1321,6 +1322,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
return sch;
err_out5:
+ qdisc_put_stab(rtnl_dereference(sch->stab));
+err_out4:
/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
if (ops->destroy)
ops->destroy(sch);
@@ -1332,16 +1335,6 @@ err_out2:
err_out:
*errp = err;
return NULL;
-
-err_out4:
- /*
- * Any broken qdiscs that would require a ops->reset() here?
- * The qdisc was never in action so it shouldn't be necessary.
- */
- qdisc_put_stab(rtnl_dereference(sch->stab));
- if (ops->destroy)
- ops->destroy(sch);
- goto err_out3;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index cc28e41fb745..92f2975b6a82 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -433,7 +433,7 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
while (m) {
unsigned int prio = ffz(~m);
- if (WARN_ON_ONCE(prio > ARRAY_SIZE(p->inner.clprio)))
+ if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
break;
m &= ~(1 << prio);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 1c95785932b9..9781b47962bb 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -29,6 +29,8 @@
#include "sch_mqprio_lib.h"
static LIST_HEAD(taprio_list);
+static struct static_key_false taprio_have_broken_mqprio;
+static struct static_key_false taprio_have_working_mqprio;
#define TAPRIO_ALL_GATES_OPEN -1
@@ -37,15 +39,19 @@ static LIST_HEAD(taprio_list);
#define TAPRIO_FLAGS_INVALID U32_MAX
struct sched_entry {
- struct list_head list;
-
- /* The instant that this entry "closes" and the next one
- * should open, the qdisc will make some effort so that no
- * packet leaves after this time.
+ /* Durations between this GCL entry and the GCL entry where the
+ * respective traffic class gate closes
*/
- ktime_t close_time;
+ u64 gate_duration[TC_MAX_QUEUE];
+ atomic_t budget[TC_MAX_QUEUE];
+ /* The qdisc makes some effort so that no packet leaves
+ * after this time
+ */
+ ktime_t gate_close_time[TC_MAX_QUEUE];
+ struct list_head list;
+ /* Used to calculate when to advance the schedule */
+ ktime_t end_time;
ktime_t next_txtime;
- atomic_t budget;
int index;
u32 gate_mask;
u32 interval;
@@ -53,10 +59,16 @@ struct sched_entry {
};
struct sched_gate_list {
+ /* Longest non-zero contiguous gate durations per traffic class,
+ * or 0 if a traffic class gate never opens during the schedule.
+ */
+ u64 max_open_gate_duration[TC_MAX_QUEUE];
+ u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
+ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
struct rcu_head rcu;
struct list_head entries;
size_t num_entries;
- ktime_t cycle_close_time;
+ ktime_t cycle_end_time;
s64 cycle_time;
s64 cycle_time_extension;
s64 base_time;
@@ -69,6 +81,8 @@ struct taprio_sched {
enum tk_offsets tk_offset;
int clockid;
bool offloaded;
+ bool detected_mqprio;
+ bool broken_mqprio;
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
* speeds it's sub-nanoseconds per byte
*/
@@ -80,8 +94,8 @@ struct taprio_sched {
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
- u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
- u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
+ int cur_txq[TC_MAX_QUEUE];
+ u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
u32 txtime_delay;
};
@@ -90,6 +104,57 @@ struct __tc_taprio_qopt_offload {
struct tc_taprio_qopt_offload offload;
};
+static void taprio_calculate_gate_durations(struct taprio_sched *q,
+ struct sched_gate_list *sched)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sched_entry *entry, *cur;
+ int tc;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ u32 gates_still_open = entry->gate_mask;
+
+ /* For each traffic class, calculate each open gate duration,
+ * starting at this schedule entry and ending at the schedule
+ * entry containing a gate close event for that TC.
+ */
+ cur = entry;
+
+ do {
+ if (!gates_still_open)
+ break;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (!(gates_still_open & BIT(tc)))
+ continue;
+
+ if (cur->gate_mask & BIT(tc))
+ entry->gate_duration[tc] += cur->interval;
+ else
+ gates_still_open &= ~BIT(tc);
+ }
+
+ cur = list_next_entry_circular(cur, &sched->entries, list);
+ } while (cur != entry);
+
+ /* Keep track of the maximum gate duration for each traffic
+ * class, taking care to not confuse a traffic class which is
+ * temporarily closed with one that is always closed.
+ */
+ for (tc = 0; tc < num_tc; tc++)
+ if (entry->gate_duration[tc] &&
+ sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
+ sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
+ }
+}
+
+static bool taprio_entry_allows_tx(ktime_t skb_end_time,
+ struct sched_entry *entry, int tc)
+{
+ return ktime_before(skb_end_time, entry->gate_close_time[tc]);
+}
+
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
if (!sched)
@@ -182,6 +247,55 @@ static int length_to_duration(struct taprio_sched *q, int len)
return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}
+static int duration_to_length(struct taprio_sched *q, u64 duration)
+{
+ return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
+}
+
+/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the
+ * q->max_sdu[] requested by the user and the max_sdu dynamically determined by
+ * the maximum open gate durations at the given link speed.
+ */
+static void taprio_update_queue_max_sdu(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct qdisc_size_table *stab)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ u32 max_sdu_from_user;
+ u32 max_sdu_dynamic;
+ u32 max_sdu;
+ int tc;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX;
+
+ /* TC gate never closes => keep the queueMaxSDU
+ * selected by the user
+ */
+ if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
+ max_sdu_dynamic = U32_MAX;
+ } else {
+ u32 max_frm_len;
+
+ max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
+ if (stab)
+ max_frm_len -= stab->szopts.overhead;
+ max_sdu_dynamic = max_frm_len - dev->hard_header_len;
+ }
+
+ max_sdu = min(max_sdu_dynamic, max_sdu_from_user);
+
+ if (max_sdu != U32_MAX) {
+ sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
+ sched->max_sdu[tc] = max_sdu;
+ } else {
+ sched->max_frm_len[tc] = U32_MAX; /* never oversized */
+ sched->max_sdu[tc] = 0;
+ }
+ }
+}
+
/* Returns the entry corresponding to next available interval. If
* validate_interval is set, it only validates whether the timestamp occurs
* when the gate corresponding to the skb's traffic class is open.
@@ -415,14 +529,33 @@ done:
return txtime;
}
-static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
- struct Qdisc *child, struct sk_buff **to_free)
+/* Devices with full offload are expected to honor this in hardware */
+static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
+ struct sk_buff *skb)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct sched_gate_list *sched;
int prio = skb->priority;
+ bool exceeds = false;
u8 tc;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ if (sched && skb->len > sched->max_frm_len[tc])
+ exceeds = true;
+ rcu_read_unlock();
+
+ return exceeds;
+}
+
+static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child, struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
/* sk_flags are only safe to use on full sockets. */
if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
if (!is_valid_interval(skb, sch))
@@ -433,17 +566,53 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_drop(skb, sch, to_free);
}
- /* Devices with full offload are expected to honor this in hardware */
- tc = netdev_get_prio_tc_map(dev, prio);
- if (skb->len > q->max_frm_len[tc])
- return qdisc_drop(skb, sch, to_free);
-
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return qdisc_enqueue(skb, child, to_free);
}
+static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
+{
+ unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
+ netdev_features_t features = netif_skb_features(skb);
+ struct sk_buff *segs, *nskb;
+ int ret;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ slen += segs->len;
+
+ /* FIXME: we should be segmenting to a smaller size
+ * rather than dropping these
+ */
+ if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
+ ret = qdisc_drop(segs, sch, to_free);
+ else
+ ret = taprio_enqueue_one(segs, sch, child, to_free);
+
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ numsegs++;
+ }
+ }
+
+ if (numsegs > 1)
+ qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
+ consume_skb(skb);
+
+ return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
@@ -460,97 +629,190 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
- /* Large packets might not be transmitted when the transmission duration
- * exceeds any configured interval. Therefore, segment the skb into
- * smaller chunks. Drivers with full offload are expected to handle
- * this in hardware.
- */
- if (skb_is_gso(skb)) {
- unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
- netdev_features_t features = netif_skb_features(skb);
- struct sk_buff *segs, *nskb;
- int ret;
-
- segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
- if (IS_ERR_OR_NULL(segs))
- return qdisc_drop(skb, sch, to_free);
+ if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
+ /* Large packets might not be transmitted when the transmission
+ * duration exceeds any configured interval. Therefore, segment
+ * the skb into smaller chunks. Drivers with full offload are
+ * expected to handle this in hardware.
+ */
+ if (skb_is_gso(skb))
+ return taprio_enqueue_segmented(skb, sch, child,
+ to_free);
- skb_list_walk_safe(segs, segs, nskb) {
- skb_mark_not_on_list(segs);
- qdisc_skb_cb(segs)->pkt_len = segs->len;
- slen += segs->len;
+ return qdisc_drop(skb, sch, to_free);
+ }
- ret = taprio_enqueue_one(segs, sch, child, to_free);
- if (ret != NET_XMIT_SUCCESS) {
- if (net_xmit_drop_count(ret))
- qdisc_qstats_drop(sch);
- } else {
- numsegs++;
- }
- }
+ return taprio_enqueue_one(skb, sch, child, to_free);
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
+ return NULL;
+}
- if (numsegs > 1)
- qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
- consume_skb(skb);
+static void taprio_set_budgets(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct sched_entry *entry)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ int tc, budget;
- return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+ for (tc = 0; tc < num_tc; tc++) {
+ /* Traffic classes which never close have infinite budget */
+ if (entry->gate_duration[tc] == sched->cycle_time)
+ budget = INT_MAX;
+ else
+ budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
+ atomic64_read(&q->picos_per_byte));
+
+ atomic_set(&entry->budget[tc], budget);
}
+}
- return taprio_enqueue_one(skb, sch, child, to_free);
+/* When an skb is sent, it consumes from the budget of all traffic classes */
+static int taprio_update_budgets(struct sched_entry *entry, size_t len,
+ int tc_consumed, int num_tc)
+{
+ int tc, budget, new_budget = 0;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ budget = atomic_read(&entry->budget[tc]);
+ /* Don't consume from infinite budget */
+ if (budget == INT_MAX) {
+ if (tc == tc_consumed)
+ new_budget = budget;
+ continue;
+ }
+
+ if (tc == tc_consumed)
+ new_budget = atomic_sub_return(len, &entry->budget[tc]);
+ else
+ atomic_sub(len, &entry->budget[tc]);
+ }
+
+ return new_budget;
}
-/* Will not be called in the full offload case, since the TX queues are
- * attached to the Qdisc created using qdisc_create_dflt()
- */
-static struct sk_buff *taprio_peek(struct Qdisc *sch)
+static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- struct sched_entry *entry;
+ struct Qdisc *child = q->qdiscs[txq];
+ int num_tc = netdev_get_num_tc(dev);
struct sk_buff *skb;
- u32 gate_mask;
- int i;
+ ktime_t guard;
+ int prio;
+ int len;
+ u8 tc;
- rcu_read_lock();
- entry = rcu_dereference(q->current_entry);
- gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
- rcu_read_unlock();
+ if (unlikely(!child))
+ return NULL;
- if (!gate_mask)
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ goto skip_peek_checks;
+
+ skb = child->ops->peek(child);
+ if (!skb)
return NULL;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- int prio;
- u8 tc;
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
- if (unlikely(!child))
- continue;
+ if (!(gate_mask & BIT(tc)))
+ return NULL;
- skb = child->ops->peek(child);
- if (!skb)
- continue;
+ len = qdisc_pkt_len(skb);
+ guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));
- if (TXTIME_ASSIST_IS_ENABLED(q->flags))
- return skb;
+ /* In the case that there's no gate entry, there's no
+ * guard band ...
+ */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ !taprio_entry_allows_tx(guard, entry, tc))
+ return NULL;
+
+ /* ... and no budget. */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ taprio_update_budgets(entry, len, tc, num_tc) < 0)
+ return NULL;
+
+skip_peek_checks:
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
+{
+ int offset = dev->tc_to_txq[tc].offset;
+ int count = dev->tc_to_txq[tc].count;
+
+ (*txq)++;
+ if (*txq == offset + count)
+ *txq = offset;
+}
+
+/* Prioritize higher traffic classes, and select among TXQs belonging to the
+ * same TC using round robin
+ */
+static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sk_buff *skb;
+ int tc;
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
+ for (tc = num_tc - 1; tc >= 0; tc--) {
+ int first_txq = q->cur_txq[tc];
if (!(gate_mask & BIT(tc)))
continue;
- return skb;
+ do {
+ skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
+ entry, gate_mask);
+
+ taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
+
+ if (skb)
+ return skb;
+ } while (q->cur_txq[tc] != first_txq);
}
return NULL;
}
-static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
+/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
+ * class other than to determine whether the gate is open or not
+ */
+static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
- atomic_set(&entry->budget,
- div64_u64((u64)entry->interval * PSEC_PER_NSEC,
- atomic64_read(&q->picos_per_byte)));
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
+ if (skb)
+ return skb;
+ }
+
+ return NULL;
}
/* Will not be called in the full offload case, since the TX queues are
@@ -559,11 +821,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
struct sk_buff *skb = NULL;
struct sched_entry *entry;
u32 gate_mask;
- int i;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
@@ -573,69 +833,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
* "AdminGateStates"
*/
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
-
if (!gate_mask)
goto done;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- ktime_t guard;
- int prio;
- int len;
- u8 tc;
-
- if (unlikely(!child))
- continue;
-
- if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
- skb = child->ops->dequeue(child);
- if (!skb)
- continue;
- goto skb_found;
- }
-
- skb = child->ops->peek(child);
- if (!skb)
- continue;
-
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
-
- if (!(gate_mask & BIT(tc))) {
- skb = NULL;
- continue;
- }
-
- len = qdisc_pkt_len(skb);
- guard = ktime_add_ns(taprio_get_time(q),
- length_to_duration(q, len));
-
- /* In the case that there's no gate entry, there's no
- * guard band ...
- */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- ktime_after(guard, entry->close_time)) {
- skb = NULL;
- continue;
- }
-
- /* ... and no budget. */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- atomic_sub_return(len, &entry->budget) < 0) {
- skb = NULL;
- continue;
- }
-
- skb = child->ops->dequeue(child);
- if (unlikely(!skb))
- goto done;
-
-skb_found:
- qdisc_bstats_update(sch, skb);
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
-
- goto done;
+ if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
+ !static_branch_likely(&taprio_have_working_mqprio)) {
+ /* Single NIC kind which is broken */
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ } else if (static_branch_likely(&taprio_have_working_mqprio) &&
+ !static_branch_unlikely(&taprio_have_broken_mqprio)) {
+ /* Single NIC kind which prioritizes properly */
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
+ } else {
+ /* Mixed NIC kinds present in system, need dynamic testing */
+ if (q->broken_mqprio)
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ else
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
}
done:
@@ -650,7 +864,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper,
if (list_is_last(&entry->list, &oper->entries))
return true;
- if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
+ if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0)
return true;
return false;
@@ -658,7 +872,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper,
static bool should_change_schedules(const struct sched_gate_list *admin,
const struct sched_gate_list *oper,
- ktime_t close_time)
+ ktime_t end_time)
{
ktime_t next_base_time, extension_time;
@@ -667,18 +881,18 @@ static bool should_change_schedules(const struct sched_gate_list *admin,
next_base_time = sched_base_time(admin);
- /* This is the simple case, the close_time would fall after
+ /* This is the simple case, the end_time would fall after
* the next schedule base_time.
*/
- if (ktime_compare(next_base_time, close_time) <= 0)
+ if (ktime_compare(next_base_time, end_time) <= 0)
return true;
- /* This is the cycle_time_extension case, if the close_time
+ /* This is the cycle_time_extension case, if the end_time
* plus the amount that can be extended would fall after the
* next schedule base_time, we can extend the current schedule
* for that amount.
*/
- extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);
+ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);
/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
* how precisely the extension should be made. So after
@@ -694,10 +908,13 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
struct taprio_sched *q = container_of(timer, struct taprio_sched,
advance_timer);
+ struct net_device *dev = qdisc_dev(q->root);
struct sched_gate_list *oper, *admin;
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *entry, *next;
struct Qdisc *sch = q->root;
- ktime_t close_time;
+ ktime_t end_time;
+ int tc;
spin_lock(&q->current_entry_lock);
entry = rcu_dereference_protected(q->current_entry,
@@ -716,41 +933,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
* entry of all schedules are pre-calculated during the
* schedule initialization.
*/
- if (unlikely(!entry || entry->close_time == oper->base_time)) {
+ if (unlikely(!entry || entry->end_time == oper->base_time)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
- close_time = next->close_time;
+ end_time = next->end_time;
goto first_run;
}
if (should_restart_cycle(oper, entry)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
- oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
- oper->cycle_time);
+ oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
+ oper->cycle_time);
} else {
next = list_next_entry(entry, list);
}
- close_time = ktime_add_ns(entry->close_time, next->interval);
- close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
+ end_time = ktime_add_ns(entry->end_time, next->interval);
+ end_time = min_t(ktime_t, end_time, oper->cycle_end_time);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (next->gate_duration[tc] == oper->cycle_time)
+ next->gate_close_time[tc] = KTIME_MAX;
+ else
+ next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
+ next->gate_duration[tc]);
+ }
- if (should_change_schedules(admin, oper, close_time)) {
+ if (should_change_schedules(admin, oper, end_time)) {
/* Set things so the next time this runs, the new
* schedule runs.
*/
- close_time = sched_base_time(admin);
+ end_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
}
- next->close_time = close_time;
- taprio_set_budget(q, next);
+ next->end_time = end_time;
+ taprio_set_budgets(q, oper, next);
first_run:
rcu_assign_pointer(q->current_entry, next);
spin_unlock(&q->current_entry_lock);
- hrtimer_set_expires(&q->advance_timer, close_time);
+ hrtimer_set_expires(&q->advance_timer, end_time);
rcu_read_lock();
__netif_schedule(sch);
@@ -918,6 +1143,8 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
new->cycle_time = cycle;
}
+ taprio_calculate_gate_durations(q, new);
+
return 0;
}
@@ -986,11 +1213,14 @@ static int taprio_get_start_time(struct Qdisc *sch,
return 0;
}
-static void setup_first_close_time(struct taprio_sched *q,
- struct sched_gate_list *sched, ktime_t base)
+static void setup_first_end_time(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *first;
ktime_t cycle;
+ int tc;
first = list_first_entry(&sched->entries,
struct sched_entry, list);
@@ -998,10 +1228,18 @@ static void setup_first_close_time(struct taprio_sched *q,
cycle = sched->cycle_time;
/* FIXME: find a better place to do this */
- sched->cycle_close_time = ktime_add_ns(base, cycle);
+ sched->cycle_end_time = ktime_add_ns(base, cycle);
+
+ first->end_time = ktime_add_ns(base, first->interval);
+ taprio_set_budgets(q, sched, first);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (first->gate_duration[tc] == sched->cycle_time)
+ first->gate_close_time[tc] = KTIME_MAX;
+ else
+ first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
+ }
- first->close_time = ktime_add_ns(base, first->interval);
- taprio_set_budget(q, first);
rcu_assign_pointer(q->current_entry, NULL);
}
@@ -1055,6 +1293,8 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct sched_gate_list *oper, *admin;
+ struct qdisc_size_table *stab;
struct taprio_sched *q;
ASSERT_RTNL();
@@ -1067,6 +1307,17 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
continue;
taprio_set_picos_per_byte(dev, q);
+
+ stab = rtnl_dereference(q->root->stab);
+
+ oper = rtnl_dereference(q->oper_sched);
+ if (oper)
+ taprio_update_queue_max_sdu(q, oper, stab);
+
+ admin = rtnl_dereference(q->admin_sched);
+ if (admin)
+ taprio_update_queue_max_sdu(q, admin, stab);
+
break;
}
@@ -1197,6 +1448,34 @@ static void taprio_sched_to_offload(struct net_device *dev,
offload->num_entries = i;
}
+static void taprio_detect_broken_mqprio(struct taprio_sched *q)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ struct tc_taprio_caps caps;
+
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
+ &caps, sizeof(caps));
+
+ q->broken_mqprio = caps.broken_mqprio;
+ if (q->broken_mqprio)
+ static_branch_inc(&taprio_have_broken_mqprio);
+ else
+ static_branch_inc(&taprio_have_working_mqprio);
+
+ q->detected_mqprio = true;
+}
+
+static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
+{
+ if (!q->detected_mqprio)
+ return;
+
+ if (q->broken_mqprio)
+ static_branch_dec(&taprio_have_broken_mqprio);
+ else
+ static_branch_dec(&taprio_have_working_mqprio);
+}
+
static int taprio_enable_offload(struct net_device *dev,
struct taprio_sched *q,
struct sched_gate_list *sched,
@@ -1425,7 +1704,6 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
u32 max_sdu[TC_QOPT_MAX_QUEUE];
unsigned long seen_tcs = 0;
struct nlattr *n;
@@ -1439,18 +1717,14 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
continue;
- err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack);
+ err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+ extack);
if (err)
goto out;
}
- for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
q->max_sdu[tc] = max_sdu[tc];
- if (max_sdu[tc])
- q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len;
- else
- q->max_frm_len[tc] = U32_MAX; /* never oversized */
- }
out:
return err;
@@ -1506,6 +1780,7 @@ static int taprio_new_flags(const struct nlattr *attr, u32 old,
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
struct sched_gate_list *oper, *admin, *new_admin;
struct taprio_sched *q = qdisc_priv(sch);
@@ -1573,15 +1848,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto free_sched;
taprio_set_picos_per_byte(dev, q);
+ taprio_update_queue_max_sdu(q, new_admin, stab);
if (mqprio) {
err = netdev_set_num_tc(dev, mqprio->num_tc);
if (err)
goto free_sched;
- for (i = 0; i < mqprio->num_tc; i++)
+ for (i = 0; i < mqprio->num_tc; i++) {
netdev_set_tc_queue(dev, i,
mqprio->count[i],
mqprio->offset[i]);
+ q->cur_txq[i] = mqprio->offset[i];
+ }
/* Always use supplied priority mappings */
for (i = 0; i <= TC_BITMASK; i++)
@@ -1636,7 +1914,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
} else {
- setup_first_close_time(q, new_admin, start);
+ setup_first_end_time(q, new_admin, start);
/* Protects against advance_sched() */
spin_lock_irqsave(&q->current_entry_lock, flags);
@@ -1656,6 +1934,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
new_admin = NULL;
err = 0;
+ if (!stab)
+ NL_SET_ERR_MSG_MOD(extack,
+ "Size table not specified, frame length estimations may be inaccurate");
+
unlock:
spin_unlock_bh(qdisc_lock(sch));
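
The extack warning added just above exists because the per-TC frame-size bound now derived in taprio_update_queue_max_sdu() depends on the qdisc size table for per-packet overhead; without one, the estimate is less accurate. As a rough userspace sketch only, under the simplifying assumption that a gate of D nanoseconds on a link needing P picoseconds per byte fits about D*1000/P bytes minus L2 overhead (the names and rounding below are illustrative, not the kernel's arithmetic):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: how many payload bytes fit in one gate interval. */
static uint32_t max_sdu_from_gate(uint64_t gate_ns, uint64_t picos_per_byte,
				  uint32_t l2_overhead)
{
	uint64_t max_frame = (gate_ns * 1000) / picos_per_byte;

	if (max_frame <= l2_overhead)
		return 0;		/* gate too short for any frame */
	return (uint32_t)(max_frame - l2_overhead);
}

int main(void)
{
	/* 1 Gbps: 8 bits at 1 ns/bit is 8000 ps per byte. */
	uint64_t picos_per_byte = 8000;

	/* 100 us gate; 18 bytes of Ethernet header + FCS assumed as overhead. */
	printf("max sdu: %u\n", max_sdu_from_gate(100000, picos_per_byte, 18));
	return 0;
}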
@@ -1716,6 +1998,8 @@ static void taprio_destroy(struct Qdisc *sch)
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ taprio_cleanup_broken_mqprio(q);
}
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1780,6 +2064,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
q->qdiscs[i] = qdisc;
}
+ taprio_detect_broken_mqprio(q);
+
return taprio_change(sch, opt, extack);
}
@@ -1920,7 +2206,8 @@ error_nest:
return -1;
}
-static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
+static int taprio_dump_tc_entries(struct sk_buff *skb,
+ struct sched_gate_list *sched)
{
struct nlattr *n;
int tc;
@@ -1934,7 +2221,7 @@ static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
- q->max_sdu[tc]))
+ sched->max_sdu[tc]))
goto nla_put_failure;
nla_nest_end(skb, n);
@@ -1978,7 +2265,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
goto options_error;
- if (taprio_dump_tc_entries(q, skb))
+ if (oper && taprio_dump_tc_entries(skb, oper))
goto options_error;
if (oper && dump_schedule(skb, oper))
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index dfea27a906f2..9b47c8409231 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -39,6 +39,7 @@
#include "node.h"
#include "net.h"
#include <net/genetlink.h>
+#include <linux/string_helpers.h>
#include <linux/tipc_config.h>
/* The legacy API had an artificial message length limit called
@@ -173,11 +174,6 @@ static struct sk_buff *tipc_get_err_tlv(char *str)
return buf;
}
-static inline bool string_is_valid(char *s, int len)
-{
- return memchr(s, '\0', len) ? true : false;
-}
-
static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
struct tipc_nl_compat_msg *msg,
struct sk_buff *arg)
@@ -445,7 +441,7 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_BEARER_NAME);
- if (!string_is_valid(b->name, len))
+ if (!string_is_terminated(b->name, len))
return -EINVAL;
if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, b->name))
@@ -486,7 +482,7 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_BEARER_NAME);
- if (!string_is_valid(name, len))
+ if (!string_is_terminated(name, len))
return -EINVAL;
if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, name))
@@ -584,7 +580,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_LINK_NAME);
- if (!string_is_valid(name, len))
+ if (!string_is_terminated(name, len))
return -EINVAL;
if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
@@ -819,7 +815,7 @@ static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_LINK_NAME);
- if (!string_is_valid(lc->name, len))
+ if (!string_is_terminated(lc->name, len))
return -EINVAL;
media = tipc_media_find(lc->name);
@@ -856,7 +852,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_LINK_NAME);
- if (!string_is_valid(name, len))
+ if (!string_is_terminated(name, len))
return -EINVAL;
if (nla_put_string(skb, TIPC_NLA_LINK_NAME, name))
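
This conversion drops the local string_is_valid() helper removed above in favour of string_is_terminated() from <linux/string_helpers.h>, which performs the same bounded check for a terminating NUL within the first len bytes. A minimal userspace illustration of that check, using only standard C:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Same idea as the removed helper: is there a NUL within the first len bytes? */
static bool is_terminated(const char *s, size_t len)
{
	return memchr(s, '\0', len) != NULL;
}

int main(void)
{
	char ok[16] = "bearer0";		/* NUL-terminated within 16 bytes */
	char bad[4] = { 'e', 't', 'h', '0' };	/* no NUL in the first 4 bytes */

	printf("%d %d\n", is_terminated(ok, sizeof(ok)),
	       is_terminated(bad, sizeof(bad)));
	return 0;
}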
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index a0f62fa02e06..8cbf45a8bcdc 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -5,6 +5,7 @@
* Based on code and translator idea by: Florian Westphal <fw@strlen.de>
*/
#include <linux/compat.h>
+#include <linux/nospec.h>
#include <linux/xfrm.h>
#include <net/xfrm.h>
@@ -302,7 +303,7 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src)
nla_for_each_attr(nla, attrs, len, remaining) {
int err;
- switch (type) {
+ switch (nlh_src->nlmsg_type) {
case XFRM_MSG_NEWSPDINFO:
err = xfrm_nla_cpy(dst, nla, nla_len(nla));
break;
@@ -437,6 +438,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
NL_SET_ERR_MSG(extack, "Bad attribute");
return -EOPNOTSUPP;
}
+ type = array_index_nospec(type, XFRMA_MAX + 1);
if (nla_len(nla) < compat_policy[type].len) {
NL_SET_ERR_MSG(extack, "Attribute bad length");
return -EOPNOTSUPP;
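
The array_index_nospec() call added above clamps the already-validated attribute type before it indexes compat_policy[], so that not even speculative execution can use an out-of-range index (the usual Spectre-v1 hardening pattern). The userspace sketch below mirrors only the validate-then-index bounds logic, not the speculation barrier itself; the table contents and names are made up for illustration:

#include <stdio.h>
#include <stddef.h>

#define ATTR_MAX 7

static const size_t attr_min_len[ATTR_MAX + 1] = {
	/* illustrative minimum lengths, one per attribute type */
	0, 4, 8, 4, 16, 4, 4, 8,
};

/* Validate-then-index: reject out-of-range types before the table lookup.
 * The kernel additionally passes the index through array_index_nospec(),
 * which this sketch does not model.
 */
static int check_attr(unsigned int type, size_t len)
{
	if (type > ATTR_MAX)
		return -1;	/* "Bad attribute" */
	if (len < attr_min_len[type])
		return -2;	/* "Attribute bad length" */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n", check_attr(2, 16), check_attr(9, 16), check_attr(4, 4));
	return 0;
}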
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index c06e54a10540..436d29640ac2 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -279,8 +279,7 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb)
goto out;
if (x->props.flags & XFRM_STATE_DECAP_DSCP)
- ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)),
- ipipv6_hdr(skb));
+ ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb));
if (!(x->props.flags & XFRM_STATE_NOECN))
ipip6_ecn_decapsulate(skb);
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 1f99dc469027..35279c220bd7 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
skb->mark = 0;
}
+static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type, unsigned short family)
+{
+ struct sec_path *sp;
+
+ sp = skb_sec_path(skb);
+ if (sp && (sp->len || sp->olen) &&
+ !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+ goto discard;
+
+ XFRM_SPI_SKB_CB(skb)->family = family;
+ if (family == AF_INET) {
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ } else {
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ }
+
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int xfrmi4_rcv(struct sk_buff *skb)
+{
+ return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET);
+}
+
+static int xfrmi6_rcv(struct sk_buff *skb)
+{
+ return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
+ 0, 0, AF_INET6);
+}
+
+static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+ return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET);
+}
+
+static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+ return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6);
+}
+
static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
{
const struct xfrm_mode *inner_mode;
@@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = {
};
static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
- .handler = xfrm6_rcv,
- .input_handler = xfrm_input,
+ .handler = xfrmi6_rcv,
+ .input_handler = xfrmi6_input,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi6_err,
.priority = 10,
@@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = {
#endif
static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
- .handler = xfrm4_rcv,
- .input_handler = xfrm_input,
+ .handler = xfrmi4_rcv,
+ .input_handler = xfrmi4_input,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi4_err,
.priority = 10,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index e9eb82c5457d..5c61ec04b839 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t)
}
if (xp->lft.hard_use_expires_seconds) {
time64_t tmo = xp->lft.hard_use_expires_seconds +
- (xp->curlft.use_time ? : xp->curlft.add_time) - now;
+ (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
if (tmo <= 0)
goto expired;
if (tmo < next)
@@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t)
}
if (xp->lft.soft_use_expires_seconds) {
time64_t tmo = xp->lft.soft_use_expires_seconds +
- (xp->curlft.use_time ? : xp->curlft.add_time) - now;
+ (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
if (tmo <= 0) {
warn = 1;
tmo = XFRM_KM_TIMEOUT;
@@ -3661,7 +3661,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
return 1;
}
- pol->curlft.use_time = ktime_get_real_seconds();
+ /* This lockless write can happen from different cpus. */
+ WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds());
pols[0] = pol;
npols++;
@@ -3676,7 +3677,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
xfrm_pol_put(pols[0]);
return 0;
}
- pols[1]->curlft.use_time = ktime_get_real_seconds();
+ /* This write can happen from different cpus. */
+ WRITE_ONCE(pols[1]->curlft.use_time,
+ ktime_get_real_seconds());
npols++;
}
}
@@ -3742,6 +3745,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
goto reject;
}
+ if (if_id)
+ secpath_reset(skb);
+
xfrm_pols_put(pols, npols);
return 1;
}
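
As the added comments note, curlft.use_time is now written from the packet path on any CPU without holding the policy lock and read back in the timer, so the accesses are wrapped in READ_ONCE()/WRITE_ONCE() to mark the race as intentional and prevent load/store tearing. A small userspace sketch of the same single-writer-per-event, lockless-reader pattern; relaxed C11 atomics are used here only as a stand-in for the kernel's annotations, which are volatile accesses rather than C11 atomics:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

/* One "last used" timestamp updated per packet, read by a timer. */
static _Atomic long long use_time;	/* 0 means "never used yet" */

static void mark_used(void)
{
	atomic_store_explicit(&use_time, (long long)time(NULL),
			      memory_order_relaxed);
}

static long long last_used_or(long long fallback)
{
	long long t = atomic_load_explicit(&use_time, memory_order_relaxed);

	return t ? t : fallback;	/* mirrors (use_time ? : add_time) */
}

int main(void)
{
	long long add_time = (long long)time(NULL) - 100;

	printf("before: %lld\n", last_used_or(add_time));
	mark_used();
	printf("after:  %lld\n", last_used_or(add_time));
	return 0;
}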
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 59fffa02d1cc..2ab3e09e2227 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -577,7 +577,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (x->km.state == XFRM_STATE_EXPIRED)
goto expired;
if (x->lft.hard_add_expires_seconds) {
- long tmo = x->lft.hard_add_expires_seconds +
+ time64_t tmo = x->lft.hard_add_expires_seconds +
x->curlft.add_time - now;
if (tmo <= 0) {
if (x->xflags & XFRM_SOFT_EXPIRE) {
@@ -594,8 +594,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
next = tmo;
}
if (x->lft.hard_use_expires_seconds) {
- long tmo = x->lft.hard_use_expires_seconds +
- (x->curlft.use_time ? : now) - now;
+ time64_t tmo = x->lft.hard_use_expires_seconds +
+ (READ_ONCE(x->curlft.use_time) ? : now) - now;
if (tmo <= 0)
goto expired;
if (tmo < next)
@@ -604,7 +604,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (x->km.dying)
goto resched;
if (x->lft.soft_add_expires_seconds) {
- long tmo = x->lft.soft_add_expires_seconds +
+ time64_t tmo = x->lft.soft_add_expires_seconds +
x->curlft.add_time - now;
if (tmo <= 0) {
warn = 1;
@@ -616,8 +616,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
}
}
if (x->lft.soft_use_expires_seconds) {
- long tmo = x->lft.soft_use_expires_seconds +
- (x->curlft.use_time ? : now) - now;
+ time64_t tmo = x->lft.soft_use_expires_seconds +
+ (READ_ONCE(x->curlft.use_time) ? : now) - now;
if (tmo <= 0)
warn = 1;
else if (tmo < next)
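
Widening tmo from long to time64_t in the hunks above keeps the lifetime arithmetic in 64 bits even on 32-bit kernels, where long is only 32 bits and a large configured lifetime added to an epoch timestamp no longer fits. A small illustration of the difference on a common two's-complement target (values chosen only to show the wrap; int32_t stands in for a 32-bit long):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t now = 1700000000;			/* seconds since the epoch */
	int64_t add_time = now - 60;			/* state added a minute ago */
	int64_t hard_add_expires = INT64_C(3200000000);	/* > 2^31 seconds of lifetime */

	int64_t tmo64 = hard_add_expires + add_time - now;
	int32_t tmo32 = (int32_t)tmo64;	/* what a 32-bit long would keep */

	printf("time64_t tmo: %" PRId64 "\n", tmo64);	/* large positive: not expired */
	printf("32-bit tmo:   %" PRId32 "\n", tmo32);	/* wraps negative: looks expired */
	return 0;
}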
@@ -1906,7 +1906,7 @@ out:
hrtimer_start(&x1->mtimer, ktime_set(1, 0),
HRTIMER_MODE_REL_SOFT);
- if (x1->curlft.use_time)
+ if (READ_ONCE(x1->curlft.use_time))
xfrm_state_check_expire(x1);
if (x->props.smark.m || x->props.smark.v || x->if_id) {
@@ -1940,8 +1940,8 @@ int xfrm_state_check_expire(struct xfrm_state *x)
{
xfrm_dev_state_update_curlft(x);
- if (!x->curlft.use_time)
- x->curlft.use_time = ktime_get_real_seconds();
+ if (!READ_ONCE(x->curlft.use_time))
+ WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds());
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {