diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 2 | ||||
-rw-r--r-- | net/core/datagram.h | 15 | ||||
-rw-r--r-- | net/core/dev.c | 174 | ||||
-rw-r--r-- | net/core/dev.h | 23 | ||||
-rw-r--r-- | net/core/devlink.c | 303 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 2 | ||||
-rw-r--r-- | net/core/gro.c | 8 | ||||
-rw-r--r-- | net/core/lwt_bpf.c | 7 | ||||
-rw-r--r-- | net/core/neighbour.c | 2 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 21 | ||||
-rw-r--r-- | net/core/page_pool.c | 5 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 250 | ||||
-rw-r--r-- | net/core/secure_seq.c | 16 | ||||
-rw-r--r-- | net/core/skbuff.c | 46 | ||||
-rw-r--r-- | net/core/sock.c | 88 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 14 |
16 files changed, 412 insertions, 564 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index 70126d15ca6e..50f4faeea76c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -62,8 +62,6 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> -#include "datagram.h" - /* * Is a socket 'connection oriented' ? */ diff --git a/net/core/datagram.h b/net/core/datagram.h deleted file mode 100644 index bcfb75bfa3b2..000000000000 --- a/net/core/datagram.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _NET_CORE_DATAGRAM_H_ -#define _NET_CORE_DATAGRAM_H_ - -#include <linux/types.h> - -struct sock; -struct sk_buff; -struct iov_iter; - -int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length); - -#endif /* _NET_CORE_DATAGRAM_H_ */ diff --git a/net/core/dev.c b/net/core/dev.c index 611bd7197064..721ba9c26554 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -682,11 +682,11 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, - .daddr = daddr, }; struct net_device_path *path; int ret = 0; + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; @@ -2993,6 +2993,52 @@ undo_rx: EXPORT_SYMBOL(netif_set_real_num_queues); /** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or @@ -3220,12 +3266,18 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - + if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; @@ -4086,32 +4138,27 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, } /** - * __dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * @sb_dev: suboordinate device used for L2 forwarding offload + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) - * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors */ -static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; @@ -4235,18 +4282,7 @@ out: rcu_read_unlock_bh(); return rc; } - -int dev_queue_xmit(struct sk_buff *skb) -{ - return __dev_queue_xmit(skb, NULL); -} -EXPORT_SYMBOL(dev_queue_xmit); - -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) -{ - return __dev_queue_xmit(skb, sb_dev); -} -EXPORT_SYMBOL(dev_queue_xmit_accel); +EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { @@ -4294,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; +unsigned int sysctl_skb_defer_max __read_mostly = 64; int netdev_budget __read_mostly = 300; /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; @@ -4546,9 +4583,12 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ -static void trigger_rx_softirq(void *data __always_unused) +static void trigger_rx_softirq(void *data) { + struct softnet_data *sd = data; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); } /* @@ -6314,8 +6354,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) { if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; @@ -6348,7 +6388,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight); void napi_disable(struct napi_struct *n) { @@ -6594,7 +6634,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) while (skb != NULL) { next = skb->next; - __kfree_skb(skb); + napi_consume_skb(skb, 1); skb = next; } } @@ -6615,9 +6655,11 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) for (;;) { struct napi_struct *n; + skb_defer_free_flush(sd); + if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto end; break; } @@ -6644,7 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); - skb_defer_free_flush(sd); +end:; } struct netdev_adjacent { @@ -9891,22 +9933,14 @@ void netif_tx_stop_all_queues(struct net_device *dev) EXPORT_SYMBOL(netif_tx_stop_all_queues); /** - * register_netdevice - register a network device - * @dev: device to register + * register_netdevice() - register a network device + * @dev: device to register * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. */ - int register_netdevice(struct net_device *dev) { int ret; @@ -10357,7 +10391,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) { struct net_device_core_stats __percpu *p; @@ -10368,11 +10402,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ - p = READ_ONCE(dev->core_stats); - if (!p) - return NULL; - - return this_cpu_ptr(p); + return READ_ONCE(dev->core_stats); } EXPORT_SYMBOL(netdev_core_stats_alloc); @@ -10409,10 +10439,10 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); - storage->rx_dropped += local_read(&core_stats->rx_dropped); - storage->tx_dropped += local_read(&core_stats->tx_dropped); - storage->rx_nohandler += local_read(&core_stats->rx_nohandler); - storage->rx_otherhost_dropped += local_read(&core_stats->rx_otherhost_dropped); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); + storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); } } return storage; @@ -10571,9 +10601,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gro_max_size = GRO_MAX_SIZE; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP @@ -11355,7 +11387,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif - INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); diff --git a/net/core/dev.h b/net/core/dev.h index 27923df00637..cbb8a925175a 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ extern int netdev_budget; extern unsigned int netdev_budget_usecs; - +extern unsigned int sysctl_skb_defer_max; extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; @@ -88,4 +88,25 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + #endif diff --git a/net/core/devlink.c b/net/core/devlink.c index 5f441a0e34f4..5cc88490f18f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -83,11 +83,10 @@ struct devlink_linecard { const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state and device_list */ + struct mutex state_lock; /* Protects state */ const char *type; struct devlink_linecard_type *types; unsigned int types_count; - struct list_head device_list; }; /** @@ -2059,56 +2058,6 @@ struct devlink_linecard_type { const void *priv; }; -struct devlink_linecard_device { - struct list_head list; - unsigned int index; - void *priv; -}; - -static int -devlink_nl_linecard_device_fill(struct sk_buff *msg, - struct devlink_linecard_device *linecard_device) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); - if (!attr) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, - linecard_device->index)) { - nla_nest_cancel(msg, attr); - return -EMSGSIZE; - } - nla_nest_end(msg, attr); - - return 0; -} - -static int devlink_nl_linecard_devices_fill(struct sk_buff *msg, - struct devlink_linecard *linecard) -{ - struct devlink_linecard_device *linecard_device; - struct nlattr *attr; - int err; - - if (list_empty(&linecard->device_list)) - return 0; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); - if (!attr) - return -EMSGSIZE; - list_for_each_entry(linecard_device, &linecard->device_list, list) { - err = devlink_nl_linecard_device_fill(msg, linecard_device); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, @@ -2119,7 +2068,6 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink_linecard_type *linecard_type; struct nlattr *attr; void *hdr; - int err; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -2152,10 +2100,6 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, nla_nest_end(msg, attr); } - err = devlink_nl_linecard_devices_fill(msg, linecard); - if (err) - goto nla_put_failure; - genlmsg_end(msg, hdr); return 0; @@ -2425,191 +2369,6 @@ static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, return 0; } -struct devlink_info_req { - struct sk_buff *msg; -}; - -static int -devlink_nl_linecard_device_info_fill(struct sk_buff *msg, - struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device, - struct netlink_ext_ack *extack) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); - if (!attr) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, - linecard_device->index)) { - nla_nest_cancel(msg, attr); - return -EMSGSIZE; - } - if (linecard->ops->device_info_get) { - struct devlink_info_req req; - int err; - - req.msg = msg; - err = linecard->ops->device_info_get(linecard_device, - linecard_device->priv, - &req, extack); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - -static int devlink_nl_linecard_devices_info_fill(struct sk_buff *msg, - struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - struct devlink_linecard_device *linecard_device; - struct nlattr *attr; - int err; - - if (list_empty(&linecard->device_list)) - return 0; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); - if (!attr) - return -EMSGSIZE; - list_for_each_entry(linecard_device, &linecard->device_list, list) { - err = devlink_nl_linecard_device_info_fill(msg, linecard, - linecard_device, - extack); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - -static int -devlink_nl_linecard_info_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_linecard *linecard, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, struct netlink_ext_ack *extack) -{ - struct devlink_info_req req; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) - goto nla_put_failure; - - req.msg = msg; - err = linecard->ops->info_get(linecard, linecard->priv, &req, extack); - if (err) - goto nla_put_failure; - - err = devlink_nl_linecard_devices_info_fill(msg, linecard, extack); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_linecard_info_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_linecard *linecard = info->user_ptr[1]; - struct devlink *devlink = linecard->devlink; - struct sk_buff *msg; - int err; - - if (!linecard->ops->info_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_info_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_INFO_GET, - info->snd_portid, info->snd_seq, 0, - info->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_linecard_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < start || !linecard->ops->info_get) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_info_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&devlink->linecards_lock); -retry: - devlink_put(devlink); - } -out: - mutex_unlock(&devlink_mutex); - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -6602,6 +6361,10 @@ out_dev: return err; } +struct devlink_info_req { + struct sk_buff *msg; +}; + int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) { return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); @@ -9322,13 +9085,6 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, }, { - .cmd = DEVLINK_CMD_LINECARD_INFO_GET, - .doit = devlink_nl_cmd_linecard_info_get_doit, - .dumpit = devlink_nl_cmd_linecard_info_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, @@ -10508,7 +10264,6 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, linecard->priv = priv; linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; mutex_init(&linecard->state_lock); - INIT_LIST_HEAD(&linecard->device_list); err = devlink_linecard_types_init(linecard); if (err) { @@ -10536,7 +10291,6 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) struct devlink *devlink = linecard->devlink; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - WARN_ON(!list_empty(&linecard->device_list)); mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); @@ -10546,52 +10300,6 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) EXPORT_SYMBOL_GPL(devlink_linecard_destroy); /** - * devlink_linecard_device_create - Create a device on linecard - * - * @linecard: devlink linecard - * @device_index: index of the linecard device - * @priv: user priv pointer - * - * Return: Line card device structure or an ERR_PTR() encoded error code. - */ -struct devlink_linecard_device * -devlink_linecard_device_create(struct devlink_linecard *linecard, - unsigned int device_index, void *priv) -{ - struct devlink_linecard_device *linecard_device; - - linecard_device = kzalloc(sizeof(*linecard_device), GFP_KERNEL); - if (!linecard_device) - return ERR_PTR(-ENOMEM); - linecard_device->index = device_index; - linecard_device->priv = priv; - mutex_lock(&linecard->state_lock); - list_add_tail(&linecard_device->list, &linecard->device_list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - return linecard_device; -} -EXPORT_SYMBOL_GPL(devlink_linecard_device_create); - -/** - * devlink_linecard_device_destroy - Destroy device on linecard - * - * @linecard: devlink linecard - * @linecard_device: devlink linecard device - */ -void -devlink_linecard_device_destroy(struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device) -{ - mutex_lock(&linecard->state_lock); - list_del(&linecard_device->list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - kfree(linecard_device); -} -EXPORT_SYMBOL_GPL(devlink_linecard_device_destroy); - -/** * devlink_linecard_provision_set - Set provisioning on linecard * * @linecard: devlink linecard @@ -10623,7 +10331,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); - WARN_ON(!list_empty(&linecard->device_list)); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b89e3e95bffc..41cac0e4834e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -517,7 +517,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - if ((unsigned int)reason >= SKB_DROP_REASON_MAX) + if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0)) reason = SKB_DROP_REASON_NOT_SPECIFIED; cb = NET_DM_SKB_CB(nskb); cb->reason = reason; diff --git a/net/core/gro.c b/net/core/gro.c index 78110edf5d4b..b4190eb08467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -167,6 +167,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 349480ef68a5..8b6b5e72b217 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -159,10 +159,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst->lwtstate->orig_output(net, sk, skb); } -static int xmit_check_hhlen(struct sk_buff *skb) +static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) { - int hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -274,6 +272,7 @@ static int bpf_xmit(struct sk_buff *skb) bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { + int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; @@ -291,7 +290,7 @@ static int bpf_xmit(struct sk_buff *skb) /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ - ret = xmit_check_hhlen(skb); + ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f64ebd050f6c..47b6c1f0fdbb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3728,7 +3728,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; - t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4980c3a50475..e319e242dddf 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -746,7 +746,6 @@ static const struct attribute_group netstat_group = { .attrs = netstat_attrs, }; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) static struct attribute *wireless_attrs[] = { NULL }; @@ -755,7 +754,19 @@ static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; + +static bool wireless_group_needed(struct net_device *ndev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + if (ndev->ieee80211_ptr) + return true; #endif +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + if (ndev->wireless_handlers) + return true; +#endif + return false; +} #else /* CONFIG_SYSFS */ #define net_class_groups NULL @@ -1996,14 +2007,8 @@ int netdev_register_kobject(struct net_device *ndev) *groups++ = &netstat_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (ndev->ieee80211_ptr) - *groups++ = &wireless_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (ndev->wireless_handlers) + if (wireless_group_needed(ndev)) *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index bdbadfaee867..f18e6e771993 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -704,8 +704,10 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, if (page && *offset + size > max_size) { page = page_pool_drain_frag(pool, page); - if (page) + if (page) { + alloc_stat_inc(pool, fast); goto frag_reset; + } } if (!page) { @@ -727,6 +729,7 @@ frag_reset: pool->frag_users++; pool->frag_offset = *offset + size; + alloc_stat_inc(pool, fast); return page; } EXPORT_SYMBOL(page_pool_alloc_frag); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 73f2cbc440c9..ac45328607f7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1064,6 +1064,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1769,6 +1771,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1922,6 +1926,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, + [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2306,6 +2312,19 @@ invalid_attr: return -EINVAL; } +static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_set_vf_rate) + return -EOPNOTSUPP; + if (max_tx_rate && max_tx_rate < min_tx_rate) + return -EINVAL; + + return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { @@ -2341,14 +2360,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } - if (tb[IFLA_GRO_MAX_SIZE]) { - u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); - - if (gro_max_size > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_max_size"); - return -EINVAL; - } - } return 0; } @@ -2443,11 +2454,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (err < 0) return err; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivf.min_tx_rate, - ivt->rate); + err = rtnl_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } @@ -2457,11 +2465,9 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (ivt->vf >= INT_MAX) return -EINVAL; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivt->min_tx_rate, - ivt->max_tx_rate); + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } @@ -2803,7 +2809,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE) { + if (max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } @@ -2817,7 +2823,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS) { + if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { err = -EINVAL; goto errout; } @@ -3302,23 +3308,116 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attr, struct netlink_ext_ack *extack) +static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, + const struct rtnl_link_ops *ops, + struct nlattr **tb, struct nlattr **data, + struct netlink_ext_ack *extack) { - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; + struct net *net = sock_net(skb->sk); + struct net *dest_net, *link_net; + struct net_device *dev; + char ifname[IFNAMSIZ]; + int err; + + if (!ops->alloc && !ops->setup) + return -EOPNOTSUPP; + + if (tb[IFLA_IFNAME]) { + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + } else { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } + + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; + goto out; + } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } + + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } + + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + else + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; + } + + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); + if (err < 0) + goto out_unregister; + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } +out: + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; +out_unregister: + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; +} + +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtnl_newlink_tbs *tbs, + struct netlink_ext_ack *extack) +{ struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr ** const tb = tbs->tb; const struct rtnl_link_ops *m_ops; struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - struct nlattr *tb[IFLA_MAX + 1]; - struct net *dest_net, *link_net; struct nlattr **slave_data; char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr **data; bool link_specified; int err; @@ -3382,12 +3481,12 @@ replay: return -EINVAL; if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(attr, ops->maxtype, + err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (err < 0) return err; - data = attr; + data = tbs->attr; } if (ops->validate) { err = ops->validate(tb, data, extack); @@ -3403,14 +3502,14 @@ replay: if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested_deprecated(slave_attr, + err = nla_parse_nested_deprecated(tbs->slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; - slave_data = slave_attr; + slave_data = tbs->slave_attr; } } @@ -3478,96 +3577,21 @@ replay: return -EOPNOTSUPP; } - if (!ops->alloc && !ops->setup) - return -EOPNOTSUPP; - - if (tb[IFLA_IFNAME]) { - nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - } else { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } - - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); - goto out; - } - - dev->ifindex = ifm->ifi_index; - - if (ops->newlink) - err = ops->newlink(link_net ? : net, dev, tb, data, extack); - else - err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } - - err = rtnl_configure_link(dev, ifm); - if (err < 0) - goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); - if (err) - goto out_unregister; - } -out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; -out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + return rtnl_newlink_create(skb, ifm, ops, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct nlattr **attr; + struct rtnl_newlink_tbs *tbs; int ret; - attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); - if (!attr) + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) return -ENOMEM; - ret = __rtnl_newlink(skb, nlh, attr, extack); - kfree(attr); + ret = __rtnl_newlink(skb, nlh, tbs, extack); + kfree(tbs); return ret; } @@ -4240,7 +4264,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, @@ -4258,7 +4282,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 9b8443774449..5f85e01d4093 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -22,6 +22,8 @@ static siphash_aligned_key_t net_secret; static siphash_aligned_key_t ts_secret; +#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) + static __always_inline void net_secret_init(void) { net_get_random_once(&net_secret, sizeof(net_secret)); @@ -94,17 +96,19 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, } EXPORT_SYMBOL(secure_tcpv6_seq); -u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { const struct { struct in6_addr saddr; struct in6_addr daddr; + unsigned int timeseed; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, - .dport = dport + .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + .dport = dport, }; net_secret_init(); return siphash(&combined, offsetofend(typeof(combined), dport), @@ -142,11 +146,13 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr, } EXPORT_SYMBOL_GPL(secure_tcp_seq); -u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { net_secret_init(); - return siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u16)dport, &net_secret); + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, + jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00980e62415b..5b3559cb1d82 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -80,7 +80,7 @@ #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> -#include "datagram.h" +#include "dev.h" #include "sock_destructor.h" struct kmem_cache *skbuff_head_cache __ro_after_init; @@ -772,6 +772,8 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) if (!skb_unref(skb)) return; + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } @@ -1166,7 +1168,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; struct sk_buff *skb; @@ -1197,7 +1199,6 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return uarg; } -EXPORT_SYMBOL_GPL(msg_zerocopy_alloc); static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) { @@ -1340,18 +1341,11 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); -int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) -{ - return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); -} -EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); - int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) { struct ubuf_info *orig_uarg = skb_zcopy(skb); - struct iov_iter orig_iter = msg->msg_iter; int err, orig_len = skb->len; /* An skb can only point to one uarg. This edge case happens when @@ -1365,7 +1359,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct sock *save_sk = skb->sk; /* Streams do not free skb on error. Reset to prev state. */ - msg->msg_iter = orig_iter; + iov_iter_revert(&msg->msg_iter, skb->len - orig_len); skb->sk = sk; ___pskb_trim(skb, orig_len); skb->sk = save_sk; @@ -3898,7 +3892,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; - int err; + int len_diff, err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3938,9 +3932,11 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); + len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); @@ -6501,37 +6497,35 @@ void skb_attempt_defer_free(struct sk_buff *skb) int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned long flags; + unsigned int defer_max; bool kick; if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu) || cpu == raw_smp_processor_id()) { - __kfree_skb(skb); +nodefer: __kfree_skb(skb); return; } sd = &per_cpu(softnet_data, cpu); - /* We do not send an IPI or any signal. - * Remote cpu will eventually call skb_defer_free_flush() - */ + defer_max = READ_ONCE(sysctl_skb_defer_max); + if (READ_ONCE(sd->defer_count) >= defer_max) + goto nodefer; + spin_lock_irqsave(&sd->defer_lock, flags); + /* Send an IPI every time queue reaches half capacity. */ + kick = sd->defer_count == (defer_max >> 1); + /* Paired with the READ_ONCE() few lines above */ + WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ WRITE_ONCE(sd->defer_list, skb); - sd->defer_count++; - - /* kick every time queue length reaches 128. - * This should avoid blocking in smp_call_function_single_async(). - * This condition should hardly be bit under normal conditions, - * unless cpu suddenly stopped to receive NIC interrupts. - */ - kick = sd->defer_count == 128; - spin_unlock_irqrestore(&sd->defer_lock, flags); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick)) + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } diff --git a/net/core/sock.c b/net/core/sock.c index a0f3989de3d6..2ff40dd0a7a6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -146,6 +146,9 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space(struct sock *sk); + /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through @@ -632,7 +635,9 @@ static int sock_bindtoindex_locked(struct sock *sk, int ifindex) if (ifindex < 0) goto out; - sk->sk_bound_dev_if = ifindex; + /* Paired with all READ_ONCE() done locklessly. */ + WRITE_ONCE(sk->sk_bound_dev_if, ifindex); + if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); @@ -710,10 +715,11 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; - if (sk->sk_bound_dev_if == 0) { + if (bound_dev_if == 0) { len = 0; goto zero; } @@ -722,7 +728,7 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; - ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; @@ -1311,6 +1317,15 @@ set_sndbuf: __sock_set_mark(sk, val); break; + case SO_RCVMARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + + sock_valbool_flag(sk, SOCK_RCVMARK, valbool); + break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); @@ -1737,6 +1752,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_mark; break; + case SO_RCVMARK: + v.val = sock_flag(sk, SOCK_RCVMARK); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; @@ -1845,7 +1864,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_BINDTOIFINDEX: - v.val = sk->sk_bound_dev_if; + v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: @@ -2277,6 +2296,19 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; @@ -2296,6 +2328,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); @@ -2317,8 +2350,20 @@ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; + bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + if (sock_flag(sk, SOCK_RCU_FREE) && + sk->sk_write_space == sock_def_write_space) { + rcu_read_lock(); + free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); + sock_def_write_space_wfree(sk); + rcu_read_unlock(); + if (unlikely(free)) + __sk_free(sk); + return; + } + /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call @@ -2628,13 +2673,6 @@ failure: } EXPORT_SYMBOL(sock_alloc_send_pskb); -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, - int noblock, int *errcode) -{ - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); -} -EXPORT_SYMBOL(sock_alloc_send_skb); - int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { @@ -3191,20 +3229,42 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { + if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } +/* An optimised version of sock_def_write_space(), should only be called + * for SOCK_RCU_FREE sockets under RCU read section and after putting + * ->sk_wmem_alloc. + */ +static void sock_def_write_space_wfree(struct sock *sk) +{ + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if (sock_writeable(sk)) { + struct socket_wq *wq = rcu_dereference(sk->sk_wq); + + /* rely on refcount_sub from sock_wfree() */ + smp_mb__after_atomic(); + if (wq && waitqueue_active(&wq->wait)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } +} + static void sock_def_destruct(struct sock *sk) { } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ca4527fcbc75..71a13596ea2b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,12 +25,10 @@ #include "dev.h" -static int three = 3; static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; -static long long_one __maybe_unused = 1; static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -424,7 +422,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, - .extra1 = &long_one, + .extra1 = SYSCTL_LONG_ONE, .extra2 = &bpf_jit_limit_max, }, #endif @@ -560,7 +558,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &three, + .extra2 = SYSCTL_THREE, }, { .procname = "high_order_alloc_disable", @@ -586,6 +584,14 @@ static struct ctl_table net_core_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, + { + .procname = "skb_defer_max", + .data = &sysctl_skb_defer_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, { } }; |