diff options
Diffstat (limited to 'net')
55 files changed, 2889 insertions, 2992 deletions
diff --git a/net/Makefile b/net/Makefile index 6a62e5b27378..0914bea9c335 100644 --- a/net/Makefile +++ b/net/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ +obj-$(CONFIG_NET_DEVLINK) += devlink/ obj-$(CONFIG_NET_DSA) += dsa/ obj-$(CONFIG_ATALK) += appletalk/ obj-$(CONFIG_X25) += x25/ diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 748be7253248..1f2c1d7b90e2 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -533,10 +533,6 @@ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) goto err; - ret = -EINVAL; - if (unlikely(msg->msg_iter.nr_segs == 0) || - unlikely(msg->msg_iter.iov->iov_base == NULL)) - goto err; noblock = msg->msg_flags & MSG_DONTWAIT; timeo = sock_sndtimeo(sk, noblock); diff --git a/net/core/Makefile b/net/core/Makefile index 5857cec87b83..10edd66a8a37 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o -obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..cf78f35bc0b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL(register_netdevice_notifier_net); * @nb: notifier * * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the + * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * diff --git a/net/core/dst.c b/net/core/dst.c index 6d2dd03dafa8..31c08a3386d3 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -82,12 +82,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) diff --git a/net/core/filter.c b/net/core/filter.c index 43cc1fe58a2c..7a2b67893afd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4128,9 +4128,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * - * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), - * which will flush all the different bulk queues, thus completing the - * redirect. + * 3. Before exiting its NAPI poll loop, the driver will call + * xdp_do_flush(), which will flush all the different bulk queues, + * thus completing the redirect. Note that xdp_do_flush() must be + * called before napi_complete_done() in the driver, as the + * XDP_REDIRECT logic relies on being inside a single NAPI instance + * through to the xdp_do_flush() call for RCU protection of all + * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of @@ -4618,7 +4622,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -4656,6 +4661,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags &= ~TUNNEL_CSUM; if (flags & BPF_F_SEQ_NUMBER) info->key.tun_flags |= TUNNEL_SEQ; + if (flags & BPF_F_NO_TUNNEL_KEY) + info->key.tun_flags &= ~TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4a0eb5593275..4e73ab3482b8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -386,8 +386,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc - * This means that if @frag_size is not zero, then @data must be backed - * by a page fragment, not kmalloc() or vmalloc() */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { @@ -406,7 +404,7 @@ EXPORT_SYMBOL(build_skb); * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) @@ -428,7 +426,7 @@ EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. @@ -452,7 +450,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. @@ -932,6 +930,21 @@ void __kfree_skb(struct sk_buff *skb) } EXPORT_SYMBOL(__kfree_skb); +static __always_inline +bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + if (unlikely(!skb_unref(skb))) + return false; + + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + + if (reason == SKB_CONSUMED) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason); + return true; +} + /** * kfree_skb_reason - free an sk_buff with special reason * @skb: buffer to free @@ -944,28 +957,59 @@ EXPORT_SYMBOL(__kfree_skb); void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { - if (unlikely(!skb_unref(skb))) + if (__kfree_skb_reason(skb, reason)) + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb_reason); + +#define KFREE_SKB_BULK_SIZE 16 + +struct skb_free_array { + unsigned int skb_count; + void *skb_array[KFREE_SKB_BULK_SIZE]; +}; + +static void kfree_skb_add_bulk(struct sk_buff *skb, + struct skb_free_array *sa, + enum skb_drop_reason reason) +{ + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); return; + } - DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + skb_release_all(skb, reason); + sa->skb_array[sa->skb_count++] = skb; - if (reason == SKB_CONSUMED) - trace_consume_skb(skb); - else - trace_kfree_skb(skb, __builtin_return_address(0), reason); - __kfree_skb(skb); + if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, KFREE_SKB_BULK_SIZE, + sa->skb_array); + sa->skb_count = 0; + } } -EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list_reason(struct sk_buff *segs, - enum skb_drop_reason reason) +void __fix_address +kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { + struct skb_free_array sa; + + sa.skb_count = 0; + while (segs) { struct sk_buff *next = segs->next; - kfree_skb_reason(segs, reason); + skb_mark_not_on_list(segs); + + if (__kfree_skb_reason(segs, reason)) + kfree_skb_add_bulk(segs, &sa, reason); + segs = next; } + + if (sa.skb_count) + kmem_cache_free_bulk(skbuff_head_cache, sa.skb_count, + sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5b1ce656baa1..e7b98162c632 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -643,11 +643,6 @@ static __net_init int sysctl_core_net_init(struct net *net) for (tmp = tbl; tmp->procname; tmp++) tmp->data += (char *)net - (char *)&init_net; - - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; - } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index f9949e051f49..c0c438128575 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -178,6 +178,7 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { }; static LIST_HEAD(dcb_app_list); +static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) @@ -1099,11 +1100,46 @@ out: return err; } +/* Set or delete APP table or rewrite table entries. The APP struct is validated + * and the appropriate callback function is called. + */ +static int dcbnl_app_table_setdel(struct nlattr *attr, + struct net_device *netdev, + int (*setdel)(struct net_device *dev, + struct dcb_app *app)) +{ + struct dcb_app *app_data; + enum ieee_attrs_app type; + struct nlattr *attr_itr; + int rem, err; + + nla_for_each_nested(attr_itr, attr, rem) { + type = nla_type(attr_itr); + + if (!dcbnl_app_attr_type_validate(type)) + continue; + + if (nla_len(attr_itr) < sizeof(struct dcb_app)) + return -ERANGE; + + app_data = nla_data(attr_itr); + + if (!dcbnl_app_selector_validate(type, app_data->selector)) + return -EINVAL; + + err = setdel(netdev, app_data); + if (err) + return err; + } + + return 0; +} + /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; - struct nlattr *ieee, *app; + struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; @@ -1206,6 +1242,27 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); + rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); + if (!rewr) + return -EMSGSIZE; + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == netdev->ifindex) { + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(itr->app.selector); + err = nla_put(skb, type, sizeof(itr->app), &itr->app); + if (err) { + spin_unlock_bh(&dcb_lock); + nla_nest_cancel(skb, rewr); + return -EMSGSIZE; + } + } + } + + spin_unlock_bh(&dcb_lock); + nla_nest_end(skb, rewr); + if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) @@ -1567,37 +1624,20 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, goto err; } - if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { - enum ieee_attrs_app type = nla_type(attr); - struct dcb_app *app_data; - - if (!dcbnl_app_attr_type_validate(type)) - continue; - - if (nla_len(attr) < sizeof(struct dcb_app)) { - err = -ERANGE; - goto err; - } - - app_data = nla_data(attr); - - if (!dcbnl_app_selector_validate(type, - app_data->selector)) { - err = -EINVAL; - goto err; - } + if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], + netdev, + ops->dcbnl_setrewr ?: dcb_setrewr); + if (err) + goto err; + } - if (ops->ieee_setapp) - err = ops->ieee_setapp(netdev, app_data); - else - err = dcb_ieee_setapp(netdev, app_data); - if (err) - goto err; - } + if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], + netdev, ops->ieee_setapp ?: + dcb_ieee_setapp); + if (err) + goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { @@ -1684,31 +1724,19 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { - enum ieee_attrs_app type = nla_type(attr); - struct dcb_app *app_data; - - if (!dcbnl_app_attr_type_validate(type)) - continue; - - app_data = nla_data(attr); - - if (!dcbnl_app_selector_validate(type, - app_data->selector)) { - err = -EINVAL; - goto err; - } + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], + netdev, ops->ieee_delapp ?: + dcb_ieee_delapp); + if (err) + goto err; + } - if (ops->ieee_delapp) - err = ops->ieee_delapp(netdev, app_data); - else - err = dcb_ieee_delapp(netdev, app_data); - if (err) - goto err; - } + if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { + err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], + netdev, + ops->dcbnl_delrewr ?: dcb_delrewr); + if (err) + goto err; } err: @@ -1939,6 +1967,22 @@ out: return ret; } +static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, + int ifindex, int proto) +{ + struct dcb_app_type *itr; + + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->app.selector == app->selector && + itr->app.priority == app->priority && + itr->ifindex == ifindex && + ((proto == -1) || itr->app.protocol == proto)) + return itr; + } + + return NULL; +} + static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { @@ -1955,7 +1999,8 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, return NULL; } -static int dcb_app_add(const struct dcb_app *app, int ifindex) +static int dcb_app_add(struct list_head *list, const struct dcb_app *app, + int ifindex) { struct dcb_app_type *entry; @@ -1965,7 +2010,7 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex) memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; - list_add(&entry->list, &dcb_app_list); + list_add(&entry->list, list); return 0; } @@ -2028,7 +2073,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) } /* App type does not exist add new application type */ if (new->priority) - err = dcb_app_add(new, dev->ifindex); + err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) @@ -2061,6 +2106,63 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) } EXPORT_SYMBOL(dcb_ieee_getapp_mask); +/* Get protocol value from rewrite entry. */ +u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) +{ + struct dcb_app_type *itr; + u16 proto = 0; + + spin_lock_bh(&dcb_lock); + itr = dcb_rewr_lookup(app, dev->ifindex, -1); + if (itr) + proto = itr->app.protocol; + spin_unlock_bh(&dcb_lock); + + return proto; +} +EXPORT_SYMBOL(dcb_getrewr); + + /* Add rewrite entry to the rewrite list. */ +int dcb_setrewr(struct net_device *dev, struct dcb_app *new) +{ + int err; + + spin_lock_bh(&dcb_lock); + /* Search for existing match and abort if found. */ + if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { + err = -EEXIST; + goto out; + } + + err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); +out: + spin_unlock_bh(&dcb_lock); + + return err; +} +EXPORT_SYMBOL(dcb_setrewr); + +/* Delete rewrite entry from the rewrite list. */ +int dcb_delrewr(struct net_device *dev, struct dcb_app *del) +{ + struct dcb_app_type *itr; + int err = -ENOENT; + + spin_lock_bh(&dcb_lock); + /* Search for existing match and remove it. */ + itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); + if (itr) { + list_del(&itr->list); + kfree(itr); + err = 0; + } + + spin_unlock_bh(&dcb_lock); + + return err; +} +EXPORT_SYMBOL(dcb_delrewr); + /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface @@ -2088,7 +2190,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) goto out; } - err = dcb_app_add(new, dev->ifindex); + err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) @@ -2130,6 +2232,58 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); +/* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from + * priorities to the PCP and DEI values assigned to that priority. + */ +void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, + struct dcb_rewr_prio_pcp_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == DCB_APP_SEL_PCP && + itr->app.protocol < 16 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1 << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); + +/* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from + * priorities to the DSCP values assigned to that priority. + */ +void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_rewr_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1ULL << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); + /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map diff --git a/net/devlink/Makefile b/net/devlink/Makefile new file mode 100644 index 000000000000..1b1eeac59cb3 --- /dev/null +++ b/net/devlink/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := leftover.o core.o netlink.o diff --git a/net/devlink/core.c b/net/devlink/core.c new file mode 100644 index 000000000000..6c0e2fc57e45 --- /dev/null +++ b/net/devlink/core.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> + +#include "devl_internal.h" + +DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); + +void *devlink_priv(struct devlink *devlink) +{ + return &devlink->priv; +} +EXPORT_SYMBOL_GPL(devlink_priv); + +struct devlink *priv_to_devlink(void *priv) +{ + return container_of(priv, struct devlink, priv); +} +EXPORT_SYMBOL_GPL(priv_to_devlink); + +struct device *devlink_to_dev(const struct devlink *devlink) +{ + return devlink->dev; +} +EXPORT_SYMBOL_GPL(devlink_to_dev); + +struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} +EXPORT_SYMBOL_GPL(devlink_net); + +void devl_assert_locked(struct devlink *devlink) +{ + lockdep_assert_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_assert_locked); + +#ifdef CONFIG_LOCKDEP +/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ +bool devl_lock_is_held(struct devlink *devlink) +{ + return lockdep_is_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock_is_held); +#endif + +void devl_lock(struct devlink *devlink) +{ + mutex_lock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock); + +int devl_trylock(struct devlink *devlink) +{ + return mutex_trylock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_trylock); + +void devl_unlock(struct devlink *devlink) +{ + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_unlock); + +/** + * devlink_try_get() - try to obtain a reference on a devlink instance + * @devlink: instance to reference + * + * Obtain a reference on a devlink instance. A reference on a devlink instance + * only implies that it's safe to take the instance lock. It does not imply + * that the instance is registered, use devl_is_registered() after taking + * the instance lock to check registration status. + */ +struct devlink *__must_check devlink_try_get(struct devlink *devlink) +{ + if (refcount_inc_not_zero(&devlink->refcount)) + return devlink; + return NULL; +} + +static void devlink_release(struct work_struct *work) +{ + struct devlink *devlink; + + devlink = container_of(to_rcu_work(work), struct devlink, rwork); + + mutex_destroy(&devlink->lock); + lockdep_unregister_key(&devlink->lock_key); + kfree(devlink); +} + +void devlink_put(struct devlink *devlink) +{ + if (refcount_dec_and_test(&devlink->refcount)) + queue_rcu_work(system_wq, &devlink->rwork); +} + +struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) +{ + struct devlink *devlink = NULL; + + rcu_read_lock(); +retry: + devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); + if (!devlink) + goto unlock; + + if (!devlink_try_get(devlink)) + goto next; + if (!net_eq(devlink_net(devlink), net)) { + devlink_put(devlink); + goto next; + } +unlock: + rcu_read_unlock(); + return devlink; + +next: + (*indexp)++; + goto retry; +} + +/** + * devlink_set_features - Set devlink supported features + * + * @devlink: devlink + * @features: devlink support features + * + * This interface allows us to set reload ops separatelly from + * the devlink_alloc. + */ +void devlink_set_features(struct devlink *devlink, u64 features) +{ + WARN_ON(features & DEVLINK_F_RELOAD && + !devlink_reload_supported(devlink->ops)); + devlink->features = features; +} +EXPORT_SYMBOL_GPL(devlink_set_features); + +/** + * devl_register - Register devlink instance + * @devlink: devlink + */ +int devl_register(struct devlink *devlink) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + devl_assert_locked(devlink); + + xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); + devlink_notify_register(devlink); + + return 0; +} +EXPORT_SYMBOL_GPL(devl_register); + +void devlink_register(struct devlink *devlink) +{ + devl_lock(devlink); + devl_register(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devl_unregister - Unregister devlink instance + * @devlink: devlink + */ +void devl_unregister(struct devlink *devlink) +{ + ASSERT_DEVLINK_REGISTERED(devlink); + devl_assert_locked(devlink); + + devlink_notify_unregister(devlink); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} +EXPORT_SYMBOL_GPL(devl_unregister); + +void devlink_unregister(struct devlink *devlink) +{ + devl_lock(devlink); + devl_unregister(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_alloc_ns - Allocate new devlink instance resources + * in specific namespace + * + * @ops: ops + * @priv_size: size of user private data + * @net: net namespace + * @dev: parent device + * + * Allocate new devlink instance resources, including devlink index + * and name. + */ +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev) +{ + struct devlink *devlink; + static u32 last_id; + int ret; + + WARN_ON(!ops || !dev); + if (!devlink_reload_actions_valid(ops)) + return NULL; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + + ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret < 0) + goto err_xa_alloc; + + devlink->netdevice_nb.notifier_call = devlink_port_netdevice_event; + ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); + if (ret) + goto err_register_netdevice_notifier; + + devlink->dev = dev; + devlink->ops = ops; + xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); + xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); + write_pnet(&devlink->_net, net); + INIT_LIST_HEAD(&devlink->rate_list); + INIT_LIST_HEAD(&devlink->linecard_list); + INIT_LIST_HEAD(&devlink->sb_list); + INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->param_list); + INIT_LIST_HEAD(&devlink->region_list); + INIT_LIST_HEAD(&devlink->reporter_list); + INIT_LIST_HEAD(&devlink->trap_list); + INIT_LIST_HEAD(&devlink->trap_group_list); + INIT_LIST_HEAD(&devlink->trap_policer_list); + INIT_RCU_WORK(&devlink->rwork, devlink_release); + lockdep_register_key(&devlink->lock_key); + mutex_init(&devlink->lock); + lockdep_set_class(&devlink->lock, &devlink->lock_key); + refcount_set(&devlink->refcount, 1); + + return devlink; + +err_register_netdevice_notifier: + xa_erase(&devlinks, devlink->index); +err_xa_alloc: + kfree(devlink); + return NULL; +} +EXPORT_SYMBOL_GPL(devlink_alloc_ns); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + + WARN_ON(!list_empty(&devlink->trap_policer_list)); + WARN_ON(!list_empty(&devlink->trap_group_list)); + WARN_ON(!list_empty(&devlink->trap_list)); + WARN_ON(!list_empty(&devlink->reporter_list)); + WARN_ON(!list_empty(&devlink->region_list)); + WARN_ON(!list_empty(&devlink->param_list)); + WARN_ON(!list_empty(&devlink->resource_list)); + WARN_ON(!list_empty(&devlink->dpipe_table_list)); + WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(!list_empty(&devlink->linecard_list)); + WARN_ON(!xa_empty(&devlink->ports)); + + xa_destroy(&devlink->snapshot_ids); + xa_destroy(&devlink->ports); + + WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), + &devlink->netdevice_nb)); + + xa_erase(&devlinks, devlink->index); + + devlink_put(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + u32 actions_performed; + unsigned long index; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + devlinks_xa_for_each_registered_get(net, index, devlink) { + WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); + devl_lock(devlink); + err = 0; + if (devl_is_registered(devlink)) + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); + devl_unlock(devlink); + devlink_put(devlink); + + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); + } +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + +static int __init devlink_init(void) +{ + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; +} + +subsys_initcall(devlink_init); diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h new file mode 100644 index 000000000000..1aa1a9549549 --- /dev/null +++ b/net/devlink/devl_internal.h @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include <net/devlink.h> +#include <net/net_namespace.h> + +#define DEVLINK_REGISTERED XA_MARK_1 + +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + +struct devlink { + u32 index; + struct xarray ports; + struct list_head rate_list; + struct list_head sb_list; + struct list_head dpipe_table_list; + struct list_head resource_list; + struct list_head param_list; + struct list_head region_list; + struct list_head reporter_list; + struct devlink_dpipe_headers *dpipe_headers; + struct list_head trap_list; + struct list_head trap_group_list; + struct list_head trap_policer_list; + struct list_head linecard_list; + const struct devlink_ops *ops; + u64 features; + struct xarray snapshot_ids; + struct devlink_dev_stats stats; + struct device *dev; + possible_net_t _net; + /* Serializes access to devlink instance specific objects such as + * port, sb, dpipe, resource, params, region, traps and more. + */ + struct mutex lock; + struct lock_class_key lock_key; + u8 reload_failed:1; + refcount_t refcount; + struct rcu_work rwork; + struct notifier_block netdevice_nb; + char priv[] __aligned(NETDEV_ALIGN); +}; + +extern struct xarray devlinks; +extern struct genl_family devlink_nl_family; + +/* devlink instances are open to the access from the user space after + * devlink_register() call. Such logical barrier allows us to have certain + * expectations related to locking. + * + * Before *_register() - we are in initialization stage and no parallel + * access possible to the devlink instance. All drivers perform that phase + * by implicitly holding device_lock. + * + * After *_register() - users and driver can access devlink instance at + * the same time. + */ +#define ASSERT_DEVLINK_REGISTERED(d) \ + WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) +#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ + WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) + +/* Iterate over devlink pointers which were possible to get reference to. + * devlink_put() needs to be called for each iterated devlink pointer + * in loop body in order to release the reference. + */ +#define devlinks_xa_for_each_registered_get(net, index, devlink) \ + for (index = 0; (devlink = devlinks_xa_find_get(net, &index)); index++) + +struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp); + +static inline bool devl_is_registered(struct devlink *devlink) +{ + devl_assert_locked(devlink); + return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} + +/* Netlink */ +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) +#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) +#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +/* state held across netlink dumps */ +struct devlink_nl_dump_state { + unsigned long instance; + int idx; + union { + /* DEVLINK_CMD_REGION_READ */ + struct { + u64 start_offset; + }; + /* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET */ + struct { + u64 dump_ts; + }; + }; +}; + +struct devlink_gen_cmd { + int (*dump_one)(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb); +}; + +extern const struct genl_small_ops devlink_nl_ops[56]; + +struct devlink * +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); + +void devlink_notify_unregister(struct devlink *devlink); +void devlink_notify_register(struct devlink *devlink); + +int devlink_nl_instance_iter_dump(struct sk_buff *msg, + struct netlink_callback *cb); + +static inline struct devlink_nl_dump_state * +devlink_dump_state(struct netlink_callback *cb) +{ + NL_ASSET_DUMP_CTX_FITS(struct devlink_nl_dump_state); + + return (struct devlink_nl_dump_state *)cb->ctx; +} + +/* gen cmds */ +extern const struct devlink_gen_cmd devl_gen_inst; +extern const struct devlink_gen_cmd devl_gen_port; +extern const struct devlink_gen_cmd devl_gen_sb; +extern const struct devlink_gen_cmd devl_gen_sb_pool; +extern const struct devlink_gen_cmd devl_gen_sb_port_pool; +extern const struct devlink_gen_cmd devl_gen_sb_tc_pool_bind; +extern const struct devlink_gen_cmd devl_gen_selftests; +extern const struct devlink_gen_cmd devl_gen_param; +extern const struct devlink_gen_cmd devl_gen_region; +extern const struct devlink_gen_cmd devl_gen_info; +extern const struct devlink_gen_cmd devl_gen_health_reporter; +extern const struct devlink_gen_cmd devl_gen_trap; +extern const struct devlink_gen_cmd devl_gen_trap_group; +extern const struct devlink_gen_cmd devl_gen_trap_policer; +extern const struct devlink_gen_cmd devl_gen_linecard; + +/* Ports */ +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr); + +struct devlink_port * +devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info); + +/* Reload */ +bool devlink_reload_actions_valid(const struct devlink_ops *ops); +int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack); + +static inline bool devlink_reload_supported(const struct devlink_ops *ops) +{ + return ops->reload_down && ops->reload_up; +} + +/* Line cards */ +struct devlink_linecard; + +struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info); + +/* Rates */ +extern const struct devlink_gen_cmd devl_gen_rate_get; + +struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info); +struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, + struct genl_info *info); diff --git a/net/core/devlink.c b/net/devlink/leftover.c index 032d6d0a5ce6..8eab95cae917 100644 --- a/net/core/devlink.c +++ b/net/devlink/leftover.c @@ -31,58 +31,12 @@ #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> -#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ - (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) - -struct devlink_dev_stats { - u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; -}; - -struct devlink { - u32 index; - struct xarray ports; - struct list_head rate_list; - struct list_head sb_list; - struct list_head dpipe_table_list; - struct list_head resource_list; - struct list_head param_list; - struct list_head region_list; - struct list_head reporter_list; - struct mutex reporters_lock; /* protects reporter_list */ - struct devlink_dpipe_headers *dpipe_headers; - struct list_head trap_list; - struct list_head trap_group_list; - struct list_head trap_policer_list; - struct list_head linecard_list; - struct mutex linecards_lock; /* protects linecard_list */ - const struct devlink_ops *ops; - u64 features; - struct xarray snapshot_ids; - struct devlink_dev_stats stats; - struct device *dev; - possible_net_t _net; - /* Serializes access to devlink instance specific objects such as - * port, sb, dpipe, resource, params, region, traps and more. - */ - struct mutex lock; - struct lock_class_key lock_key; - u8 reload_failed:1; - refcount_t refcount; - struct completion comp; - struct rcu_head rcu; - struct notifier_block netdevice_nb; - char priv[] __aligned(NETDEV_ALIGN); -}; - -struct devlink_linecard_ops; -struct devlink_linecard_type; +#include "devl_internal.h" struct devlink_linecard { struct list_head list; struct devlink *devlink; unsigned int index; - refcount_t refcount; const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; @@ -122,24 +76,6 @@ struct devlink_resource { void *occ_get_priv; }; -void *devlink_priv(struct devlink *devlink) -{ - return &devlink->priv; -} -EXPORT_SYMBOL_GPL(devlink_priv); - -struct devlink *priv_to_devlink(void *priv) -{ - return container_of(priv, struct devlink, priv); -} -EXPORT_SYMBOL_GPL(priv_to_devlink); - -struct device *devlink_to_dev(const struct devlink *devlink) -{ - return devlink->dev; -} -EXPORT_SYMBOL_GPL(devlink_to_dev); - static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { { .name = "destination mac", @@ -211,172 +147,6 @@ static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, }; -static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); -#define DEVLINK_REGISTERED XA_MARK_1 -#define DEVLINK_UNREGISTERING XA_MARK_2 - -/* devlink instances are open to the access from the user space after - * devlink_register() call. Such logical barrier allows us to have certain - * expectations related to locking. - * - * Before *_register() - we are in initialization stage and no parallel - * access possible to the devlink instance. All drivers perform that phase - * by implicitly holding device_lock. - * - * After *_register() - users and driver can access devlink instance at - * the same time. - */ -#define ASSERT_DEVLINK_REGISTERED(d) \ - WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) -#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ - WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) - -struct net *devlink_net(const struct devlink *devlink) -{ - return read_pnet(&devlink->_net); -} -EXPORT_SYMBOL_GPL(devlink_net); - -static void __devlink_put_rcu(struct rcu_head *head) -{ - struct devlink *devlink = container_of(head, struct devlink, rcu); - - complete(&devlink->comp); -} - -void devlink_put(struct devlink *devlink) -{ - if (refcount_dec_and_test(&devlink->refcount)) - /* Make sure unregister operation that may await the completion - * is unblocked only after all users are after the end of - * RCU grace period. - */ - call_rcu(&devlink->rcu, __devlink_put_rcu); -} - -struct devlink *__must_check devlink_try_get(struct devlink *devlink) -{ - if (refcount_inc_not_zero(&devlink->refcount)) - return devlink; - return NULL; -} - -void devl_assert_locked(struct devlink *devlink) -{ - lockdep_assert_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_assert_locked); - -#ifdef CONFIG_LOCKDEP -/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ -bool devl_lock_is_held(struct devlink *devlink) -{ - return lockdep_is_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock_is_held); -#endif - -void devl_lock(struct devlink *devlink) -{ - mutex_lock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock); - -int devl_trylock(struct devlink *devlink) -{ - return mutex_trylock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_trylock); - -void devl_unlock(struct devlink *devlink) -{ - mutex_unlock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_unlock); - -static struct devlink * -devlinks_xa_find_get(struct net *net, unsigned long *indexp, xa_mark_t filter, - void * (*xa_find_fn)(struct xarray *, unsigned long *, - unsigned long, xa_mark_t)) -{ - struct devlink *devlink; - - rcu_read_lock(); -retry: - devlink = xa_find_fn(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); - if (!devlink) - goto unlock; - - /* In case devlink_unregister() was already called and "unregistering" - * mark was set, do not allow to get a devlink reference here. - * This prevents live-lock of devlink_unregister() wait for completion. - */ - if (xa_get_mark(&devlinks, *indexp, DEVLINK_UNREGISTERING)) - goto retry; - - /* For a possible retry, the xa_find_after() should be always used */ - xa_find_fn = xa_find_after; - if (!devlink_try_get(devlink)) - goto retry; - if (!net_eq(devlink_net(devlink), net)) { - devlink_put(devlink); - goto retry; - } -unlock: - rcu_read_unlock(); - return devlink; -} - -static struct devlink *devlinks_xa_find_get_first(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find); -} - -static struct devlink *devlinks_xa_find_get_next(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find_after); -} - -/* Iterate over devlink pointers which were possible to get reference to. - * devlink_put() needs to be called for each iterated devlink pointer - * in loop body in order to release the reference. - */ -#define devlinks_xa_for_each_get(net, index, devlink, filter) \ - for (index = 0, \ - devlink = devlinks_xa_find_get_first(net, &index, filter); \ - devlink; devlink = devlinks_xa_find_get_next(net, &index, filter)) - -#define devlinks_xa_for_each_registered_get(net, index, devlink) \ - devlinks_xa_for_each_get(net, index, devlink, DEVLINK_REGISTERED) - -static struct devlink *devlink_get_from_attrs(struct net *net, - struct nlattr **attrs) -{ - struct devlink *devlink; - unsigned long index; - char *busname; - char *devname; - - if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) - return ERR_PTR(-EINVAL); - - busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); - devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); - - devlinks_xa_for_each_registered_get(net, index, devlink) { - if (strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0) - return devlink; - devlink_put(devlink); - } - - return ERR_PTR(-ENODEV); -} - #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ @@ -405,8 +175,8 @@ static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, return ERR_PTR(-EINVAL); } -static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, - struct genl_info *info) +struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } @@ -466,13 +236,13 @@ devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return devlink_rate_node_get_by_name(devlink, rate_node_name); } -static struct devlink_rate * +struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } -static struct devlink_rate * +struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; @@ -511,11 +281,7 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); struct devlink_linecard *linecard; - mutex_lock(&devlink->linecards_lock); linecard = devlink_linecard_get_by_index(devlink, linecard_index); - if (linecard) - refcount_inc(&linecard->refcount); - mutex_unlock(&devlink->linecards_lock); if (!linecard) return ERR_PTR(-ENODEV); return linecard; @@ -523,20 +289,12 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return ERR_PTR(-EINVAL); } -static struct devlink_linecard * +struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); } -static void devlink_linecard_put(struct devlink_linecard *linecard) -{ - if (refcount_dec_and_test(&linecard->refcount)) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - } -} - struct devlink_sb { struct list_head list; unsigned int index; @@ -838,95 +596,6 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) return NULL; } -#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) -#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) -#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) -#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) - -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink_port *devlink_port; - struct devlink *devlink; - int err; - - devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(devlink)) - return PTR_ERR(devlink); - devl_lock(devlink); - info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (IS_ERR(devlink_port)) { - err = PTR_ERR(devlink_port); - goto unlock; - } - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (!IS_ERR(devlink_port)) - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { - struct devlink_rate *devlink_rate; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) { - err = PTR_ERR(devlink_rate); - goto unlock; - } - info->user_ptr[1] = devlink_rate; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) { - err = PTR_ERR(rate_node); - goto unlock; - } - info->user_ptr[1] = rate_node; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) { - err = PTR_ERR(linecard); - goto unlock; - } - info->user_ptr[1] = linecard; - } - return 0; - -unlock: - devl_unlock(devlink); - devlink_put(devlink); - return err; -} - -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - - devlink = info->user_ptr[0]; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = info->user_ptr[1]; - devlink_linecard_put(linecard); - } - devl_unlock(devlink); - devlink_put(devlink); -} - -static struct genl_family devlink_nl_family; - -enum devlink_multicast_groups { - DEVLINK_MCGRP_CONFIG, -}; - -static const struct genl_multicast_group devlink_nl_mcgrps[] = { - [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, -}; - static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) { if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) @@ -1537,47 +1206,40 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; - u32 id = NETLINK_CB(cb->skb).portid; + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, NULL); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + if (idx < state->idx) { idx++; + continue; } - devl_unlock(devlink); - devlink_put(devlink); + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, NULL); + if (err) { + state->idx = idx; + break; + } + idx++; } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_gen_cmd devl_gen_rate_get = { + .dump_one = devlink_nl_cmd_rate_get_dump_one, +}; + static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1632,38 +1294,19 @@ static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) { - idx++; - devlink_put(devlink); - continue; - } - - devl_lock(devlink); - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - devl_unlock(devlink); - devlink_put(devlink); - - if (err) - goto out; - idx++; - } -out: - cb->args[0] = idx; - return msg->len; + return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); } +const struct devlink_gen_cmd devl_gen_inst = { + .dump_one = devlink_nl_cmd_get_dump_one, +}; + static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1686,43 +1329,40 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; - unsigned long index, port_index; - int start = cb->args[0]; + unsigned long port_index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, devlink_port) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_port_fill(msg, devlink_port, - DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + xa_for_each(&devlink->ports, port_index, devlink_port) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_port_fill(msg, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->extack); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_gen_cmd devl_gen_port = { + .dump_one = devlink_nl_cmd_port_get_dump_one, +}; + static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) @@ -2465,46 +2105,42 @@ static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < start) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - goto out; - } + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < state->idx) { idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + state->idx = idx; + break; } - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_gen_cmd devl_gen_linecard = { + .dump_one = devlink_nl_cmd_linecard_get_dump_one, +}; + static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, const char *type) @@ -2727,43 +2363,39 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_gen_cmd devl_gen_sb = { + .dump_one = devlink_nl_cmd_sb_get_dump_one, +}; + static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, u16 pool_index, enum devlink_command cmd, @@ -2869,46 +2501,39 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, return 0; } -static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; int err = 0; + int idx = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_pool_get) - goto retry; + if (!devlink->ops->sb_pool_get) + return 0; - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_pool_get_dumpit(msg, start, &idx, devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_gen_cmd devl_gen_sb_pool = { + .dump_one = devlink_nl_cmd_sb_pool_get_dump_one, +}; + static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, enum devlink_sb_threshold_type threshold_type, @@ -3084,46 +2709,39 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, return 0; } -static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_port_pool_get) - goto retry; - - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_port_pool_get_dumpit(msg, start, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + if (!devlink->ops->sb_port_pool_get) + return 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_gen_cmd devl_gen_sb_port_pool = { + .dump_one = devlink_nl_cmd_sb_port_pool_get_dump_one, +}; + static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, u32 threshold, @@ -3327,47 +2945,38 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, } static int -devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_tc_pool_bind_get) - goto retry; + if (!devlink->ops->sb_tc_pool_bind_get) + return 0; - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, - devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); -retry: - devlink_put(devlink); } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_gen_cmd devl_gen_sb_tc_pool_bind = { + .dump_one = devlink_nl_cmd_sb_tc_pool_bind_get_dump_one, +}; + static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, @@ -4653,11 +4262,6 @@ static void devlink_ns_change_notify(struct devlink *devlink, devlink_notify(devlink, DEVLINK_CMD_DEL); } -static bool devlink_reload_supported(const struct devlink_ops *ops) -{ - return ops->reload_down && ops->reload_up; -} - static void devlink_reload_failed_set(struct devlink *devlink, bool reload_failed) { @@ -4725,9 +4329,10 @@ void devlink_remote_reload_actions_performed(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); -static int devlink_reload(struct devlink *devlink, struct net *dest_net, - enum devlink_reload_action action, enum devlink_reload_limit limit, - u32 *actions_performed, struct netlink_ext_ack *extack) +int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack) { u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; struct net *curr_net; @@ -5190,41 +4795,24 @@ static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start || !devlink->ops->selftest_check) - goto inc; - - devl_lock(devlink); - err = devlink_nl_selftests_fill(msg, devlink, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; + if (!devlink->ops->selftest_check) + return 0; - cb->args[0] = idx; - return msg->len; + return devlink_nl_selftests_fill(msg, devlink, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); } +const struct devlink_gen_cmd devl_gen_selftests = { + .dump_one = devlink_nl_cmd_selftests_get_dump_one, +}; + static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, enum devlink_selftest_status test_status) { @@ -5654,7 +5242,13 @@ static void devlink_param_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); - ASSERT_DEVLINK_REGISTERED(devlink); + + /* devlink_notify_register() / devlink_notify_unregister() + * will replay the notifications if the params are added/removed + * outside of the lifetime of the instance. + */ + if (!devl_is_registered(devlink)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -5670,48 +5264,41 @@ static void devlink_param_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(param_item, &devlink->param_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(param_item, &devlink->param_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_param_fill(msg, devlink, 0, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - if (err != -EMSGSIZE) - return err; - cb->args[0] = idx; - return msg->len; + return err; } +const struct devlink_gen_cmd devl_gen_param = { + .dump_one = devlink_nl_cmd_param_get_dump_one, +}; + static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) @@ -6375,21 +5962,20 @@ out: return err; } -static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, - struct netlink_callback *cb, - struct devlink *devlink, - int *idx, - int start) +static int +devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; struct devlink_port *port; unsigned long port_index; - int err = 0; + int idx = 0; + int err; - devl_lock(devlink); list_for_each_entry(region, &devlink->region_list, list) { - if (*idx < start) { - (*idx)++; + if (idx < state->idx) { + idx++; continue; } err = devlink_nl_region_fill(msg, devlink, @@ -6397,43 +5983,28 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, region); - if (err) - goto out; - (*idx)++; + if (err) { + state->idx = idx; + return err; + } + idx++; } xa_for_each(&devlink->ports, port_index, port) { - err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx, - start); - if (err) - goto out; + err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, + state->idx); + if (err) { + state->idx = idx; + return err; + } } -out: - devl_unlock(devlink); - return err; + return 0; } -static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, - &idx, start); - devlink_put(devlink); - if (err) - goto out; - } -out: - cb->args[0] = idx; - return msg->len; -} +const struct devlink_gen_cmd devl_gen_region = { + .dump_one = devlink_nl_cmd_region_get_dump_one, +}; static int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info) @@ -6716,6 +6287,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; @@ -6729,14 +6301,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, void *hdr; int err; - start_offset = *((u64 *)&cb->args[0]); + start_offset = state->start_offset; - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) return PTR_ERR(devlink); - devl_lock(devlink); - if (!attrs[DEVLINK_ATTR_REGION_NAME]) { NL_SET_ERR_MSG(cb->extack, "No region name provided"); err = -EINVAL; @@ -6868,7 +6438,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - *((u64 *)&cb->args[0]) = ret_offset; + state->start_offset = ret_offset; nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); @@ -7064,43 +6634,25 @@ static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) - goto inc; - - devl_lock(devlink); - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err == -EOPNOTSUPP) - err = 0; - else if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; + int err; - cb->args[0] = idx; - return msg->len; + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + if (err == -EOPNOTSUPP) + err = 0; + return err; } +const struct devlink_gen_cmd devl_gen_info = { + .dump_one = devlink_nl_cmd_info_get_dump_one, +}; + struct devlink_fmsg_item { struct list_head list; int attrtype; @@ -7666,7 +7218,8 @@ static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, struct netlink_callback *cb, enum devlink_command cmd) { - int index = cb->args[0]; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + int index = state->idx; int tmp_index = index; void *hdr; int err; @@ -7682,7 +7235,7 @@ static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, if ((err && err != -EMSGSIZE) || tmp_index == index) goto nla_put_failure; - cb->args[0] = index; + state->idx = index; genlmsg_end(skb, hdr); return skb->len; @@ -7708,7 +7261,6 @@ struct devlink_health_reporter { u64 error_count; u64 recovery_count; u64 last_recovery_ts; - refcount_t refcount; }; void * @@ -7720,12 +7272,10 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); static struct devlink_health_reporter * __devlink_health_reporter_find_by_name(struct list_head *reporter_list, - struct mutex *list_lock, const char *reporter_name) { struct devlink_health_reporter *reporter; - lockdep_assert_held(list_lock); list_for_each_entry(reporter, reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; @@ -7737,7 +7287,6 @@ devlink_health_reporter_find_by_name(struct devlink *devlink, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink->reporter_list, - &devlink->reporters_lock, reporter_name); } @@ -7746,7 +7295,6 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, const char *reporter_name) { return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, - &devlink_port->reporters_lock, reporter_name); } @@ -7771,13 +7319,12 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; mutex_init(&reporter->dump_lock); - refcount_set(&reporter->refcount, 1); return reporter; } /** - * devlink_port_health_reporter_create - create devlink health reporter for - * specified port instance + * devl_port_health_reporter_create - create devlink health reporter for + * specified port instance * * @port: devlink_port which should contain the new reporter * @ops: ops @@ -7785,34 +7332,47 @@ __devlink_health_reporter_create(struct devlink *devlink, * @priv: priv */ struct devlink_health_reporter * -devlink_port_health_reporter_create(struct devlink_port *port, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) +devl_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; - mutex_lock(&port->reporters_lock); + devl_assert_locked(port->devlink); + if (__devlink_health_reporter_find_by_name(&port->reporter_list, - &port->reporters_lock, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } + ops->name)) + return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(port->devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) - goto unlock; + return reporter; reporter->devlink_port = port; list_add_tail(&reporter->list, &port->reporter_list); -unlock: - mutex_unlock(&port->reporters_lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); + +struct devlink_health_reporter * +devlink_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink = port->devlink; + + devl_lock(devlink); + reporter = devl_port_health_reporter_create(port, ops, + graceful_period, priv); + devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); /** - * devlink_health_reporter_create - create devlink health reporter + * devl_health_reporter_create - create devlink health reporter * * @devlink: devlink * @ops: ops @@ -7820,26 +7380,38 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * @priv: priv */ struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) +devl_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; - mutex_lock(&devlink->reporters_lock); - if (devlink_health_reporter_find_by_name(devlink, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } + devl_assert_locked(devlink); + + if (devlink_health_reporter_find_by_name(devlink, ops->name)) + return ERR_PTR(-EEXIST); reporter = __devlink_health_reporter_create(devlink, ops, graceful_period, priv); if (IS_ERR(reporter)) - goto unlock; + return reporter; list_add_tail(&reporter->list, &devlink->reporter_list); -unlock: - mutex_unlock(&devlink->reporters_lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devl_health_reporter_create); + +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + devl_lock(devlink); + reporter = devl_health_reporter_create(devlink, ops, + graceful_period, priv); + devl_unlock(devlink); return reporter; } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); @@ -7853,51 +7425,31 @@ devlink_health_reporter_free(struct devlink_health_reporter *reporter) kfree(reporter); } -static void -devlink_health_reporter_put(struct devlink_health_reporter *reporter) -{ - if (refcount_dec_and_test(&reporter->refcount)) - devlink_health_reporter_free(reporter); -} - -static void -__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - list_del(&reporter->list); - devlink_health_reporter_put(reporter); -} - /** - * devlink_health_reporter_destroy - destroy devlink health reporter + * devl_health_reporter_destroy - destroy devlink health reporter * * @reporter: devlink health reporter to destroy */ void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +devl_health_reporter_destroy(struct devlink_health_reporter *reporter) { - struct mutex *lock = &reporter->devlink->reporters_lock; + devl_assert_locked(reporter->devlink); - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); + list_del(&reporter->list); + devlink_health_reporter_free(reporter); } -EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); +EXPORT_SYMBOL_GPL(devl_health_reporter_destroy); -/** - * devlink_port_health_reporter_destroy - destroy devlink port health reporter - * - * @reporter: devlink health reporter to destroy - */ void -devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter) +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { - struct mutex *lock = &reporter->devlink_port->reporters_lock; + struct devlink *devlink = reporter->devlink; - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); + devl_lock(devlink); + devl_health_reporter_destroy(reporter); + devl_unlock(devlink); } -EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); +EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, @@ -8128,7 +7680,6 @@ static struct devlink_health_reporter * devlink_health_reporter_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { - struct devlink_health_reporter *reporter; struct devlink_port *devlink_port; char *reporter_name; @@ -8137,21 +7688,12 @@ devlink_health_reporter_get_from_attrs(struct devlink *devlink, reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); devlink_port = devlink_port_get_from_attrs(devlink, attrs); - if (IS_ERR(devlink_port)) { - mutex_lock(&devlink->reporters_lock); - reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink->reporters_lock); - } else { - mutex_lock(&devlink_port->reporters_lock); - reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink_port->reporters_lock); - } - - return reporter; + if (IS_ERR(devlink_port)) + return devlink_health_reporter_find_by_name(devlink, + reporter_name); + else + return devlink_port_health_reporter_find_by_name(devlink_port, + reporter_name); } static struct devlink_health_reporter * @@ -8169,9 +7711,10 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) struct nlattr **attrs = info->attrs; struct devlink *devlink; - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) return NULL; + devl_unlock(devlink); reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); devlink_put(devlink); @@ -8209,10 +7752,8 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } + if (!msg) + return -ENOMEM; err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, @@ -8220,89 +7761,72 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, 0); if (err) { nlmsg_free(msg); - goto out; + return err; } - err = genlmsg_reply(msg, info); -out: - devlink_health_reporter_put(reporter); - return err; + return genlmsg_reply(msg, info); } static int -devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; - unsigned long index, port_index; struct devlink_port *port; - struct devlink *devlink; - int start = cb->args[0]; + unsigned long port_index; int idx = 0; int err; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->reporters_lock); - list_for_each_entry(reporter, &devlink->reporter_list, - list) { - if (idx < start) { + list_for_each_entry(reporter, &devlink->reporter_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + return err; + } + idx++; + } + xa_for_each(&devlink->ports, port_index, port) { + list_for_each_entry(reporter, &port->reporter_list, list) { + if (idx < state->idx) { idx++; continue; } - err = devlink_nl_health_reporter_fill( - msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill(msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - goto out; + state->idx = idx; + return err; } idx++; } - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - } - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, port) { - mutex_lock(&port->reporters_lock); - list_for_each_entry(reporter, &port->reporter_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill( - msg, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { - mutex_unlock(&port->reporters_lock); - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&port->reporters_lock); - } - devl_unlock(devlink); - devlink_put(devlink); } -out: - cb->args[0] = idx; - return msg->len; + + return 0; } +const struct devlink_gen_cmd devl_gen_health_reporter = { + .dump_one = devlink_nl_cmd_health_reporter_get_dump_one, +}; + static int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) @@ -8310,15 +7834,12 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) { - err = -EOPNOTSUPP; - goto out; - } + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + return -EOPNOTSUPP; + if (!reporter->ops->dump && - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { - err = -EOPNOTSUPP; - goto out; - } + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + return -EOPNOTSUPP; if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -8332,11 +7853,7 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_dump = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); - devlink_health_reporter_put(reporter); return 0; -out: - devlink_health_reporter_put(reporter); - return err; } static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, @@ -8344,16 +7861,12 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL, info->extack); - - devlink_health_reporter_put(reporter); - return err; + return devlink_health_reporter_recover(reporter, NULL, info->extack); } static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, @@ -8368,62 +7881,52 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->diagnose) { - devlink_health_reporter_put(reporter); + if (!reporter->ops->diagnose) return -EOPNOTSUPP; - } fmsg = devlink_fmsg_alloc(); - if (!fmsg) { - devlink_health_reporter_put(reporter); + if (!fmsg) return -ENOMEM; - } err = devlink_fmsg_obj_nest_start(fmsg); if (err) - goto out; + return err; err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) - goto out; + return err; err = devlink_fmsg_obj_nest_end(fmsg); if (err) - goto out; - - err = devlink_fmsg_snd(fmsg, info, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); + return err; -out: - devlink_fmsg_free(fmsg); - devlink_health_reporter_put(reporter); - return err; + return devlink_fmsg_snd(fmsg, info, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); } static int devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; - u64 start = cb->args[0]; int err; reporter = devlink_health_reporter_get_from_cb(cb); if (!reporter) return -EINVAL; - if (!reporter->ops->dump) { - err = -EOPNOTSUPP; - goto out; - } + if (!reporter->ops->dump) + return -EOPNOTSUPP; + mutex_lock(&reporter->dump_lock); - if (!start) { + if (!state->idx) { err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; - cb->args[1] = reporter->dump_ts; + state->dump_ts = reporter->dump_ts; } - if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { + if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); err = -EAGAIN; goto unlock; @@ -8433,8 +7936,6 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); unlock: mutex_unlock(&reporter->dump_lock); -out: - devlink_health_reporter_put(reporter); return err; } @@ -8449,15 +7950,12 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); + if (!reporter->ops->dump) return -EOPNOTSUPP; - } mutex_lock(&reporter->dump_lock); devlink_health_dump_clear(reporter); mutex_unlock(&reporter->dump_lock); - devlink_health_reporter_put(reporter); return 0; } @@ -8466,21 +7964,15 @@ static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; - if (!reporter->ops->test) { - devlink_health_reporter_put(reporter); + if (!reporter->ops->test) return -EOPNOTSUPP; - } - - err = reporter->ops->test(reporter, info->extack); - devlink_health_reporter_put(reporter); - return err; + return reporter->ops->test(reporter, info->extack); } struct devlink_stats { @@ -8814,43 +8306,39 @@ err_trap_fill: return err; } -static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_gen_cmd devl_gen_trap = { + .dump_one = devlink_nl_cmd_trap_get_dump_one, +}; + static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, enum devlink_trap_action trap_action, @@ -9029,46 +8517,41 @@ err_trap_group_fill: return err; } -static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(group_item, &devlink->trap_group_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_group_fill(msg, devlink, - group_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (idx < state->idx) { idx++; + continue; } - devl_unlock(devlink); - devlink_put(devlink); + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; + } + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_gen_cmd devl_gen_trap_group = { + .dump_one = devlink_nl_cmd_trap_group_get_dump_one, +}; + static int __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, @@ -9144,6 +8627,7 @@ static int devlink_trap_group_set(struct devlink *devlink, struct netlink_ext_ack *extack = info->extack; const struct devlink_trap_policer *policer; struct nlattr **attrs = info->attrs; + u32 policer_id; int err; if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) @@ -9152,17 +8636,11 @@ static int devlink_trap_group_set(struct devlink *devlink, if (!devlink->ops->trap_group_set) return -EOPNOTSUPP; - policer_item = group_item->policer_item; - if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) { - u32 policer_id; - - policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - policer_item = devlink_trap_policer_item_lookup(devlink, - policer_id); - if (policer_id && !policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); - return -ENOENT; - } + policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (policer_id && !policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; } policer = policer_item ? policer_item->policer : NULL; @@ -9333,46 +8811,40 @@ err_trap_policer_fill: return err; } -static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +static int +devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { - enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW; + struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; int idx = 0; - int err; + int err = 0; - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(policer_item, &devlink->trap_policer_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_policer_fill(msg, devlink, - policer_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (idx < state->idx) { idx++; + continue; + } + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + state->idx = idx; + break; } - devl_unlock(devlink); - devlink_put(devlink); + idx++; } -out: - cb->args[0] = idx; - return msg->len; + + return err; } +const struct devlink_gen_cmd devl_gen_trap_policer = { + .dump_one = devlink_nl_cmd_trap_policer_get_dump_one, +}; + static int devlink_trap_policer_set(struct devlink *devlink, struct devlink_trap_policer_item *policer_item, @@ -9445,88 +8917,19 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { - [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = - DEVLINK_ATTR_TRAP_POLICER_ID }, - [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, - DEVLINK_PORT_TYPE_IB), - [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, - DEVLINK_ESWITCH_MODE_SWITCHDEV), - [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, - [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, - [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, - [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, - [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, - [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = - NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), - [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, - [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_ACTION_MAX), - [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), - [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, - [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, -}; - -static const struct genl_small_ops devlink_nl_ops[] = { +const struct genl_small_ops devlink_nl_ops[56] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_get_doit, - .dumpit = devlink_nl_cmd_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_get_doit, - .dumpit = devlink_nl_cmd_port_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9540,7 +8943,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_RATE_GET, .doit = devlink_nl_cmd_rate_get_doit, - .dumpit = devlink_nl_cmd_rate_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, /* can be retrieved by unprivileged users */ }, @@ -9588,7 +8991,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_LINECARD_GET, .doit = devlink_nl_cmd_linecard_get_doit, - .dumpit = devlink_nl_cmd_linecard_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, /* can be retrieved by unprivileged users */ }, @@ -9602,14 +9005,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, - .dumpit = devlink_nl_cmd_sb_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { @@ -9622,7 +9025,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9637,7 +9040,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -9718,7 +9121,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_get_doit, - .dumpit = devlink_nl_cmd_param_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { @@ -9746,7 +9149,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_REGION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_get_doit, - .dumpit = devlink_nl_cmd_region_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .flags = GENL_ADMIN_PERM, }, { @@ -9772,14 +9175,14 @@ static const struct genl_small_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_info_get_doit, - .dumpit = devlink_nl_cmd_info_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, /* can be retrieved by unprivileged users */ }, @@ -9834,7 +9237,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_GET, .doit = devlink_nl_cmd_trap_get_doit, - .dumpit = devlink_nl_cmd_trap_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { @@ -9845,7 +9248,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, .doit = devlink_nl_cmd_trap_group_get_doit, - .dumpit = devlink_nl_cmd_trap_group_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { @@ -9856,7 +9259,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, .doit = devlink_nl_cmd_trap_policer_get_doit, - .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { @@ -9867,7 +9270,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_SELFTESTS_GET, .doit = devlink_nl_cmd_selftests_get_doit, - .dumpit = devlink_nl_cmd_selftests_get_dumpit + .dumpit = devlink_nl_instance_iter_dump, /* can be retrieved by unprivileged users */ }, { @@ -9875,26 +9278,10 @@ static const struct genl_small_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_selftests_run, .flags = GENL_ADMIN_PERM, }, + /* -- No new ops here! Use split ops going forward! -- */ }; -static struct genl_family devlink_nl_family __ro_after_init = { - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .policy = devlink_nl_policy, - .netnsok = true, - .parallel_ops = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, - .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), - .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, - .mcgrps = devlink_nl_mcgrps, - .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), -}; - -static bool devlink_reload_actions_valid(const struct devlink_ops *ops) +bool devlink_reload_actions_valid(const struct devlink_ops *ops) { const struct devlink_reload_combination *comb; int i; @@ -9923,100 +9310,6 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) return true; } -/** - * devlink_set_features - Set devlink supported features - * - * @devlink: devlink - * @features: devlink support features - * - * This interface allows us to set reload ops separatelly from - * the devlink_alloc. - */ -void devlink_set_features(struct devlink *devlink, u64 features) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(features & DEVLINK_F_RELOAD && - !devlink_reload_supported(devlink->ops)); - devlink->features = features; -} -EXPORT_SYMBOL_GPL(devlink_set_features); - -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr); - -/** - * devlink_alloc_ns - Allocate new devlink instance resources - * in specific namespace - * - * @ops: ops - * @priv_size: size of user private data - * @net: net namespace - * @dev: parent device - * - * Allocate new devlink instance resources, including devlink index - * and name. - */ -struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, - size_t priv_size, struct net *net, - struct device *dev) -{ - struct devlink *devlink; - static u32 last_id; - int ret; - - WARN_ON(!ops || !dev); - if (!devlink_reload_actions_valid(ops)) - return NULL; - - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); - if (!devlink) - return NULL; - - ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, - &last_id, GFP_KERNEL); - if (ret < 0) - goto err_xa_alloc; - - devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); - if (ret) - goto err_register_netdevice_notifier; - - devlink->dev = dev; - devlink->ops = ops; - xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); - xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - write_pnet(&devlink->_net, net); - INIT_LIST_HEAD(&devlink->rate_list); - INIT_LIST_HEAD(&devlink->linecard_list); - INIT_LIST_HEAD(&devlink->sb_list); - INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); - INIT_LIST_HEAD(&devlink->resource_list); - INIT_LIST_HEAD(&devlink->param_list); - INIT_LIST_HEAD(&devlink->region_list); - INIT_LIST_HEAD(&devlink->reporter_list); - INIT_LIST_HEAD(&devlink->trap_list); - INIT_LIST_HEAD(&devlink->trap_group_list); - INIT_LIST_HEAD(&devlink->trap_policer_list); - lockdep_register_key(&devlink->lock_key); - mutex_init(&devlink->lock); - lockdep_set_class(&devlink->lock, &devlink->lock_key); - mutex_init(&devlink->reporters_lock); - mutex_init(&devlink->linecards_lock); - refcount_set(&devlink->refcount, 1); - init_completion(&devlink->comp); - - return devlink; - -err_register_netdevice_notifier: - xa_erase(&devlinks, devlink->index); -err_xa_alloc: - kfree(devlink); - return NULL; -} -EXPORT_SYMBOL_GPL(devlink_alloc_ns); - static void devlink_trap_policer_notify(struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, @@ -10029,7 +9322,7 @@ static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd); -static void devlink_notify_register(struct devlink *devlink) +void devlink_notify_register(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; struct devlink_trap_group_item *group_item; @@ -10070,7 +9363,7 @@ static void devlink_notify_register(struct devlink *devlink) DEVLINK_CMD_PARAM_NEW); } -static void devlink_notify_unregister(struct devlink *devlink) +void devlink_notify_unregister(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; struct devlink_trap_group_item *group_item; @@ -10107,79 +9400,6 @@ static void devlink_notify_unregister(struct devlink *devlink) devlink_notify(devlink, DEVLINK_CMD_DEL); } -/** - * devlink_register - Register devlink instance - * - * @devlink: devlink - */ -void devlink_register(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - /* Make sure that we are in .probe() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - devlink_notify_register(devlink); -} -EXPORT_SYMBOL_GPL(devlink_register); - -/** - * devlink_unregister - Unregister devlink instance - * - * @devlink: devlink - */ -void devlink_unregister(struct devlink *devlink) -{ - ASSERT_DEVLINK_REGISTERED(devlink); - /* Make sure that we are in .remove() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); - devlink_put(devlink); - wait_for_completion(&devlink->comp); - - devlink_notify_unregister(devlink); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); -} -EXPORT_SYMBOL_GPL(devlink_unregister); - -/** - * devlink_free - Free devlink instance resources - * - * @devlink: devlink - */ -void devlink_free(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - mutex_destroy(&devlink->linecards_lock); - mutex_destroy(&devlink->reporters_lock); - mutex_destroy(&devlink->lock); - lockdep_unregister_key(&devlink->lock_key); - WARN_ON(!list_empty(&devlink->trap_policer_list)); - WARN_ON(!list_empty(&devlink->trap_group_list)); - WARN_ON(!list_empty(&devlink->trap_list)); - WARN_ON(!list_empty(&devlink->reporter_list)); - WARN_ON(!list_empty(&devlink->region_list)); - WARN_ON(!list_empty(&devlink->param_list)); - WARN_ON(!list_empty(&devlink->resource_list)); - WARN_ON(!list_empty(&devlink->dpipe_table_list)); - WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); - WARN_ON(!list_empty(&devlink->linecard_list)); - WARN_ON(!xa_empty(&devlink->ports)); - - xa_destroy(&devlink->snapshot_ids); - xa_destroy(&devlink->ports); - - WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb)); - - xa_erase(&devlinks, devlink->index); - - kfree(devlink); -} -EXPORT_SYMBOL_GPL(devlink_free); - static void devlink_port_type_warn(struct work_struct *work) { WARN(true, "Type was not set for devlink port."); @@ -10279,12 +9499,9 @@ int devl_port_register(struct devlink *devlink, devlink_port->index = port_index; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) { - mutex_destroy(&devlink_port->reporters_lock); + if (err) return err; - } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); @@ -10335,7 +9552,6 @@ void devl_port_unregister(struct devlink_port *devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); - mutex_destroy(&devlink_port->reporters_lock); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); @@ -10480,8 +9696,8 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_type_clear); -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr) +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; @@ -10911,7 +10127,7 @@ static void devlink_linecard_types_fini(struct devlink_linecard *linecard) } /** - * devlink_linecard_create - Create devlink linecard + * devl_linecard_create - Create devlink linecard * * @devlink: devlink * @linecard_index: driver-specific numerical identifier of the linecard @@ -10924,8 +10140,8 @@ static void devlink_linecard_types_fini(struct devlink_linecard *linecard) * Return: Line card structure or an ERR_PTR() encoded error code. */ struct devlink_linecard * -devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, - const struct devlink_linecard_ops *ops, void *priv) +devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) { struct devlink_linecard *linecard; int err; @@ -10934,17 +10150,12 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, !ops->types_count || !ops->types_get)) return ERR_PTR(-EINVAL); - mutex_lock(&devlink->linecards_lock); - if (devlink_linecard_index_exists(devlink, linecard_index)) { - mutex_unlock(&devlink->linecards_lock); + if (devlink_linecard_index_exists(devlink, linecard_index)) return ERR_PTR(-EEXIST); - } linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); - if (!linecard) { - mutex_unlock(&devlink->linecards_lock); + if (!linecard) return ERR_PTR(-ENOMEM); - } linecard->devlink = devlink; linecard->index = linecard_index; @@ -10957,35 +10168,29 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, if (err) { mutex_destroy(&linecard->state_lock); kfree(linecard); - mutex_unlock(&devlink->linecards_lock); return ERR_PTR(err); } list_add_tail(&linecard->list, &devlink->linecard_list); - refcount_set(&linecard->refcount, 1); - mutex_unlock(&devlink->linecards_lock); devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); return linecard; } -EXPORT_SYMBOL_GPL(devlink_linecard_create); +EXPORT_SYMBOL_GPL(devl_linecard_create); /** - * devlink_linecard_destroy - Destroy devlink linecard + * devl_linecard_destroy - Destroy devlink linecard * * @linecard: devlink linecard */ -void devlink_linecard_destroy(struct devlink_linecard *linecard) +void devl_linecard_destroy(struct devlink_linecard *linecard) { - struct devlink *devlink = linecard->devlink; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); - mutex_unlock(&devlink->linecards_lock); - devlink_linecard_put(linecard); + mutex_destroy(&linecard->state_lock); + kfree(linecard); } -EXPORT_SYMBOL_GPL(devlink_linecard_destroy); +EXPORT_SYMBOL_GPL(devl_linecard_destroy); /** * devlink_linecard_provision_set - Set provisioning on linecard @@ -11604,8 +10809,6 @@ int devlink_params_register(struct devlink *devlink, const struct devlink_param *param = params; int i, err; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - for (i = 0; i < params_count; i++, param++) { err = devlink_param_register(devlink, param); if (err) @@ -11636,8 +10839,6 @@ void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *param = params; int i; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - for (i = 0; i < params_count; i++, param++) devlink_param_unregister(devlink, param); } @@ -11657,8 +10858,6 @@ int devlink_param_register(struct devlink *devlink, { struct devlink_param_item *param_item; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - WARN_ON(devlink_param_verify(param)); WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name)); @@ -11674,6 +10873,7 @@ int devlink_param_register(struct devlink *devlink, param_item->param = param; list_add_tail(¶m_item->list, &devlink->param_list); + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } EXPORT_SYMBOL_GPL(devlink_param_register); @@ -11688,11 +10888,10 @@ void devlink_param_unregister(struct devlink *devlink, { struct devlink_param_item *param_item; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - param_item = devlink_param_find_by_name(&devlink->param_list, param->name); WARN_ON(!param_item); + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); list_del(¶m_item->list); kfree(param_item); } @@ -11752,8 +10951,6 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, { struct devlink_param_item *param_item; - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); if (!param_item) return -EINVAL; @@ -11767,6 +10964,8 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, else param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); @@ -12921,7 +12120,8 @@ void devlink_compat_running_version(struct devlink *devlink, return; devl_lock(devlink); - __devlink_compat_running_version(devlink, buf, len); + if (devl_is_registered(devlink)) + __devlink_compat_running_version(devlink, buf, len); devl_unlock(devlink); } @@ -12930,20 +12130,28 @@ int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) struct devlink_flash_update_params params = {}; int ret; - if (!devlink->ops->flash_update) - return -EOPNOTSUPP; + devl_lock(devlink); + if (!devl_is_registered(devlink)) { + ret = -ENODEV; + goto out_unlock; + } + + if (!devlink->ops->flash_update) { + ret = -EOPNOTSUPP; + goto out_unlock; + } ret = request_firmware(¶ms.fw, file_name, devlink->dev); if (ret) - return ret; + goto out_unlock; - devl_lock(devlink); devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, ¶ms, NULL); devlink_flash_update_end_notify(devlink); - devl_unlock(devlink); release_firmware(params.fw); +out_unlock: + devl_unlock(devlink); return ret; } @@ -12983,47 +12191,3 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } - -static void __net_exit devlink_pernet_pre_exit(struct net *net) -{ - struct devlink *devlink; - u32 actions_performed; - unsigned long index; - int err; - - /* In case network namespace is getting destroyed, reload - * all devlink instances from this namespace into init_net. - */ - devlinks_xa_for_each_registered_get(net, index, devlink) { - WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); - mutex_lock(&devlink->lock); - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - mutex_unlock(&devlink->lock); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - devlink_put(devlink); - } -} - -static struct pernet_operations devlink_pernet_ops __net_initdata = { - .pre_exit = devlink_pernet_pre_exit, -}; - -static int __init devlink_init(void) -{ - int err; - - err = genl_register_family(&devlink_nl_family); - if (err) - goto out; - err = register_pernet_subsys(&devlink_pernet_ops); - -out: - WARN_ON(err); - return err; -} - -subsys_initcall(devlink_init); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c new file mode 100644 index 000000000000..3f44633af01c --- /dev/null +++ b/net/devlink/netlink.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + */ + +#include <net/genetlink.h> +#include <net/sock.h> + +#include "devl_internal.h" + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = + DEVLINK_ATTR_TRAP_POLICER_ID }, + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, + DEVLINK_PORT_TYPE_IB), + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, + DEVLINK_ESWITCH_MODE_SWITCHDEV), + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, + [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, + [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = + NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, + [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_ACTION_MAX), + [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), + [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, +}; + +struct devlink * +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) +{ + struct devlink *devlink; + unsigned long index; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + devlinks_xa_for_each_registered_get(net, index, devlink) { + devl_lock(devlink); + if (devl_is_registered(devlink) && + strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0) + return devlink; + devl_unlock(devlink); + devlink_put(devlink); + } + + return ERR_PTR(-ENODEV); +} + +static int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_linecard *linecard; + struct devlink_port *devlink_port; + struct devlink *devlink; + int err; + + devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs); + if (IS_ERR(devlink)) + return PTR_ERR(devlink); + + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + err = PTR_ERR(devlink_port); + goto unlock; + } + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (!IS_ERR(devlink_port)) + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { + struct devlink_rate *devlink_rate; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) { + err = PTR_ERR(devlink_rate); + goto unlock; + } + info->user_ptr[1] = devlink_rate; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) { + err = PTR_ERR(rate_node); + goto unlock; + } + info->user_ptr[1] = rate_node; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) { + err = PTR_ERR(linecard); + goto unlock; + } + info->user_ptr[1] = linecard; + } + return 0; + +unlock: + devl_unlock(devlink); + devlink_put(devlink); + return err; +} + +static void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + devlink = info->user_ptr[0]; + devl_unlock(devlink); + devlink_put(devlink); +} + +static const struct devlink_gen_cmd *devl_gen_cmds[] = { + [DEVLINK_CMD_GET] = &devl_gen_inst, + [DEVLINK_CMD_PORT_GET] = &devl_gen_port, + [DEVLINK_CMD_SB_GET] = &devl_gen_sb, + [DEVLINK_CMD_SB_POOL_GET] = &devl_gen_sb_pool, + [DEVLINK_CMD_SB_PORT_POOL_GET] = &devl_gen_sb_port_pool, + [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_gen_sb_tc_pool_bind, + [DEVLINK_CMD_PARAM_GET] = &devl_gen_param, + [DEVLINK_CMD_REGION_GET] = &devl_gen_region, + [DEVLINK_CMD_INFO_GET] = &devl_gen_info, + [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_gen_health_reporter, + [DEVLINK_CMD_RATE_GET] = &devl_gen_rate_get, + [DEVLINK_CMD_TRAP_GET] = &devl_gen_trap, + [DEVLINK_CMD_TRAP_GROUP_GET] = &devl_gen_trap_group, + [DEVLINK_CMD_TRAP_POLICER_GET] = &devl_gen_trap_policer, + [DEVLINK_CMD_LINECARD_GET] = &devl_gen_linecard, + [DEVLINK_CMD_SELFTESTS_GET] = &devl_gen_selftests, +}; + +int devlink_nl_instance_iter_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + const struct devlink_gen_cmd *cmd; + struct devlink *devlink; + int err = 0; + + cmd = devl_gen_cmds[info->op.cmd]; + + while ((devlink = devlinks_xa_find_get(sock_net(msg->sk), + &state->instance))) { + devl_lock(devlink); + + if (devl_is_registered(devlink)) + err = cmd->dump_one(msg, devlink, cb); + else + err = 0; + + devl_unlock(devlink); + devlink_put(devlink); + + if (err) + break; + + state->instance++; + + /* restart sub-object walk for the next instance */ + state->idx = 0; + } + + if (err != -EMSGSIZE) + return err; + return msg->len; +} + +struct genl_family devlink_nl_family __ro_after_init = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .policy = devlink_nl_policy, + .netnsok = true, + .parallel_ops = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .small_ops = devlink_nl_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 080e5c369f5b..694478fe07d6 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -4,8 +4,10 @@ * Copyright (c) 2017 Microchip Technology */ +#include <linux/dsa/ksz_common.h> #include <linux/etherdevice.h> #include <linux/list.h> +#include <linux/ptp_classify.h> #include <net/dsa.h> #include "tag.h" @@ -16,9 +18,71 @@ #define LAN937X_NAME "lan937x" /* Typically only one byte is used for tail tag. */ +#define KSZ_PTP_TAG_LEN 4 #define KSZ_EGRESS_TAG_LEN 1 #define KSZ_INGRESS_TAG_LEN 1 +#define KSZ_HWTS_EN 0 + +struct ksz_tagger_private { + struct ksz_tagger_data data; /* Must be first */ + unsigned long state; + struct kthread_worker *xmit_worker; +}; + +static struct ksz_tagger_private * +ksz_tagger_private(struct dsa_switch *ds) +{ + return ds->tagger_data; +} + +static void ksz_hwtstamp_set_state(struct dsa_switch *ds, bool on) +{ + struct ksz_tagger_private *priv = ksz_tagger_private(ds); + + if (on) + set_bit(KSZ_HWTS_EN, &priv->state); + else + clear_bit(KSZ_HWTS_EN, &priv->state); +} + +static void ksz_disconnect(struct dsa_switch *ds) +{ + struct ksz_tagger_private *priv = ds->tagger_data; + + kthread_destroy_worker(priv->xmit_worker); + kfree(priv); + ds->tagger_data = NULL; +} + +static int ksz_connect(struct dsa_switch *ds) +{ + struct ksz_tagger_data *tagger_data; + struct kthread_worker *xmit_worker; + struct ksz_tagger_private *priv; + int ret; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + ds->dst->index, ds->index); + if (IS_ERR(xmit_worker)) { + ret = PTR_ERR(xmit_worker); + kfree(priv); + return ret; + } + + priv->xmit_worker = xmit_worker; + /* Export functions for switch driver use */ + tagger_data = &priv->data; + tagger_data->hwtstamp_set_state = ksz_hwtstamp_set_state; + ds->tagger_data = priv; + + return 0; +} + static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int port, unsigned int len) @@ -92,17 +156,20 @@ DSA_TAG_DRIVER(ksz8795_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); /* - * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. + * For Ingress (Host -> KSZ9477), 2/6 bytes are added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)| + * FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if PTP is enabled in the Hardware) * tag0 : Prioritization (not used now) * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5) * - * For Egress (KSZ9477 -> Host), 1 byte is added before FCS. + * For Egress (KSZ9477 -> Host), 1/5 bytes is added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if bit 7 of tag0 is set) * tag0 : zero-based value represents port * (eg, 0x00=port1, 0x02=port3, 0x06=port7) */ @@ -114,6 +181,91 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); #define KSZ9477_TAIL_TAG_OVERRIDE BIT(9) #define KSZ9477_TAIL_TAG_LOOKUP BIT(10) +static void ksz_rcv_timestamp(struct sk_buff *skb, u8 *tag) +{ + u8 *tstamp_raw = tag - KSZ_PTP_TAG_LEN; + ktime_t tstamp; + + tstamp = ksz_decode_tstamp(get_unaligned_be32(tstamp_raw)); + KSZ_SKB_CB(skb)->tstamp = tstamp; +} + +/* Time stamp tag *needs* to be inserted if PTP is enabled in hardware. + * Regardless of Whether it is a PTP frame or not. + */ +static void ksz_xmit_timestamp(struct dsa_port *dp, struct sk_buff *skb) +{ + struct ksz_tagger_private *priv; + struct ptp_header *ptp_hdr; + unsigned int ptp_type; + u32 tstamp_raw = 0; + s64 correction; + + priv = ksz_tagger_private(dp->ds); + + if (!test_bit(KSZ_HWTS_EN, &priv->state)) + return; + + if (!KSZ_SKB_CB(skb)->update_correction) + goto output_tag; + + ptp_type = KSZ_SKB_CB(skb)->ptp_type; + + ptp_hdr = ptp_parse_header(skb, ptp_type); + if (!ptp_hdr) + goto output_tag; + + correction = (s64)get_unaligned_be64(&ptp_hdr->correction); + + if (correction < 0) { + struct timespec64 ts; + + ts = ns_to_timespec64(-correction >> 16); + tstamp_raw = ((ts.tv_sec & 3) << 30) | ts.tv_nsec; + + /* Set correction field to 0 and update UDP checksum */ + ptp_header_update_correction(skb, ptp_type, ptp_hdr, 0); + } + +output_tag: + put_unaligned_be32(tstamp_raw, skb_put(skb, KSZ_PTP_TAG_LEN)); +} + +/* Defer transmit if waiting for egress time stamp is required. */ +static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb) +{ + struct ksz_tagger_data *tagger_data = ksz_tagger_data(dp->ds); + struct ksz_tagger_private *priv = ksz_tagger_private(dp->ds); + void (*xmit_work_fn)(struct kthread_work *work); + struct sk_buff *clone = KSZ_SKB_CB(skb)->clone; + struct ksz_deferred_xmit_work *xmit_work; + struct kthread_worker *xmit_worker; + + if (!clone) + return skb; /* no deferred xmit for this packet */ + + xmit_work_fn = tagger_data->xmit_work_fn; + xmit_worker = priv->xmit_worker; + + if (!xmit_work_fn || !xmit_worker) + return NULL; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + kthread_init_work(&xmit_work->work, xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -126,6 +278,8 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, return NULL; /* Tag encoding */ + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); addr = skb_mac_header(skb); @@ -136,7 +290,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, *tag = cpu_to_be16(val); - return skb; + return ksz_defer_xmit(dp, skb); } static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) @@ -147,8 +301,10 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) unsigned int len = KSZ_EGRESS_TAG_LEN; /* Extra 4-bytes PTP timestamp */ - if (tag[0] & KSZ9477_PTP_TAG_INDICATION) - len += KSZ9477_PTP_TAG_LEN; + if (tag[0] & KSZ9477_PTP_TAG_INDICATION) { + ksz_rcv_timestamp(skb, tag); + len += KSZ_PTP_TAG_LEN; + } return ksz_common_rcv(skb, dev, port, len); } @@ -158,7 +314,9 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = KSZ9477_INGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = KSZ9477_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); @@ -178,6 +336,8 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, return NULL; /* Tag encoding */ + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); addr = skb_mac_header(skb); @@ -186,7 +346,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, if (is_link_local_ether_addr(addr)) *tag |= KSZ9893_TAIL_TAG_OVERRIDE; - return skb; + return ksz_defer_xmit(dp, skb); } static const struct dsa_device_ops ksz9893_netdev_ops = { @@ -194,23 +354,28 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = KSZ_INGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = KSZ_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(ksz9893_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME); -/* For xmit, 2 bytes are added before FCS. +/* For xmit, 2/6 bytes are added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)| + * FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if PTP is enabled in the Hardware) * tag0 : represents tag override, lookup and valid * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8) * - * For rcv, 1 byte is added before FCS. + * For rcv, 1/5 bytes is added before FCS. * --------------------------------------------------------------------------- - * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- + * ts : time stamp (Present only if bit 7 of tag0 is set) * tag0 : zero-based value represents port * (eg, 0x00=port1, 0x02=port3, 0x07=port8) */ @@ -232,6 +397,8 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) return NULL; + ksz_xmit_timestamp(dp, skb); + tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN); val = BIT(dp->index); @@ -244,7 +411,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, put_unaligned_be16(val, tag); - return skb; + return ksz_defer_xmit(dp, skb); } static const struct dsa_device_ops lan937x_netdev_ops = { @@ -252,7 +419,9 @@ static const struct dsa_device_ops lan937x_netdev_ops = { .proto = DSA_TAG_PROTO_LAN937X, .xmit = lan937x_xmit, .rcv = ksz9477_rcv, - .needed_tailroom = LAN937X_EGRESS_TAG_LEN, + .connect = ksz_connect, + .disconnect = ksz_disconnect, + .needed_tailroom = LAN937X_EGRESS_TAG_LEN + KSZ_PTP_TAG_LEN, }; DSA_TAG_DRIVER(lan937x_netdev_ops); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 228f13df2e18..563864c1bf5a 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -8,4 +8,4 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ - pse-pd.o + pse-pd.o plca.o diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 487bdf345541..e405b47f7eed 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -105,7 +105,10 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ - nla_total_size(sizeof(u8)); /* _USE_CQE_MODE_RX */ + nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ + nla_total_size(sizeof(u32)); /* _TX_AGGR_TIME_USECS */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -180,7 +183,13 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, kcoal->use_cqe_mode_tx, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, - kcoal->use_cqe_mode_rx, supported)) + kcoal->use_cqe_mode_rx, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, + kcoal->tx_aggr_max_bytes, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, + kcoal->tx_aggr_max_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, + kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; return 0; @@ -227,6 +236,9 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, }; int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) @@ -321,6 +333,12 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes, + tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames, + tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); + ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, + tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); ret = 0; if (!mod) goto out_ops; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 6f399afc2ff2..5fb19050991e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -208,6 +208,9 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full), __DEFINE_LINK_MODE_NAME(800000, SR8, Full), __DEFINE_LINK_MODE_NAME(800000, VR8, Full), + __DEFINE_LINK_MODE_NAME(10, T1S, Full), + __DEFINE_LINK_MODE_NAME(10, T1S, Half), + __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -244,6 +247,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_X 1 #define __LINK_MODE_LANES_FX 1 #define __LINK_MODE_LANES_T1L 1 +#define __LINK_MODE_LANES_T1S 1 +#define __LINK_MODE_LANES_T1S_P2MP 1 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 @@ -366,6 +371,9 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), + __DEFINE_LINK_MODE_PARAMS(10, T1S, Full), + __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), + __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index aee98be6237f..9f924875bba9 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -288,6 +288,8 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, + [ETHTOOL_MSG_PLCA_GET_CFG] = ðnl_plca_cfg_request_ops, + [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -603,6 +605,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, + [ETHTOOL_MSG_PLCA_NTF] = ðnl_plca_cfg_request_ops, }; /* default notification handler */ @@ -696,6 +699,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -1047,6 +1051,31 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PLCA_GET_CFG, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_plca_get_cfg_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PLCA_SET_CFG, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_plca_cfg, + .policy = ethnl_plca_set_cfg_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_plca_get_status_policy, + .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 3753787ba233..f271266f6e28 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -347,6 +347,8 @@ extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct ethnl_request_ops ethnl_rss_request_ops; +extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; +extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -388,6 +390,9 @@ extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MO extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1]; +extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; +extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; +extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -408,6 +413,7 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_plca_cfg(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c new file mode 100644 index 000000000000..be7404dc9ef2 --- /dev/null +++ b/net/ethtool/plca.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/phy.h> +#include <linux/ethtool_netlink.h> + +#include "netlink.h" +#include "common.h" + +struct plca_req_info { + struct ethnl_req_info base; +}; + +struct plca_reply_data { + struct ethnl_reply_data base; + struct phy_plca_cfg plca_cfg; + struct phy_plca_status plca_st; +}; + +// Helpers ------------------------------------------------------------------ // + +#define PLCA_REPDATA(__reply_base) \ + container_of(__reply_base, struct plca_reply_data, base) + +static void plca_update_sint(int *dst, const struct nlattr *attr, + bool *mod) +{ + if (!attr) + return; + + *dst = nla_get_u32(attr); + *mod = true; +} + +// PLCA get configuration message ------------------------------------------- // + +const struct nla_policy ethnl_plca_get_cfg_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct plca_reply_data *data = PLCA_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_phy_ops *ops; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out; + } + + // note: rtnl_lock is held already by ethnl_default_doit + ops = ethtool_phy_ops; + if (!ops || !ops->get_plca_cfg) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out; + + memset(&data->plca_cfg, 0xff, + sizeof_field(struct plca_reply_data, plca_cfg)); + + ret = ops->get_plca_cfg(dev->phydev, &data->plca_cfg); + ethnl_ops_complete(dev); + +out: + return ret; +} + +static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u16)) + /* _VERSION */ + nla_total_size(sizeof(u8)) + /* _ENABLED */ + nla_total_size(sizeof(u32)) + /* _NODE_CNT */ + nla_total_size(sizeof(u32)) + /* _NODE_ID */ + nla_total_size(sizeof(u32)) + /* _TO_TIMER */ + nla_total_size(sizeof(u32)) + /* _BURST_COUNT */ + nla_total_size(sizeof(u32)); /* _BURST_TIMER */ +} + +static int plca_get_cfg_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct plca_reply_data *data = PLCA_REPDATA(reply_base); + const struct phy_plca_cfg *plca = &data->plca_cfg; + + if ((plca->version >= 0 && + nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) || + (plca->enabled >= 0 && + nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) || + (plca->node_id >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) || + (plca->node_cnt >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) || + (plca->to_tmr >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) || + (plca->burst_cnt >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) || + (plca->burst_tmr >= 0 && + nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr))) + return -EMSGSIZE; + + return 0; +}; + +const struct ethnl_request_ops ethnl_plca_cfg_request_ops = { + .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG, + .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY, + .hdr_attr = ETHTOOL_A_PLCA_HEADER, + .req_info_size = sizeof(struct plca_req_info), + .reply_data_size = sizeof(struct plca_reply_data), + + .prepare_data = plca_get_cfg_prepare_data, + .reply_size = plca_get_cfg_reply_size, + .fill_reply = plca_get_cfg_fill_reply, +}; + +// PLCA set configuration message ------------------------------------------- // + +const struct nla_policy ethnl_plca_set_cfg_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255), + [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255), + [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255), +}; + +int ethnl_set_plca_cfg(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + const struct ethtool_phy_ops *ops; + struct phy_plca_cfg plca_cfg; + struct net_device *dev; + bool mod = false; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_PLCA_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + + rtnl_lock(); + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + + ops = ethtool_phy_ops; + if (!ops || !ops->set_plca_cfg) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + memset(&plca_cfg, 0xff, sizeof(plca_cfg)); + plca_update_sint(&plca_cfg.enabled, tb[ETHTOOL_A_PLCA_ENABLED], &mod); + plca_update_sint(&plca_cfg.node_id, tb[ETHTOOL_A_PLCA_NODE_ID], &mod); + plca_update_sint(&plca_cfg.node_cnt, tb[ETHTOOL_A_PLCA_NODE_CNT], &mod); + plca_update_sint(&plca_cfg.to_tmr, tb[ETHTOOL_A_PLCA_TO_TMR], &mod); + plca_update_sint(&plca_cfg.burst_cnt, tb[ETHTOOL_A_PLCA_BURST_CNT], + &mod); + plca_update_sint(&plca_cfg.burst_tmr, tb[ETHTOOL_A_PLCA_BURST_TMR], + &mod); + + ret = 0; + if (!mod) + goto out_ops; + + ret = ops->set_plca_cfg(dev->phydev, &plca_cfg, info->extack); + if (ret < 0) + goto out_ops; + + ethtool_notify(dev, ETHTOOL_MSG_PLCA_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + ethnl_parse_header_dev_put(&req_info); + + return ret; +} + +// PLCA get status message -------------------------------------------------- // + +const struct nla_policy ethnl_plca_get_status_policy[] = { + [ETHTOOL_A_PLCA_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct plca_reply_data *data = PLCA_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_phy_ops *ops; + int ret; + + // check that the PHY device is available and connected + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out; + } + + // note: rtnl_lock is held already by ethnl_default_doit + ops = ethtool_phy_ops; + if (!ops || !ops->get_plca_status) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out; + + memset(&data->plca_st, 0xff, + sizeof_field(struct plca_reply_data, plca_st)); + + ret = ops->get_plca_status(dev->phydev, &data->plca_st); + ethnl_ops_complete(dev); +out: + return ret; +} + +static int plca_get_status_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)); /* _STATUS */ +} + +static int plca_get_status_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct plca_reply_data *data = PLCA_REPDATA(reply_base); + const u8 status = data->plca_st.pst; + + if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status)) + return -EMSGSIZE; + + return 0; +}; + +const struct ethnl_request_ops ethnl_plca_status_request_ops = { + .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS, + .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, + .hdr_attr = ETHTOOL_A_PLCA_HEADER, + .req_info_size = sizeof(struct plca_req_info), + .reply_data_size = sizeof(struct plca_reply_data), + + .prepare_data = plca_get_status_prepare_data, + .reply_size = plca_get_status_reply_size, + .fill_reply = plca_get_status_fill_reply, +}; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f58d73888638..7a13dd7f546b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1008,17 +1008,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 index; if (port) { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL, NULL); - spin_unlock_bh(&head->lock); - return 0; - } - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ + local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index aab384126f61..f71a7e9a7de6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -259,20 +259,6 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support" - depends on IP_NF_MANGLE - depends on NF_CONNTRACK - depends on NETFILTER_ADVANCED - select NF_CONNTRACK_MARK - select NETFILTER_FAMILY_ARP - help - The CLUSTERIP target allows you to build load-balancing clusters of - network servers without having a dedicated load-balancing - router/server/switch. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 93bad1184251..5a26f9de1ab9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -39,7 +39,6 @@ obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o # targets -obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c deleted file mode 100644 index b3cc416ed292..000000000000 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ /dev/null @@ -1,929 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Cluster IP hashmark target - * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> - * based on ideas of Fabio Olive Leite <olive@unixforge.org> - * - * Development of this code funded by SuSE Linux AG, https://www.suse.com/ - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/jhash.h> -#include <linux/bitops.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <linux/if_arp.h> -#include <linux/seq_file.h> -#include <linux/refcount.h> -#include <linux/netfilter_arp.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/checksum.h> -#include <net/ip.h> - -#define CLUSTERIP_VERSION "0.8" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); - -struct clusterip_config { - struct list_head list; /* list of all configs */ - refcount_t refcount; /* reference count */ - refcount_t entries; /* number of entries/rules - * referencing us */ - - __be32 clusterip; /* the IP address */ - u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ - int ifindex; /* device ifindex */ - u_int16_t num_total_nodes; /* total number of nodes */ - unsigned long local_nodes; /* node number array */ - -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *pde; /* proc dir entry */ -#endif - enum clusterip_hashmode hash_mode; /* which hashing mode */ - u_int32_t hash_initval; /* hash initialization */ - struct rcu_head rcu; /* for call_rcu */ - struct net *net; /* netns for pernet list */ - char ifname[IFNAMSIZ]; /* device ifname */ -}; - -#ifdef CONFIG_PROC_FS -static const struct proc_ops clusterip_proc_ops; -#endif - -struct clusterip_net { - struct list_head configs; - /* lock protects the configs list */ - spinlock_t lock; - - bool clusterip_deprecated_warning; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *procdir; - /* mutex protects the config->pde*/ - struct mutex mutex; -#endif - unsigned int hook_users; -}; - -static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); - -static const struct nf_hook_ops cip_arp_ops = { - .hook = clusterip_arp_mangle, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = -1 -}; - -static unsigned int clusterip_net_id __read_mostly; -static inline struct clusterip_net *clusterip_pernet(struct net *net) -{ - return net_generic(net, clusterip_net_id); -} - -static inline void -clusterip_config_get(struct clusterip_config *c) -{ - refcount_inc(&c->refcount); -} - -static void clusterip_config_rcu_free(struct rcu_head *head) -{ - struct clusterip_config *config; - struct net_device *dev; - - config = container_of(head, struct clusterip_config, rcu); - dev = dev_get_by_name(config->net, config->ifname); - if (dev) { - dev_mc_del(dev, config->clustermac); - dev_put(dev); - } - kfree(config); -} - -static inline void -clusterip_config_put(struct clusterip_config *c) -{ - if (refcount_dec_and_test(&c->refcount)) - call_rcu(&c->rcu, clusterip_config_rcu_free); -} - -/* decrease the count of entries using/referencing this config. If last - * entry(rule) is removed, remove the config from lists, but don't free it - * yet, since proc-files could still be holding references */ -static inline void -clusterip_config_entry_put(struct clusterip_config *c) -{ - struct clusterip_net *cn = clusterip_pernet(c->net); - - local_bh_disable(); - if (refcount_dec_and_lock(&c->entries, &cn->lock)) { - list_del_rcu(&c->list); - spin_unlock(&cn->lock); - local_bh_enable(); - /* In case anyone still accesses the file, the open/close - * functions are also incrementing the refcount on their own, - * so it's safe to remove the entry even if it's in use. */ -#ifdef CONFIG_PROC_FS - mutex_lock(&cn->mutex); - if (cn->procdir) - proc_remove(c->pde); - mutex_unlock(&cn->mutex); -#endif - return; - } - local_bh_enable(); -} - -static struct clusterip_config * -__clusterip_config_find(struct net *net, __be32 clusterip) -{ - struct clusterip_config *c; - struct clusterip_net *cn = clusterip_pernet(net); - - list_for_each_entry_rcu(c, &cn->configs, list) { - if (c->clusterip == clusterip) - return c; - } - - return NULL; -} - -static inline struct clusterip_config * -clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) -{ - struct clusterip_config *c; - - rcu_read_lock_bh(); - c = __clusterip_config_find(net, clusterip); - if (c) { -#ifdef CONFIG_PROC_FS - if (!c->pde) - c = NULL; - else -#endif - if (unlikely(!refcount_inc_not_zero(&c->refcount))) - c = NULL; - else if (entry) { - if (unlikely(!refcount_inc_not_zero(&c->entries))) { - clusterip_config_put(c); - c = NULL; - } - } - } - rcu_read_unlock_bh(); - - return c; -} - -static void -clusterip_config_init_nodelist(struct clusterip_config *c, - const struct ipt_clusterip_tgt_info *i) -{ - int n; - - for (n = 0; n < i->num_local_nodes; n++) - set_bit(i->local_nodes[n] - 1, &c->local_nodes); -} - -static int -clusterip_netdev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - struct clusterip_net *cn = clusterip_pernet(net); - struct clusterip_config *c; - - spin_lock_bh(&cn->lock); - list_for_each_entry_rcu(c, &cn->configs, list) { - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } else if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - } - } - spin_unlock_bh(&cn->lock); - - return NOTIFY_DONE; -} - -static struct clusterip_config * -clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, - __be32 ip, const char *iniface) -{ - struct clusterip_net *cn = clusterip_pernet(net); - struct clusterip_config *c; - struct net_device *dev; - int err; - - if (iniface[0] == '\0') { - pr_info("Please specify an interface name\n"); - return ERR_PTR(-EINVAL); - } - - c = kzalloc(sizeof(*c), GFP_ATOMIC); - if (!c) - return ERR_PTR(-ENOMEM); - - dev = dev_get_by_name(net, iniface); - if (!dev) { - pr_info("no such interface %s\n", iniface); - kfree(c); - return ERR_PTR(-ENOENT); - } - c->ifindex = dev->ifindex; - strcpy(c->ifname, dev->name); - memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); - dev_mc_add(dev, c->clustermac); - dev_put(dev); - - c->clusterip = ip; - c->num_total_nodes = i->num_total_nodes; - clusterip_config_init_nodelist(c, i); - c->hash_mode = i->hash_mode; - c->hash_initval = i->hash_initval; - c->net = net; - refcount_set(&c->refcount, 1); - - spin_lock_bh(&cn->lock); - if (__clusterip_config_find(net, ip)) { - err = -EBUSY; - goto out_config_put; - } - - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); - -#ifdef CONFIG_PROC_FS - { - char buffer[16]; - - /* create proc dir entry */ - sprintf(buffer, "%pI4", &ip); - mutex_lock(&cn->mutex); - c->pde = proc_create_data(buffer, 0600, - cn->procdir, - &clusterip_proc_ops, c); - mutex_unlock(&cn->mutex); - if (!c->pde) { - err = -ENOMEM; - goto err; - } - } -#endif - - refcount_set(&c->entries, 1); - return c; - -#ifdef CONFIG_PROC_FS -err: -#endif - spin_lock_bh(&cn->lock); - list_del_rcu(&c->list); -out_config_put: - spin_unlock_bh(&cn->lock); - clusterip_config_put(c); - return ERR_PTR(err); -} - -#ifdef CONFIG_PROC_FS -static int -clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) -{ - - if (nodenum == 0 || - nodenum > c->num_total_nodes) - return 1; - - /* check if we already have this number in our bitfield */ - if (test_and_set_bit(nodenum - 1, &c->local_nodes)) - return 1; - - return 0; -} - -static bool -clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) -{ - if (nodenum == 0 || - nodenum > c->num_total_nodes) - return true; - - if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) - return false; - - return true; -} -#endif - -static inline u_int32_t -clusterip_hashfn(const struct sk_buff *skb, - const struct clusterip_config *config) -{ - const struct iphdr *iph = ip_hdr(skb); - unsigned long hashval; - u_int16_t sport = 0, dport = 0; - int poff; - - poff = proto_ports_offset(iph->protocol); - if (poff >= 0) { - const u_int16_t *ports; - u16 _ports[2]; - - ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); - if (ports) { - sport = ports[0]; - dport = ports[1]; - } - } else { - net_info_ratelimited("unknown protocol %u\n", iph->protocol); - } - - switch (config->hash_mode) { - case CLUSTERIP_HASHMODE_SIP: - hashval = jhash_1word(ntohl(iph->saddr), - config->hash_initval); - break; - case CLUSTERIP_HASHMODE_SIP_SPT: - hashval = jhash_2words(ntohl(iph->saddr), sport, - config->hash_initval); - break; - case CLUSTERIP_HASHMODE_SIP_SPT_DPT: - hashval = jhash_3words(ntohl(iph->saddr), sport, dport, - config->hash_initval); - break; - default: - /* to make gcc happy */ - hashval = 0; - /* This cannot happen, unless the check function wasn't called - * at rule load time */ - pr_info("unknown mode %u\n", config->hash_mode); - BUG(); - break; - } - - /* node numbers are 1..n, not 0..n */ - return reciprocal_scale(hashval, config->num_total_nodes) + 1; -} - -static inline int -clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) -{ - return test_bit(hash - 1, &config->local_nodes); -} - -/*********************************************************************** - * IPTABLES TARGET - ***********************************************************************/ - -static unsigned int -clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - u_int32_t hash; - - /* don't need to clusterip_config_get() here, since refcount - * is only decremented by destroy() - and ip_tables guarantees - * that the ->target() function isn't called after ->destroy() */ - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_DROP; - - /* special case: ICMP error handling. conntrack distinguishes between - * error messages (RELATED) and information requests (see below) */ - if (ip_hdr(skb)->protocol == IPPROTO_ICMP && - (ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)) - return XT_CONTINUE; - - /* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO, - * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here - * on, which all have an ID field [relevant for hashing]. */ - - hash = clusterip_hashfn(skb, cipinfo->config); - - switch (ctinfo) { - case IP_CT_NEW: - WRITE_ONCE(ct->mark, hash); - break; - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - /* FIXME: we don't handle expectations at the moment. - * They can arrive on a different node than - * the master connection (e.g. FTP passive mode) */ - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - default: /* Prevent gcc warnings */ - break; - } - -#ifdef DEBUG - nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -#endif - pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); - if (!clusterip_responsible(cipinfo->config, hash)) { - pr_debug("not responsible\n"); - return NF_DROP; - } - pr_debug("responsible\n"); - - /* despite being received via linklayer multicast, this is - * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */ - skb->pkt_type = PACKET_HOST; - - return XT_CONTINUE; -} - -static int clusterip_tg_check(const struct xt_tgchk_param *par) -{ - struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct clusterip_net *cn = clusterip_pernet(par->net); - const struct ipt_entry *e = par->entryinfo; - struct clusterip_config *config; - int ret, i; - - if (par->nft_compat) { - pr_err("cannot use CLUSTERIP target from nftables compat\n"); - return -EOPNOTSUPP; - } - - if (cn->hook_users == UINT_MAX) - return -EOVERFLOW; - - if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && - cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && - cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { - pr_info("unknown mode %u\n", cipinfo->hash_mode); - return -EINVAL; - - } - if (e->ip.dmsk.s_addr != htonl(0xffffffff) || - e->ip.dst.s_addr == 0) { - pr_info("Please specify destination IP\n"); - return -EINVAL; - } - if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { - pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); - return -EINVAL; - } - for (i = 0; i < cipinfo->num_local_nodes; i++) { - if (cipinfo->local_nodes[i] - 1 >= - sizeof(config->local_nodes) * 8) { - pr_info("bad local_nodes[%d] %u\n", - i, cipinfo->local_nodes[i]); - return -EINVAL; - } - } - - config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); - if (!config) { - if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { - pr_info("no config found for %pI4, need 'new'\n", - &e->ip.dst.s_addr); - return -EINVAL; - } else { - config = clusterip_config_init(par->net, cipinfo, - e->ip.dst.s_addr, - e->ip.iniface); - if (IS_ERR(config)) - return PTR_ERR(config); - } - } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { - clusterip_config_entry_put(config); - clusterip_config_put(config); - return -EINVAL; - } - - ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) { - pr_info("cannot load conntrack support for proto=%u\n", - par->family); - clusterip_config_entry_put(config); - clusterip_config_put(config); - return ret; - } - - if (cn->hook_users == 0) { - ret = nf_register_net_hook(par->net, &cip_arp_ops); - - if (ret < 0) { - clusterip_config_entry_put(config); - clusterip_config_put(config); - nf_ct_netns_put(par->net, par->family); - return ret; - } - } - - cn->hook_users++; - - if (!cn->clusterip_deprecated_warning) { - pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " - "use xt_cluster instead\n"); - cn->clusterip_deprecated_warning = true; - } - - cipinfo->config = config; - return ret; -} - -/* drop reference count of cluster config when rule is deleted */ -static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) -{ - const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; - struct clusterip_net *cn = clusterip_pernet(par->net); - - /* if no more entries are referencing the config, remove it - * from the list and destroy the proc entry */ - clusterip_config_entry_put(cipinfo->config); - - clusterip_config_put(cipinfo->config); - - nf_ct_netns_put(par->net, par->family); - cn->hook_users--; - - if (cn->hook_users == 0) - nf_unregister_net_hook(par->net, &cip_arp_ops); -} - -#ifdef CONFIG_NETFILTER_XTABLES_COMPAT -struct compat_ipt_clusterip_tgt_info -{ - u_int32_t flags; - u_int8_t clustermac[6]; - u_int16_t num_total_nodes; - u_int16_t num_local_nodes; - u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; - u_int32_t hash_mode; - u_int32_t hash_initval; - compat_uptr_t config; -}; -#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ - -static struct xt_target clusterip_tg_reg __read_mostly = { - .name = "CLUSTERIP", - .family = NFPROTO_IPV4, - .target = clusterip_tg, - .checkentry = clusterip_tg_check, - .destroy = clusterip_tg_destroy, - .targetsize = sizeof(struct ipt_clusterip_tgt_info), - .usersize = offsetof(struct ipt_clusterip_tgt_info, config), -#ifdef CONFIG_NETFILTER_XTABLES_COMPAT - .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), -#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ - .me = THIS_MODULE -}; - - -/*********************************************************************** - * ARP MANGLING CODE - ***********************************************************************/ - -/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ -struct arp_payload { - u_int8_t src_hw[ETH_ALEN]; - __be32 src_ip; - u_int8_t dst_hw[ETH_ALEN]; - __be32 dst_ip; -} __packed; - -#ifdef DEBUG -static void arp_print(struct arp_payload *payload) -{ -#define HBUFFERLEN 30 - char hbuffer[HBUFFERLEN]; - int j, k; - - for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { - hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); - hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); - hbuffer[k++] = ':'; - } - hbuffer[--k] = '\0'; - - pr_debug("src %pI4@%s, dst %pI4\n", - &payload->src_ip, hbuffer, &payload->dst_ip); -} -#endif - -static unsigned int -clusterip_arp_mangle(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct arphdr *arp = arp_hdr(skb); - struct arp_payload *payload; - struct clusterip_config *c; - struct net *net = state->net; - - /* we don't care about non-ethernet and non-ipv4 ARP */ - if (arp->ar_hrd != htons(ARPHRD_ETHER) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) - return NF_ACCEPT; - - /* we only want to mangle arp requests and replies */ - if (arp->ar_op != htons(ARPOP_REPLY) && - arp->ar_op != htons(ARPOP_REQUEST)) - return NF_ACCEPT; - - payload = (void *)(arp+1); - - /* if there is no clusterip configuration for the arp reply's - * source ip, we don't want to mangle it */ - c = clusterip_config_find_get(net, payload->src_ip, 0); - if (!c) - return NF_ACCEPT; - - /* normally the linux kernel always replies to arp queries of - * addresses on different interfacs. However, in the CLUSTERIP case - * this wouldn't work, since we didn't subscribe the mcast group on - * other interfaces */ - if (c->ifindex != state->out->ifindex) { - pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", - c->ifindex, state->out->ifindex); - clusterip_config_put(c); - return NF_ACCEPT; - } - - /* mangle reply hardware address */ - memcpy(payload->src_hw, c->clustermac, arp->ar_hln); - -#ifdef DEBUG - pr_debug("mangled arp reply: "); - arp_print(payload); -#endif - - clusterip_config_put(c); - - return NF_ACCEPT; -} - -/*********************************************************************** - * PROC DIR HANDLING - ***********************************************************************/ - -#ifdef CONFIG_PROC_FS - -struct clusterip_seq_position { - unsigned int pos; /* position */ - unsigned int weight; /* number of bits set == size */ - unsigned int bit; /* current bit */ - unsigned long val; /* current value */ -}; - -static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) -{ - struct clusterip_config *c = s->private; - unsigned int weight; - u_int32_t local_nodes; - struct clusterip_seq_position *idx; - - /* FIXME: possible race */ - local_nodes = c->local_nodes; - weight = hweight32(local_nodes); - if (*pos >= weight) - return NULL; - - idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); - if (!idx) - return ERR_PTR(-ENOMEM); - - idx->pos = *pos; - idx->weight = weight; - idx->bit = ffs(local_nodes); - idx->val = local_nodes; - clear_bit(idx->bit - 1, &idx->val); - - return idx; -} - -static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct clusterip_seq_position *idx = v; - - *pos = ++idx->pos; - if (*pos >= idx->weight) { - kfree(v); - return NULL; - } - idx->bit = ffs(idx->val); - clear_bit(idx->bit - 1, &idx->val); - return idx; -} - -static void clusterip_seq_stop(struct seq_file *s, void *v) -{ - if (!IS_ERR(v)) - kfree(v); -} - -static int clusterip_seq_show(struct seq_file *s, void *v) -{ - struct clusterip_seq_position *idx = v; - - if (idx->pos != 0) - seq_putc(s, ','); - - seq_printf(s, "%u", idx->bit); - - if (idx->pos == idx->weight - 1) - seq_putc(s, '\n'); - - return 0; -} - -static const struct seq_operations clusterip_seq_ops = { - .start = clusterip_seq_start, - .next = clusterip_seq_next, - .stop = clusterip_seq_stop, - .show = clusterip_seq_show, -}; - -static int clusterip_proc_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &clusterip_seq_ops); - - if (!ret) { - struct seq_file *sf = file->private_data; - struct clusterip_config *c = pde_data(inode); - - sf->private = c; - - clusterip_config_get(c); - } - - return ret; -} - -static int clusterip_proc_release(struct inode *inode, struct file *file) -{ - struct clusterip_config *c = pde_data(inode); - int ret; - - ret = seq_release(inode, file); - - if (!ret) - clusterip_config_put(c); - - return ret; -} - -static ssize_t clusterip_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *ofs) -{ - struct clusterip_config *c = pde_data(file_inode(file)); -#define PROC_WRITELEN 10 - char buffer[PROC_WRITELEN+1]; - unsigned long nodenum; - int rc; - - if (size > PROC_WRITELEN) - return -EIO; - if (copy_from_user(buffer, input, size)) - return -EFAULT; - buffer[size] = 0; - - if (*buffer == '+') { - rc = kstrtoul(buffer+1, 10, &nodenum); - if (rc) - return rc; - if (clusterip_add_node(c, nodenum)) - return -ENOMEM; - } else if (*buffer == '-') { - rc = kstrtoul(buffer+1, 10, &nodenum); - if (rc) - return rc; - if (clusterip_del_node(c, nodenum)) - return -ENOENT; - } else - return -EIO; - - return size; -} - -static const struct proc_ops clusterip_proc_ops = { - .proc_open = clusterip_proc_open, - .proc_read = seq_read, - .proc_write = clusterip_proc_write, - .proc_lseek = seq_lseek, - .proc_release = clusterip_proc_release, -}; - -#endif /* CONFIG_PROC_FS */ - -static int clusterip_net_init(struct net *net) -{ - struct clusterip_net *cn = clusterip_pernet(net); - - INIT_LIST_HEAD(&cn->configs); - - spin_lock_init(&cn->lock); - -#ifdef CONFIG_PROC_FS - cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); - if (!cn->procdir) { - pr_err("Unable to proc dir entry\n"); - return -ENOMEM; - } - mutex_init(&cn->mutex); -#endif /* CONFIG_PROC_FS */ - - return 0; -} - -static void clusterip_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - struct clusterip_net *cn = clusterip_pernet(net); - - mutex_lock(&cn->mutex); - proc_remove(cn->procdir); - cn->procdir = NULL; - mutex_unlock(&cn->mutex); -#endif -} - -static struct pernet_operations clusterip_net_ops = { - .init = clusterip_net_init, - .exit = clusterip_net_exit, - .id = &clusterip_net_id, - .size = sizeof(struct clusterip_net), -}; - -static struct notifier_block cip_netdev_notifier = { - .notifier_call = clusterip_netdev_event -}; - -static int __init clusterip_tg_init(void) -{ - int ret; - - ret = register_pernet_subsys(&clusterip_net_ops); - if (ret < 0) - return ret; - - ret = xt_register_target(&clusterip_tg_reg); - if (ret < 0) - goto cleanup_subsys; - - ret = register_netdevice_notifier(&cip_netdev_notifier); - if (ret < 0) - goto unregister_target; - - pr_info("ClusterIP Version %s loaded successfully\n", - CLUSTERIP_VERSION); - - return 0; - -unregister_target: - xt_unregister_target(&clusterip_tg_reg); -cleanup_subsys: - unregister_pernet_subsys(&clusterip_net_ops); - return ret; -} - -static void __exit clusterip_tg_exit(void) -{ - pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); - - unregister_netdevice_notifier(&cip_netdev_notifier); - xt_unregister_target(&clusterip_tg_reg); - unregister_pernet_subsys(&clusterip_net_ops); - - /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ - rcu_barrier(); -} - -module_init(clusterip_tg_init); -module_exit(clusterip_tg_exit); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e74e0361fd92..76889ceeead9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -91,7 +91,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -3284,23 +3284,17 @@ out: return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; - entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) - entries = dst_entries_get_slow(ops); - - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -3310,7 +3304,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, @@ -6512,7 +6505,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index ff691d9f4a04..b1c028df686e 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -13,7 +13,7 @@ #include <net/rpl.h> struct rpl_iptunnel_encap { - struct ipv6_rpl_sr_hdr srh[0]; + DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 5ded85e2c374..b30cea2fbf3f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1594,8 +1594,7 @@ mp_rst: TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); - MPTCP_INC_STATS(sock_net((const struct sock *)tp), - MPTCP_MIB_MPPRIOTX); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2ea7eae43bdb..b5505b8167f9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1143,7 +1143,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE); + MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); @@ -1903,8 +1903,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) } if (token) - return mptcp_userspace_pm_set_flags(sock_net(skb->sk), - token, &addr, &remote, bkup); + return mptcp_userspace_pm_set_flags(net, token, &addr, &remote, bkup); spin_lock_bh(&pernet->lock); entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cd6cc67c2c5..1ff2efbab29e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -923,9 +923,8 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - sock_owned_by_me(sk); + msk_owned_by_me(msk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->data_avail)) @@ -1408,7 +1407,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - sock_owned_by_me(sk); + msk_owned_by_me(msk); if (__mptcp_check_fallback(msk)) { if (!msk->first) @@ -1890,7 +1889,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) u32 time, advmss = 1; u64 rtt_us, mstamp; - sock_owned_by_me(sk); + msk_owned_by_me(msk); if (copied <= 0) return; @@ -2217,7 +2216,7 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - sock_owned_by_me((const struct sock *)msk); + msk_owned_by_me(msk); if (__mptcp_check_fallback(msk)) return NULL; @@ -2724,8 +2723,8 @@ static int mptcp_init_sock(struct sock *sk) mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); - sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } @@ -2892,6 +2891,12 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } +static void mptcp_listen_inuse_dec(struct sock *sk) +{ + if (inet_sk_state_load(sk) == TCP_LISTEN) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +} + bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; @@ -2901,6 +2906,7 @@ bool __mptcp_close(struct sock *sk, long timeout) sk->sk_shutdown = SHUTDOWN_MASK; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } @@ -3001,6 +3007,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) if (msk->fastopening) return 0; + mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3639,12 +3646,13 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct sock *sk = sock->sk; struct socket *ssock; int err; pr_debug("msk=%p", msk); - lock_sock(sock->sk); + lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); if (!ssock) { err = -EINVAL; @@ -3652,18 +3660,20 @@ static int mptcp_listen(struct socket *sock, int backlog) } mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_LISTEN); - sock_set_flag(sock->sk, SOCK_RCU_FREE); + inet_sk_state_store(sk, TCP_LISTEN); + sock_set_flag(sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - if (!err) - mptcp_copy_inaddrs(sock->sk, ssock->sk); + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + if (!err) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + mptcp_copy_inaddrs(sk, ssock->sk); + } mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); unlock: - release_sock(sock->sk); + release_sock(sk); return err; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 601469249da8..61fd8eabfca2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -755,7 +755,7 @@ static inline void mptcp_token_init_request(struct request_sock *req) int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); -int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4b1e6ec1b36..582ed93bcc8a 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -18,7 +18,7 @@ static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - sock_owned_by_me((const struct sock *)msk); + msk_owned_by_me(msk); if (likely(!__mptcp_check_fallback(msk))) return NULL; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 65430f314a68..5bb924534387 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -134,7 +134,7 @@ int mptcp_token_new_request(struct request_sock *req) /** * mptcp_token_new_connect - create new key/idsn/token for subflow - * @sk: the socket that will initiate a connection + * @ssk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. @@ -148,11 +148,12 @@ int mptcp_token_new_request(struct request_sock *req) * * returns 0 on success. */ -int mptcp_token_new_connect(struct sock *sk) +int mptcp_token_new_connect(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int retries = MPTCP_TOKEN_MAX_RETRIES; + struct sock *sk = subflow->conn; struct token_bucket *bucket; again: @@ -169,12 +170,13 @@ again: } pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); + ssk, subflow->local_key, subflow->token, subflow->idsn); WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; spin_unlock_bh(&bucket->lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } @@ -190,8 +192,10 @@ void mptcp_token_accept(struct mptcp_subflow_request_sock *req, struct mptcp_sock *msk) { struct mptcp_subflow_request_sock *pos; + struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); bucket = token_bucket(req->token); spin_lock_bh(&bucket->lock); @@ -370,12 +374,14 @@ void mptcp_token_destroy_request(struct request_sock *req) */ void mptcp_token_destroy(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; struct mptcp_sock *pos; if (sk_unhashed((struct sock *)msk)) return; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); bucket = token_bucket(msk->token); spin_lock_bh(&bucket->lock); pos = __token_lookup_msk(bucket, msk->token); diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index 5d984bec1cd8..0758865ab658 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -57,6 +57,9 @@ static struct mptcp_sock *build_msk(struct kunit *test) KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); sock_net_set((struct sock *)msk, &init_net); + + /* be sure the token helpers can dereference sk->sk_prot */ + ((struct sock *)msk)->sk_prot = &tcp_prot; return msk; } diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3754eb06fb41..ba2a6b5e93d9 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -98,6 +98,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o endif endif +ifdef CONFIG_NFT_CT +ifdef CONFIG_RETPOLINE +nf_tables-objs += nft_ct_fast.o +endif +endif + obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 496c4920505b..c00858344f02 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -514,7 +514,6 @@ EXPORT_SYMBOL_GPL(nf_ct_get_id); static void clean_from_lists(struct nf_conn *ct) { - pr_debug("clean_from_lists(%p)\n", ct); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); @@ -582,7 +581,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -603,7 +601,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } EXPORT_SYMBOL(nf_ct_destroy); @@ -786,8 +783,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - rcu_read_lock(); - h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested @@ -799,7 +794,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, smp_acquire__after_ctrl_dep(); if (likely(nf_ct_key_equal(h, tuple, zone, net))) - goto found; + return h; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); @@ -807,8 +802,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, h = NULL; } -found: - rcu_read_unlock(); return h; } @@ -820,16 +813,21 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); struct nf_conntrack_tuple_hash *thash; + rcu_read_lock(); + thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, zone_id, net)); if (thash) - return thash; + goto out_unlock; rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (rid != zone_id) - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, rid, net)); + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + +out_unlock: + rcu_read_unlock(); return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -1210,7 +1208,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } - pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking @@ -1721,10 +1718,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_zone tmp; struct nf_conntrack_net *cnet; - if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { - pr_debug("Can't invert tuple.\n"); + if (!nf_ct_invert_tuple(&repl_tuple, tuple)) return NULL; - } zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, @@ -1764,8 +1759,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { - pr_debug("expectation arrives ct=%p exp=%p\n", - ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ @@ -1829,10 +1822,8 @@ resolve_normal_ct(struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, - &tuple)) { - pr_debug("Can't get tuple\n"); + &tuple)) return 0; - } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); @@ -1864,17 +1855,15 @@ resolve_normal_ct(struct nf_conn *tmpl, if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { + unsigned long status = READ_ONCE(ct->status); + /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("normal packet for %p\n", ct); + if (likely(status & IPS_SEEN_REPLY)) ctinfo = IP_CT_ESTABLISHED; - } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("related packet for %p\n", ct); + else if (status & IPS_EXPECTED) ctinfo = IP_CT_RELATED; - } else { - pr_debug("new packet for %p\n", ct); + else ctinfo = IP_CT_NEW; - } } nf_ct_set(skb, ct, ctinfo); return 0; @@ -1988,7 +1977,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { - pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; @@ -2027,7 +2015,6 @@ repeat: if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ - pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a @@ -2066,7 +2053,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, /* Should be unconfirmed, so not in hash table yet */ WARN_ON(nf_ct_is_confirmed(ct)); - pr_debug("Altering reply tuple of %p to ", ct); nf_ct_dump_tuple(newreply); ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1286ae7d4609..90672e293e57 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3866,7 +3866,7 @@ static int __init ctnetlink_init(void) { int ret; - BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx)); + NL_ASSET_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ccef340be575..c928ff63b10e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -284,16 +284,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) /* We only do TCP and SCTP at the moment: is there a better way? */ if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + tuple.dst.protonum != IPPROTO_SCTP) return -ENOPROTOOPT; - } - if ((unsigned int)*len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); + if ((unsigned int)*len < sizeof(struct sockaddr_in)) return -EINVAL; - } h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { @@ -307,17 +302,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u3.ip; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); nf_ct_put(ct); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; else return 0; } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } @@ -360,12 +350,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + if (!h) return -ENOENT; - } ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index d88b92a8ffca..dbdfcc6cd2aa 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -168,7 +168,8 @@ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - unsigned long *map) + unsigned long *map, + const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; @@ -177,8 +178,6 @@ static int do_basic_checks(struct nf_conn *ct, flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) @@ -193,7 +192,9 @@ static int do_basic_checks(struct nf_conn *ct, sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { - pr_debug("Basic checks failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "%s failed. chunk num %d, type %d, len %d flag %d\n", + __func__, count, sch->type, sch->length, flag); return 1; } @@ -201,7 +202,6 @@ static int do_basic_checks(struct nf_conn *ct, set_bit(sch->type, map); } - pr_debug("Basic checks passed\n"); return count == 0; } @@ -211,69 +211,51 @@ static int sctp_new_state(enum ip_conntrack_dir dir, { int i; - pr_debug("Chunk type: %d\n", chunk_type); - switch (chunk_type) { case SCTP_CID_INIT: - pr_debug("SCTP_CID_INIT\n"); i = 0; break; case SCTP_CID_INIT_ACK: - pr_debug("SCTP_CID_INIT_ACK\n"); i = 1; break; case SCTP_CID_ABORT: - pr_debug("SCTP_CID_ABORT\n"); i = 2; break; case SCTP_CID_SHUTDOWN: - pr_debug("SCTP_CID_SHUTDOWN\n"); i = 3; break; case SCTP_CID_SHUTDOWN_ACK: - pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); i = 4; break; case SCTP_CID_ERROR: - pr_debug("SCTP_CID_ERROR\n"); i = 5; break; case SCTP_CID_COOKIE_ECHO: - pr_debug("SCTP_CID_COOKIE_ECHO\n"); i = 6; break; case SCTP_CID_COOKIE_ACK: - pr_debug("SCTP_CID_COOKIE_ACK\n"); i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: - pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; case SCTP_CID_HEARTBEAT: - pr_debug("SCTP_CID_HEARTBEAT"); i = 9; break; case SCTP_CID_HEARTBEAT_ACK: - pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; case SCTP_CID_DATA: case SCTP_CID_SACK: - pr_debug("SCTP_CID_DATA/SACK"); i = 11; break; default: /* Other chunks like DATA or SACK do not change the state */ - pr_debug("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); + pr_debug("Unknown chunk type %d, Will stay in %s\n", + chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } - pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", - dir, sctp_conntrack_names[cur_state], chunk_type, - sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); - return sctp_conntracks[dir][i][cur_state]; } @@ -392,7 +374,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (sh == NULL) goto out; - if (do_basic_checks(ct, skb, dataoff, map) != 0) + if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { @@ -414,7 +396,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); + nf_ct_l4proto_log_invalid(skb, ct, state, + "verification tag check failed %x vs %x for dir %d", + sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } } @@ -488,9 +472,10 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " - "conntrack=%u\n", - dir, sch->type, old_state); + nf_ct_l4proto_log_invalid(skb, ct, state, + "Invalid, old_state %d, dir %d, type %d", + old_state, dir, sch->type); + goto out_unlock; } @@ -536,7 +521,6 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && dir == IP_CT_DIR_REPLY && new_state == SCTP_CONNTRACK_ESTABLISHED) { - pr_debug("Setting assured bit\n"); set_bit(IPS_ASSURED_BIT, &ct->status); nf_conntrack_event_cache(IPCT_ASSURED, ct); } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3ac1af6f59fc..16ee5ebe1ce1 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -930,7 +930,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); - struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum nf_ct_tcp_action res; @@ -954,7 +953,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; - tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: @@ -1232,13 +1230,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; - pr_debug("tcp_conntracks: "); - nf_ct_dump_tuple(tuple); - pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", - (th->syn ? 1 : 0), (th->ack ? 1 : 0), - (th->fin ? 1 : 0), (th->rst ? 1 : 0), - old_state, new_state); - ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3b516cffc779..6b9206635b24 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -88,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, const struct nf_hook_state *state) { unsigned int *timeouts; + unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; @@ -96,26 +97,27 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); - if (!nf_ct_is_confirmed(ct)) + status = READ_ONCE(ct->status); + if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + if (status & IPS_SEEN_REPLY_BIT) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; - stream = true; + stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ - if (unlikely((ct->status & IPS_NAT_CLASH))) + if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8c09e4d12ac1..974b95dece1d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1401,6 +1401,10 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(table)) { + if (PTR_ERR(table) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } @@ -2639,6 +2643,10 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } @@ -3716,6 +3724,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } @@ -3729,6 +3741,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } @@ -4808,6 +4824,10 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(set)) { + if (PTR_ERR(set) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } @@ -6690,6 +6710,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); + if (err == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) + continue; + if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -7334,6 +7358,10 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(obj)) { + if (PTR_ERR(obj) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } @@ -7964,6 +7992,10 @@ static int nf_tables_delflowtable(struct sk_buff *skb, } if (IS_ERR(flowtable)) { + if (PTR_ERR(flowtable) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } @@ -8373,6 +8405,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, + [NFT_MSG_DESTROYTABLE] = { + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, [NFT_MSG_NEWCHAIN] = { .call = nf_tables_newchain, .type = NFNL_CB_BATCH, @@ -8391,6 +8429,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, + [NFT_MSG_DESTROYCHAIN] = { + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, [NFT_MSG_NEWRULE] = { .call = nf_tables_newrule, .type = NFNL_CB_BATCH, @@ -8415,6 +8459,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_DESTROYRULE] = { + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_NEWSET] = { .call = nf_tables_newset, .type = NFNL_CB_BATCH, @@ -8433,6 +8483,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, + [NFT_MSG_DESTROYSET] = { + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, [NFT_MSG_NEWSETELEM] = { .call = nf_tables_newsetelem, .type = NFNL_CB_BATCH, @@ -8451,6 +8507,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_DESTROYSETELEM] = { + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, .type = NFNL_CB_RCU, @@ -8473,6 +8535,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_DESTROYOBJ] = { + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, [NFT_MSG_GETOBJ_RESET] = { .call = nf_tables_getobj, .type = NFNL_CB_RCU, @@ -8497,6 +8565,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, + [NFT_MSG_DESTROYFLOWTABLE] = { + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, }; static int nf_tables_validate(struct net *net) @@ -8590,6 +8664,7 @@ static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: @@ -8597,23 +8672,29 @@ static void nft_commit_release(struct nft_trans *trans) kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: nf_tables_set_elem_destroy(&trans->ctx, nft_trans_elem_set(trans), nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else @@ -9065,8 +9146,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: list_del_rcu(&trans->ctx.table->list); - nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + nf_tables_table_notify(&trans->ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { @@ -9081,8 +9163,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nft_chain_del(trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + nf_tables_chain_notify(&trans->ctx, trans->msg_type); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, trans->ctx.chain); @@ -9098,10 +9181,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), - NFT_MSG_DELRULE); + trans->msg_type); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); @@ -9129,9 +9213,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), - NFT_MSG_DELSET, GFP_KERNEL); + trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -9143,11 +9228,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM); + trans->msg_type); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9169,9 +9255,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_DELOBJ); + trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { @@ -9193,11 +9280,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable_hooks(trans)); } else { @@ -9205,7 +9293,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); } @@ -9301,6 +9389,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; @@ -9322,6 +9411,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: trans->ctx.table->use++; nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); @@ -9336,6 +9426,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); @@ -9357,6 +9448,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); @@ -9372,6 +9464,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nft_setelem_data_activate(net, te->set, &te->elem); @@ -9391,6 +9484,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); @@ -9407,6 +9501,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 709a736c301c..6ecd0ba2e546 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,6 +21,26 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_X86) + +static struct static_key_false nf_tables_skip_direct_calls; + +static bool nf_skip_indirect_calls(void) +{ + return static_branch_likely(&nf_tables_skip_direct_calls); +} + +static void __init nf_skip_indirect_calls_enable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&nf_tables_skip_direct_calls); +} +#else +static inline bool nf_skip_indirect_calls(void) { return false; } + +static inline void nf_skip_indirect_calls_enable(void) { } +#endif + static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) @@ -193,7 +213,12 @@ static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_pktinfo *pkt) { #ifdef CONFIG_RETPOLINE - unsigned long e = (unsigned long)expr->ops->eval; + unsigned long e; + + if (nf_skip_indirect_calls()) + goto indirect_call; + + e = (unsigned long)expr->ops->eval; #define X(e, fun) \ do { if ((e) == (unsigned long)(fun)) \ return fun(expr, regs, pkt); } while (0) @@ -203,13 +228,19 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); +#if IS_ENABLED(CONFIG_NFT_CT) + X(e, nft_ct_get_fast_eval); +#endif X(e, nft_range_eval); X(e, nft_immediate_eval); X(e, nft_byteorder_eval); X(e, nft_dynset_eval); X(e, nft_rt_get_eval); X(e, nft_bitwise_eval); + X(e, nft_objref_eval); + X(e, nft_objref_map_eval); #undef X +indirect_call: #endif /* CONFIG_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } @@ -369,6 +400,8 @@ int __init nf_tables_core_module_init(void) goto err; } + nf_skip_indirect_calls_enable(); + return 0; err: diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index c68e2151defe..b9c84499438b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -12,7 +12,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> @@ -23,16 +23,6 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> -struct nft_ct { - enum nft_ct_keys key:8; - enum ip_conntrack_dir dir:8; - u8 len; - union { - u8 dreg; - u8 sreg; - }; -}; - struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; @@ -759,6 +749,18 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track, return false; } +#ifdef CONFIG_RETPOLINE +static const struct nft_expr_ops nft_ct_get_fast_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_get_fast_eval, + .init = nft_ct_get_init, + .destroy = nft_ct_get_destroy, + .dump = nft_ct_get_dump, + .reduce = nft_ct_set_reduce, +}; +#endif + static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), @@ -791,8 +793,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); - if (tb[NFTA_CT_DREG]) + if (tb[NFTA_CT_DREG]) { +#ifdef CONFIG_RETPOLINE + u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + + switch (k) { + case NFT_CT_STATE: + case NFT_CT_DIRECTION: + case NFT_CT_STATUS: + case NFT_CT_MARK: + case NFT_CT_SECMARK: + return &nft_ct_get_fast_ops; + } +#endif return &nft_ct_get_ops; + } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c new file mode 100644 index 000000000000..89983b0613fa --- /dev/null +++ b/net/netfilter/nft_ct_fast.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +#if IS_ENABLED(CONFIG_NFT_CT) +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack.h> + +void nft_ct_get_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int state; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_CT_STATE: + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + state = NF_CT_STATE_UNTRACKED_BIT; + else + state = NF_CT_STATE_INVALID_BIT; + *dest = state; + return; + case NFT_CT_DIRECTION: + nft_reg_store8(dest, CTINFO2DIR(ctinfo)); + return; + case NFT_CT_STATUS: + *dest = ct->status; + return; +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + *dest = ct->mark; + return; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + *dest = ct->secmark; + return; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} +EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval); +#endif diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7b01aa2ef653..cb37169608ba 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -13,9 +13,9 @@ #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) -static void nft_objref_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); @@ -100,9 +100,9 @@ struct nft_objref_map { struct nft_set_binding binding; }; -static void nft_objref_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5b3c0ac495be..cd09ef49df22 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1582,7 +1582,7 @@ errout: static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, - int ref) + int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -1606,7 +1606,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], nla_nest_end(skb, nest); + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1625,7 +1630,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { + 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1799,7 +1804,7 @@ tcf_reoffload_del_notify(struct net *net, struct tc_action *action) if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1) <= 0) { + if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1886,7 +1891,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 2) <= 0) { + 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; @@ -1965,7 +1970,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, - RTM_NEWACTION, 0, 0) <= 0) { + RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 668130f08903..5b4a95e8a1ee 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -488,7 +488,8 @@ static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, #endif static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast); + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack); static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, @@ -521,7 +522,7 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, */ if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, NULL); return chain; @@ -1817,7 +1818,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool terse_dump, bool rtnl_held) + bool terse_dump, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1857,7 +1859,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } + + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto nla_put_failure; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1871,7 +1879,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, bool unicast, - bool rtnl_held) + bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1883,7 +1891,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1912,7 +1920,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -1938,14 +1946,15 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event) + struct tcf_chain *chain, int event, + struct netlink_ext_ack *extack) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) - tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false, true); + tfilter_notify(net, oskb, n, tp, block, q, parent, NULL, + event, false, true, extack); } static void tfilter_put(struct tcf_proto *tp, void *fh) @@ -2156,7 +2165,7 @@ replay: flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_NEWTFILTER, false, rtnl_held); + RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); /* q pointer is NULL for shared blocks */ if (q) @@ -2284,7 +2293,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; @@ -2308,7 +2317,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, tcf_proto_put(tp, rtnl_held, NULL); tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_DELTFILTER, false, rtnl_held); + RTM_DELTFILTER, false, rtnl_held, extack); err = 0; goto errout; } @@ -2452,7 +2461,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, - fh, RTM_NEWTFILTER, true, rtnl_held); + fh, RTM_NEWTFILTER, true, rtnl_held, NULL); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } @@ -2490,7 +2499,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, a->terse_dump, true); + RTM_NEWTFILTER, a->terse_dump, true, NULL); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, @@ -2524,7 +2533,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, false, true) <= 0) + RTM_NEWTFILTER, false, true, NULL) <= 0) goto errout; cb->args[1] = 1; } @@ -2667,7 +2676,8 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct net *net, struct sk_buff *skb, struct tcf_block *block, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { unsigned char *b = skb_tail_pointer(skb); const struct tcf_proto_ops *ops; @@ -2704,7 +2714,12 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, goto nla_put_failure; } + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -2714,7 +2729,8 @@ nla_put_failure: } static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast) + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct tcf_block *block = chain->block; @@ -2728,7 +2744,7 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, portid, - seq, flags, event) <= 0) { + seq, flags, event, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2756,7 +2772,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, return -ENOBUFS; if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, - block, portid, seq, flags, RTM_DELCHAIN) <= 0) { + block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2908,11 +2924,11 @@ replay: } tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, extack); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference @@ -2922,7 +2938,7 @@ replay: break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, - n->nlmsg_flags, n->nlmsg_type, true); + n->nlmsg_flags, n->nlmsg_type, true, extack); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; @@ -3022,7 +3038,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWCHAIN); + RTM_NEWCHAIN, NULL); if (err <= 0) break; index++; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 72d2c204d5f3..c14018a8052c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -902,7 +902,8 @@ static void qdisc_offload_graft_root(struct net_device *dev, } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; @@ -970,7 +971,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -991,7 +997,8 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1002,12 +1009,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, - 0, RTM_DELQDISC) < 0) + 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, - old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } @@ -1022,10 +1029,11 @@ err_out: static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { if (new || old) - qdisc_notify(net, skb, n, clid, old, new); + qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); @@ -1105,12 +1113,12 @@ skip: qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); } else { - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } if (dev->flags & IFF_UP) @@ -1141,7 +1149,7 @@ skip: err = cops->graft(parent, cl, new, &old, extack); if (err) return err; - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } @@ -1509,7 +1517,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (err != 0) return err; } else { - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, NULL); } return 0; } @@ -1648,7 +1656,7 @@ replay: } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1715,7 +1723,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1737,7 +1745,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1810,8 +1818,8 @@ done: ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, - unsigned long cl, - u32 portid, u32 seq, u16 flags, int event) + unsigned long cl, u32 portid, u32 seq, u16 flags, + int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1846,7 +1854,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1857,7 +1870,7 @@ nla_put_failure: static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, - unsigned long cl, int event) + unsigned long cl, int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1866,7 +1879,7 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -1893,7 +1906,7 @@ static int tclass_del_notify(struct net *net, return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, - RTM_DELTCLASS) < 0) { + RTM_DELTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -2100,7 +2113,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: - err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack); goto out; default: err = -EINVAL; @@ -2118,7 +2131,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { - tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just create a new class, need to do reverse binding. */ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); @@ -2140,7 +2153,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTCLASS); + RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, diff --git a/net/socket.c b/net/socket.c index 888cd618a968..77626e4d9690 100644 --- a/net/socket.c +++ b/net/socket.c @@ -106,6 +106,7 @@ #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> +#include <trace/events/sock.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -709,12 +710,22 @@ INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); + +static noinline void call_trace_sock_send_length(struct sock *sk, int ret, + int flags) +{ + trace_sock_send_length(sk, ret, 0); +} + static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); + + if (trace_sock_send_length_enabled()) + call_trace_sock_send_length(sock->sk, ret, 0); return ret; } @@ -989,12 +1000,21 @@ INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); + +static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) +{ + trace_sock_recv_length(sk, ret, flags); +} + static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, - inet_recvmsg, sock, msg, msg_data_left(msg), - flags); + int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + inet_recvmsg, sock, msg, + msg_data_left(msg), flags); + if (trace_sock_recv_length_enabled()) + call_trace_sock_recv_length(sock->sk, ret, flags); + return ret; } /** @@ -1044,6 +1064,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, { struct socket *sock; int flags; + int ret; sock = file->private_data; @@ -1051,7 +1072,11 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ flags |= more; - return kernel_sendpage(sock, page, offset, size, flags); + ret = kernel_sendpage(sock, page, offset, size, flags); + + if (trace_sock_send_length_enabled()) + call_trace_sock_send_length(sock->sk, ret, 0); + return ret; } static ssize_t sock_splice_read(struct file *file, loff_t *ppos, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f0c2293f1d3b..009616fa0256 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -807,23 +807,23 @@ static int unix_count_nr_fds(struct sock *sk) static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; + unsigned char s_state; struct unix_sock *u; - int nr_fds; + int nr_fds = 0; if (sk) { + s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); - if (sock->type == SOCK_DGRAM) { - nr_fds = atomic_read(&u->scm_stat.nr_fds); - goto out_print; - } - unix_state_lock(sk); - if (sk->sk_state != TCP_LISTEN) + /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their + * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. + * SOCK_DGRAM is ordinary. So, no lock is needed. + */ + if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); - else + else if (s_state == TCP_LISTEN) nr_fds = unix_count_nr_fds(sk); - unix_state_unlock(sk); -out_print: + seq_printf(m, "scm_fds: %u\n", nr_fds); } } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index d593d5b6d4b1..19aea7cba26e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1861,8 +1861,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, written = transport->stream_enqueue(vsk, msg, len - total_written); } + if (written < 0) { - err = -ENOMEM; + err = written; goto out_err; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index ad64f403536a..28b5a8e8e094 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -42,8 +42,7 @@ struct virtio_vsock { bool tx_run; struct work_struct send_pkt_work; - spinlock_t send_pkt_list_lock; - struct list_head send_pkt_list; + struct sk_buff_head send_pkt_queue; atomic_t queued_replies; @@ -101,41 +100,31 @@ virtio_transport_send_pkt_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { - struct virtio_vsock_pkt *pkt; struct scatterlist hdr, buf, *sgs[2]; int ret, in_sg = 0, out_sg = 0; + struct sk_buff *skb; bool reply; - spin_lock_bh(&vsock->send_pkt_list_lock); - if (list_empty(&vsock->send_pkt_list)) { - spin_unlock_bh(&vsock->send_pkt_list_lock); + skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); + if (!skb) break; - } - - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - spin_unlock_bh(&vsock->send_pkt_list_lock); - virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_deliver_tap_pkt(skb); + reply = virtio_vsock_skb_reply(skb); - reply = pkt->reply; - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb))); sgs[out_sg++] = &hdr; - if (pkt->buf) { - sg_init_one(&buf, pkt->buf, pkt->len); + if (skb->len > 0) { + sg_init_one(&buf, skb->data, skb->len); sgs[out_sg++] = &buf; } - ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL); + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); /* Usually this means that there is no more space available in * the vq */ if (ret < 0) { - spin_lock_bh(&vsock->send_pkt_list_lock); - list_add(&pkt->list, &vsock->send_pkt_list); - spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } @@ -164,32 +153,32 @@ out: } static int -virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) +virtio_transport_send_pkt(struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; - int len = pkt->len; + int len = skb->len; + + hdr = virtio_vsock_hdr(skb); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); if (!vsock) { - virtio_transport_free_pkt(pkt); + kfree_skb(skb); len = -ENODEV; goto out_rcu; } - if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { - virtio_transport_free_pkt(pkt); + if (le64_to_cpu(hdr->dst_cid) == vsock->guest_cid) { + kfree_skb(skb); len = -ENODEV; goto out_rcu; } - if (pkt->reply) + if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); - spin_lock_bh(&vsock->send_pkt_list_lock); - list_add_tail(&pkt->list, &vsock->send_pkt_list); - spin_unlock_bh(&vsock->send_pkt_list_lock); - + virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); out_rcu: @@ -201,9 +190,7 @@ static int virtio_transport_cancel_pkt(struct vsock_sock *vsk) { struct virtio_vsock *vsock; - struct virtio_vsock_pkt *pkt, *n; int cnt = 0, ret; - LIST_HEAD(freeme); rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); @@ -212,20 +199,7 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk) goto out_rcu; } - spin_lock_bh(&vsock->send_pkt_list_lock); - list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { - if (pkt->vsk != vsk) - continue; - list_move(&pkt->list, &freeme); - } - spin_unlock_bh(&vsock->send_pkt_list_lock); - - list_for_each_entry_safe(pkt, n, &freeme, list) { - if (pkt->reply) - cnt++; - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); if (cnt) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; @@ -246,38 +220,28 @@ out_rcu: static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { - int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - struct virtio_vsock_pkt *pkt; - struct scatterlist hdr, buf, *sgs[2]; + int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; + struct scatterlist pkt, *p; struct virtqueue *vq; + struct sk_buff *skb; int ret; vq = vsock->vqs[VSOCK_VQ_RX]; do { - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) + skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); + if (!skb) break; - pkt->buf = kmalloc(buf_len, GFP_KERNEL); - if (!pkt->buf) { - virtio_transport_free_pkt(pkt); + memset(skb->head, 0, VIRTIO_VSOCK_SKB_HEADROOM); + sg_init_one(&pkt, virtio_vsock_hdr(skb), total_len); + p = &pkt; + ret = virtqueue_add_sgs(vq, &p, 0, 1, skb, GFP_KERNEL); + if (ret < 0) { + kfree_skb(skb); break; } - pkt->buf_len = buf_len; - pkt->len = buf_len; - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[0] = &hdr; - - sg_init_one(&buf, pkt->buf, buf_len); - sgs[1] = &buf; - ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); - if (ret) { - virtio_transport_free_pkt(pkt); - break; - } vsock->rx_buf_nr++; } while (vq->num_free); if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) @@ -299,12 +263,12 @@ static void virtio_transport_tx_work(struct work_struct *work) goto out; do { - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; unsigned int len; virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { + consume_skb(skb); added = true; } } while (!virtqueue_enable_cb(vq)); @@ -529,7 +493,7 @@ static void virtio_transport_rx_work(struct work_struct *work) do { virtqueue_disable_cb(vq); for (;;) { - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; unsigned int len; if (!virtio_transport_more_replies(vsock)) { @@ -540,23 +504,22 @@ static void virtio_transport_rx_work(struct work_struct *work) goto out; } - pkt = virtqueue_get_buf(vq, &len); - if (!pkt) { + skb = virtqueue_get_buf(vq, &len); + if (!skb) break; - } vsock->rx_buf_nr--; /* Drop short/long packets */ - if (unlikely(len < sizeof(pkt->hdr) || - len > sizeof(pkt->hdr) + pkt->len)) { - virtio_transport_free_pkt(pkt); + if (unlikely(len < sizeof(struct virtio_vsock_hdr) || + len > virtio_vsock_skb_len(skb))) { + kfree_skb(skb); continue; } - pkt->len = len - sizeof(pkt->hdr); - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(&virtio_transport, pkt); + virtio_vsock_skb_rx_put(skb); + virtio_transport_deliver_tap_pkt(skb); + virtio_transport_recv_pkt(&virtio_transport, skb); } } while (!virtqueue_enable_cb(vq)); @@ -610,7 +573,7 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) { struct virtio_device *vdev = vsock->vdev; - struct virtio_vsock_pkt *pkt; + struct sk_buff *skb; /* Reset all connected sockets when the VQs disappear */ vsock_for_each_connected_socket(&virtio_transport.transport, @@ -637,23 +600,16 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) virtio_reset_device(vdev); mutex_lock(&vsock->rx_lock); - while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) + kfree_skb(skb); mutex_unlock(&vsock->rx_lock); mutex_lock(&vsock->tx_lock); - while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) - virtio_transport_free_pkt(pkt); + while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) + kfree_skb(skb); mutex_unlock(&vsock->tx_lock); - spin_lock_bh(&vsock->send_pkt_list_lock); - while (!list_empty(&vsock->send_pkt_list)) { - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); @@ -690,8 +646,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); mutex_init(&vsock->event_lock); - spin_lock_init(&vsock->send_pkt_list_lock); - INIT_LIST_HEAD(&vsock->send_pkt_list); + skb_queue_head_init(&vsock->send_pkt_queue); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a9980e9b9304..a1581c77cf84 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -37,53 +37,56 @@ virtio_transport_get_ops(struct vsock_sock *vsk) return container_of(t, struct virtio_transport, transport); } -static struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, +/* Returns a new packet on success, otherwise returns NULL. + * + * If NULL is returned, errp is set to a negative errno. + */ +static struct sk_buff * +virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, size_t len, u32 src_cid, u32 src_port, u32 dst_cid, u32 dst_port) { - struct virtio_vsock_pkt *pkt; + const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len; + struct virtio_vsock_hdr *hdr; + struct sk_buff *skb; + void *payload; int err; - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) + skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); + if (!skb) return NULL; - pkt->hdr.type = cpu_to_le16(info->type); - pkt->hdr.op = cpu_to_le16(info->op); - pkt->hdr.src_cid = cpu_to_le64(src_cid); - pkt->hdr.dst_cid = cpu_to_le64(dst_cid); - pkt->hdr.src_port = cpu_to_le32(src_port); - pkt->hdr.dst_port = cpu_to_le32(dst_port); - pkt->hdr.flags = cpu_to_le32(info->flags); - pkt->len = len; - pkt->hdr.len = cpu_to_le32(len); - pkt->reply = info->reply; - pkt->vsk = info->vsk; + hdr = virtio_vsock_hdr(skb); + hdr->type = cpu_to_le16(info->type); + hdr->op = cpu_to_le16(info->op); + hdr->src_cid = cpu_to_le64(src_cid); + hdr->dst_cid = cpu_to_le64(dst_cid); + hdr->src_port = cpu_to_le32(src_port); + hdr->dst_port = cpu_to_le32(dst_port); + hdr->flags = cpu_to_le32(info->flags); + hdr->len = cpu_to_le32(len); if (info->msg && len > 0) { - pkt->buf = kmalloc(len, GFP_KERNEL); - if (!pkt->buf) - goto out_pkt; - - pkt->buf_len = len; - - err = memcpy_from_msg(pkt->buf, info->msg, len); + payload = skb_put(skb, len); + err = memcpy_from_msg(payload, info->msg, len); if (err) goto out; if (msg_data_left(info->msg) == 0 && info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); if (info->msg->msg_flags & MSG_EOR) - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); } } + if (info->reply) + virtio_vsock_skb_set_reply(skb); + trace_virtio_transport_alloc_pkt(src_cid, src_port, dst_cid, dst_port, len, @@ -91,19 +94,18 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, info->op, info->flags); - return pkt; + return skb; out: - kfree(pkt->buf); -out_pkt: - kfree(pkt); + kfree_skb(skb); return NULL; } /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { - struct virtio_vsock_pkt *pkt = opaque; + struct virtio_vsock_hdr *pkt_hdr; + struct sk_buff *pkt = opaque; struct af_vsockmon_hdr *hdr; struct sk_buff *skb; size_t payload_len; @@ -113,10 +115,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) * the payload length from the header and the buffer pointer taking * care of the offset in the original packet. */ - payload_len = le32_to_cpu(pkt->hdr.len); - payload_buf = pkt->buf + pkt->off; + pkt_hdr = virtio_vsock_hdr(pkt); + payload_len = pkt->len; + payload_buf = pkt->data; - skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len, + skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); if (!skb) return NULL; @@ -124,16 +127,16 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) hdr = skb_put(skb, sizeof(*hdr)); /* pkt->hdr is little-endian so no need to byteswap here */ - hdr->src_cid = pkt->hdr.src_cid; - hdr->src_port = pkt->hdr.src_port; - hdr->dst_cid = pkt->hdr.dst_cid; - hdr->dst_port = pkt->hdr.dst_port; + hdr->src_cid = pkt_hdr->src_cid; + hdr->src_port = pkt_hdr->src_port; + hdr->dst_cid = pkt_hdr->dst_cid; + hdr->dst_port = pkt_hdr->dst_port; hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); - hdr->len = cpu_to_le16(sizeof(pkt->hdr)); + hdr->len = cpu_to_le16(sizeof(*pkt_hdr)); memset(hdr->reserved, 0, sizeof(hdr->reserved)); - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(pkt_hdr->op)) { case VIRTIO_VSOCK_OP_REQUEST: case VIRTIO_VSOCK_OP_RESPONSE: hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); @@ -154,7 +157,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) break; } - skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); + skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { skb_put_data(skb, payload_buf, payload_len); @@ -163,13 +166,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) return skb; } -void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) +void virtio_transport_deliver_tap_pkt(struct sk_buff *skb) { - if (pkt->tap_delivered) + if (virtio_vsock_skb_tap_delivered(skb)) return; - vsock_deliver_tap(virtio_transport_build_skb, pkt); - pkt->tap_delivered = true; + vsock_deliver_tap(virtio_transport_build_skb, skb); + virtio_vsock_skb_set_tap_delivered(skb); } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); @@ -192,8 +195,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; - struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; + struct sk_buff *skb; info->type = virtio_transport_get_type(sk_vsock(vsk)); @@ -224,42 +227,47 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) return pkt_len; - pkt = virtio_transport_alloc_pkt(info, pkt_len, + skb = virtio_transport_alloc_skb(info, pkt_len, src_cid, src_port, dst_cid, dst_port); - if (!pkt) { + if (!skb) { virtio_transport_put_credit(vvs, pkt_len); return -ENOMEM; } - virtio_transport_inc_tx_pkt(vvs, pkt); + virtio_transport_inc_tx_pkt(vvs, skb); - return t_ops->send_pkt(pkt); + return t_ops->send_pkt(skb); } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) + if (vvs->rx_bytes + skb->len > vvs->buf_alloc) return false; - vvs->rx_bytes += pkt->len; + vvs->rx_bytes += skb->len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - vvs->rx_bytes -= pkt->len; - vvs->fwd_cnt += pkt->len; + int len; + + len = skb_headroom(skb) - sizeof(struct virtio_vsock_hdr) - skb->len; + vvs->rx_bytes -= len; + vvs->fwd_cnt += len; } -void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + spin_lock_bh(&vvs->rx_lock); vvs->last_fwd_cnt = vvs->fwd_cnt; - pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); - pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc); spin_unlock_bh(&vvs->rx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); @@ -303,29 +311,29 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; size_t bytes, total = 0, off; + struct sk_buff *skb, *tmp; int err = -EFAULT; spin_lock_bh(&vvs->rx_lock); - list_for_each_entry(pkt, &vvs->rx_queue, list) { - off = pkt->off; + skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { + off = 0; if (total == len) break; - while (total < len && off < pkt->len) { + while (total < len && off < skb->len) { bytes = len - total; - if (bytes > pkt->len - off) - bytes = pkt->len - off; + if (bytes > skb->len - off) + bytes = skb->len - off; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since memcpy_to_msg() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf + off, bytes); + err = memcpy_to_msg(msg, skb->data + off, bytes); if (err) goto out; @@ -352,37 +360,38 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; size_t bytes, total = 0; - u32 free_space; + struct sk_buff *skb; int err = -EFAULT; + u32 free_space; spin_lock_bh(&vvs->rx_lock); - while (total < len && !list_empty(&vvs->rx_queue)) { - pkt = list_first_entry(&vvs->rx_queue, - struct virtio_vsock_pkt, list); + while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + skb = __skb_dequeue(&vvs->rx_queue); bytes = len - total; - if (bytes > pkt->len - pkt->off) - bytes = pkt->len - pkt->off; + if (bytes > skb->len) + bytes = skb->len; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since memcpy_to_msg() may sleep. */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + err = memcpy_to_msg(msg, skb->data, bytes); if (err) goto out; spin_lock_bh(&vvs->rx_lock); total += bytes; - pkt->off += bytes; - if (pkt->off == pkt->len) { - virtio_transport_dec_rx_pkt(vvs, pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); + skb_pull(skb, bytes); + + if (skb->len == 0) { + virtio_transport_dec_rx_pkt(vvs, skb); + consume_skb(skb); + } else { + __skb_queue_head(&vvs->rx_queue, skb); } } @@ -414,10 +423,10 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, int flags) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt; int dequeued_len = 0; size_t user_buf_len = msg_data_left(msg); bool msg_ready = false; + struct sk_buff *skb; spin_lock_bh(&vvs->rx_lock); @@ -427,13 +436,18 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, } while (!msg_ready) { - pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); + struct virtio_vsock_hdr *hdr; + + skb = __skb_dequeue(&vvs->rx_queue); + if (!skb) + break; + hdr = virtio_vsock_hdr(skb); if (dequeued_len >= 0) { size_t pkt_len; size_t bytes_to_copy; - pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); + pkt_len = (size_t)le32_to_cpu(hdr->len); bytes_to_copy = min(user_buf_len, pkt_len); if (bytes_to_copy) { @@ -444,7 +458,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, */ spin_unlock_bh(&vvs->rx_lock); - err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); + err = memcpy_to_msg(msg, skb->data, bytes_to_copy); if (err) { /* Copy of message failed. Rest of * fragments will be freed without copy. @@ -452,6 +466,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len = err; } else { user_buf_len -= bytes_to_copy; + skb_pull(skb, bytes_to_copy); } spin_lock_bh(&vvs->rx_lock); @@ -461,17 +476,16 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len += pkt_len; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { msg_ready = true; vvs->msg_count--; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); + virtio_transport_dec_rx_pkt(vvs, skb); + kfree_skb(skb); } spin_unlock_bh(&vvs->rx_lock); @@ -609,7 +623,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); - INIT_LIST_HEAD(&vvs->rx_queue); + skb_queue_head_init(&vvs->rx_queue); return 0; } @@ -806,16 +820,16 @@ void virtio_transport_destruct(struct vsock_sock *vsk) EXPORT_SYMBOL_GPL(virtio_transport_destruct); static int virtio_transport_reset(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .reply = !!pkt, + .reply = !!skb, .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ - if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (skb && le16_to_cpu(virtio_vsock_hdr(skb)->op) == VIRTIO_VSOCK_OP_RST) return 0; return virtio_transport_send_pkt_info(vsk, &info); @@ -825,29 +839,30 @@ static int virtio_transport_reset(struct vsock_sock *vsk, * attempt was made to connect to a socket that does not exist. */ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { - struct virtio_vsock_pkt *reply; + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .type = le16_to_cpu(pkt->hdr.type), + .type = le16_to_cpu(hdr->type), .reply = true, }; + struct sk_buff *reply; /* Send RST only if the original pkt is not a RST pkt */ - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) return 0; - reply = virtio_transport_alloc_pkt(&info, 0, - le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port), - le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); + reply = virtio_transport_alloc_skb(&info, 0, + le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port), + le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); if (!reply) return -ENOMEM; if (!t) { - virtio_transport_free_pkt(reply); + kfree_skb(reply); return -ENOTCONN; } @@ -858,16 +873,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, static void virtio_transport_remove_sock(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; - struct virtio_vsock_pkt *pkt, *tmp; /* We don't need to take rx_lock, as the socket is closing and we are * removing it. */ - list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - + __skb_queue_purge(&vvs->rx_queue); vsock_remove_sock(vsk); } @@ -981,13 +991,14 @@ EXPORT_SYMBOL_GPL(virtio_transport_release); static int virtio_transport_recv_connecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); - int err; int skerr; + int err; - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RESPONSE: sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; @@ -1008,7 +1019,7 @@ virtio_transport_recv_connecting(struct sock *sk, return 0; destroy: - virtio_transport_reset(vsk, pkt); + virtio_transport_reset(vsk, skb); sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk_error_report(sk); @@ -1017,34 +1028,37 @@ destroy: static void virtio_transport_recv_enqueue(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { struct virtio_vsock_sock *vvs = vsk->trans; bool can_enqueue, free_pkt = false; + struct virtio_vsock_hdr *hdr; + u32 len; - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; + hdr = virtio_vsock_hdr(skb); + len = le32_to_cpu(hdr->len); spin_lock_bh(&vvs->rx_lock); - can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); + can_enqueue = virtio_transport_inc_rx_pkt(vvs, skb); if (!can_enqueue) { free_pkt = true; goto out; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. */ - if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) { - struct virtio_vsock_pkt *last_pkt; + if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) { + struct virtio_vsock_hdr *last_hdr; + struct sk_buff *last_skb; - last_pkt = list_last_entry(&vvs->rx_queue, - struct virtio_vsock_pkt, list); + last_skb = skb_peek_tail(&vvs->rx_queue); + last_hdr = virtio_vsock_hdr(last_skb); /* If there is space in the last packet queued, we copy the * new packet in its buffer. We avoid this if the last packet @@ -1052,35 +1066,35 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, * delimiter of SEQPACKET message, so 'pkt' is the first packet * of a new message. */ - if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && - !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { - memcpy(last_pkt->buf + last_pkt->len, pkt->buf, - pkt->len); - last_pkt->len += pkt->len; + if (skb->len < skb_tailroom(last_skb) && + !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) { + memcpy(skb_put(last_skb, skb->len), skb->data, skb->len); free_pkt = true; - last_pkt->hdr.flags |= pkt->hdr.flags; + last_hdr->flags |= hdr->flags; + last_hdr->len = cpu_to_le32(last_skb->len); goto out; } } - list_add_tail(&pkt->list, &vvs->rx_queue); + __skb_queue_tail(&vvs->rx_queue, skb); out: spin_unlock_bh(&vvs->rx_lock); if (free_pkt) - virtio_transport_free_pkt(pkt); + kfree_skb(skb); } static int virtio_transport_recv_connected(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); int err = 0; - switch (le16_to_cpu(pkt->hdr.op)) { + switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_enqueue(vsk, pkt); + virtio_transport_recv_enqueue(vsk, skb); vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: @@ -1090,18 +1104,17 @@ virtio_transport_recv_connected(struct sock *sk, sk->sk_write_space(sk); break; case VIRTIO_VSOCK_OP_SHUTDOWN: - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) vsk->peer_shutdown |= RCV_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); - virtio_transport_do_close(vsk, true); } - if (le32_to_cpu(pkt->hdr.flags)) + if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); break; case VIRTIO_VSOCK_OP_RST: @@ -1112,28 +1125,30 @@ virtio_transport_recv_connected(struct sock *sk, break; } - virtio_transport_free_pkt(pkt); + kfree_skb(skb); return err; } static void virtio_transport_recv_disconnecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) virtio_transport_do_close(vsk, true); } static int virtio_transport_send_response(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, - .remote_cid = le64_to_cpu(pkt->hdr.src_cid), - .remote_port = le32_to_cpu(pkt->hdr.src_port), + .remote_cid = le64_to_cpu(hdr->src_cid), + .remote_port = le32_to_cpu(hdr->src_port), .reply = true, .vsk = vsk, }; @@ -1142,8 +1157,9 @@ virtio_transport_send_response(struct vsock_sock *vsk, } static bool virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct virtio_vsock_sock *vvs = vsk->trans; bool space_available; @@ -1158,8 +1174,8 @@ static bool virtio_transport_space_update(struct sock *sk, /* buf_alloc and fwd_cnt is always included in the hdr */ spin_lock_bh(&vvs->tx_lock); - vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); space_available = virtio_transport_has_space(vsk); spin_unlock_bh(&vvs->tx_lock); return space_available; @@ -1167,27 +1183,28 @@ static bool virtio_transport_space_update(struct sock *sk, /* Handle server socket */ static int -virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, +virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, struct virtio_transport *t) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; int ret; - if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { - virtio_transport_reset_no_sock(t, pkt); + if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset_no_sock(t, skb); return -EINVAL; } if (sk_acceptq_is_full(sk)) { - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } child = vsock_create_connected(sk); if (!child) { - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } @@ -1198,10 +1215,10 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); - vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); - vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&vchild->local_addr, le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port)); + vsock_addr_init(&vchild->remote_addr, le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); ret = vsock_assign_transport(vchild, vsk); /* Transport assigned (looking at remote_addr) must be the same @@ -1209,17 +1226,17 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, */ if (ret || vchild->transport != &t->transport) { release_sock(child); - virtio_transport_reset_no_sock(t, pkt); + virtio_transport_reset_no_sock(t, skb); sock_put(child); return ret; } - if (virtio_transport_space_update(child, pkt)) + if (virtio_transport_space_update(child, skb)) child->sk_write_space(child); vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); - virtio_transport_send_response(vchild, pkt); + virtio_transport_send_response(vchild, skb); release_sock(child); @@ -1237,29 +1254,30 @@ static bool virtio_transport_valid_type(u16 type) * lock. */ void virtio_transport_recv_pkt(struct virtio_transport *t, - struct virtio_vsock_pkt *pkt) + struct sk_buff *skb) { + struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct sockaddr_vm src, dst; struct vsock_sock *vsk; struct sock *sk; bool space_available; - vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&src, le64_to_cpu(hdr->src_cid), + le32_to_cpu(hdr->src_port)); + vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid), + le32_to_cpu(hdr->dst_port)); trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, dst.svm_cid, dst.svm_port, - le32_to_cpu(pkt->hdr.len), - le16_to_cpu(pkt->hdr.type), - le16_to_cpu(pkt->hdr.op), - le32_to_cpu(pkt->hdr.flags), - le32_to_cpu(pkt->hdr.buf_alloc), - le32_to_cpu(pkt->hdr.fwd_cnt)); - - if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { - (void)virtio_transport_reset_no_sock(t, pkt); + le32_to_cpu(hdr->len), + le16_to_cpu(hdr->type), + le16_to_cpu(hdr->op), + le32_to_cpu(hdr->flags), + le32_to_cpu(hdr->buf_alloc), + le32_to_cpu(hdr->fwd_cnt)); + + if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { + (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } @@ -1270,13 +1288,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { - (void)virtio_transport_reset_no_sock(t, pkt); + (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } } - if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { - (void)virtio_transport_reset_no_sock(t, pkt); + if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { + (void)virtio_transport_reset_no_sock(t, skb); sock_put(sk); goto free_pkt; } @@ -1287,13 +1305,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, /* Check if sk has been closed before lock_sock */ if (sock_flag(sk, SOCK_DONE)) { - (void)virtio_transport_reset_no_sock(t, pkt); + (void)virtio_transport_reset_no_sock(t, skb); release_sock(sk); sock_put(sk); goto free_pkt; } - space_available = virtio_transport_space_update(sk, pkt); + space_available = virtio_transport_space_update(sk, skb); /* Update CID in case it has changed after a transport reset event */ if (vsk->local_addr.svm_cid != VMADDR_CID_ANY) @@ -1304,23 +1322,23 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, switch (sk->sk_state) { case TCP_LISTEN: - virtio_transport_recv_listen(sk, pkt, t); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_listen(sk, skb, t); + kfree_skb(skb); break; case TCP_SYN_SENT: - virtio_transport_recv_connecting(sk, pkt); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_connecting(sk, skb); + kfree_skb(skb); break; case TCP_ESTABLISHED: - virtio_transport_recv_connected(sk, pkt); + virtio_transport_recv_connected(sk, skb); break; case TCP_CLOSING: - virtio_transport_recv_disconnecting(sk, pkt); - virtio_transport_free_pkt(pkt); + virtio_transport_recv_disconnecting(sk, skb); + kfree_skb(skb); break; default: - (void)virtio_transport_reset_no_sock(t, pkt); - virtio_transport_free_pkt(pkt); + (void)virtio_transport_reset_no_sock(t, skb); + kfree_skb(skb); break; } @@ -1333,16 +1351,42 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, return; free_pkt: - virtio_transport_free_pkt(pkt); + kfree_skb(skb); } EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +/* Remove skbs found in a queue that have a vsk that matches. + * + * Each skb is freed. + * + * Returns the count of skbs that were reply packets. + */ +int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue) { - kvfree(pkt->buf); - kfree(pkt); + struct sk_buff_head freeme; + struct sk_buff *skb, *tmp; + int cnt = 0; + + skb_queue_head_init(&freeme); + + spin_lock_bh(&queue->lock); + skb_queue_walk_safe(queue, skb, tmp) { + if (vsock_sk(skb->sk) != vsk) + continue; + + __skb_unlink(skb, queue); + __skb_queue_tail(&freeme, skb); + + if (virtio_vsock_skb_reply(skb)) + cnt++; + } + spin_unlock_bh(&queue->lock); + + __skb_queue_purge(&freeme); + + return cnt; } -EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); +EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 169a8cf65b39..671e03240fc5 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -16,7 +16,7 @@ struct vsock_loopback { struct workqueue_struct *workqueue; spinlock_t pkt_list_lock; /* protects pkt_list */ - struct list_head pkt_list; + struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; @@ -27,13 +27,13 @@ static u32 vsock_loopback_get_local_cid(void) return VMADDR_CID_LOCAL; } -static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) +static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; - int len = pkt->len; + int len = skb->len; spin_lock_bh(&vsock->pkt_list_lock); - list_add_tail(&pkt->list, &vsock->pkt_list); + skb_queue_tail(&vsock->pkt_queue, skb); spin_unlock_bh(&vsock->pkt_list_lock); queue_work(vsock->workqueue, &vsock->pkt_work); @@ -44,21 +44,8 @@ static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; - struct virtio_vsock_pkt *pkt, *n; - LIST_HEAD(freeme); - spin_lock_bh(&vsock->pkt_list_lock); - list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { - if (pkt->vsk != vsk) - continue; - list_move(&pkt->list, &freeme); - } - spin_unlock_bh(&vsock->pkt_list_lock); - - list_for_each_entry_safe(pkt, n, &freeme, list) { - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } @@ -121,20 +108,18 @@ static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); - LIST_HEAD(pkts); + struct sk_buff_head pkts; + struct sk_buff *skb; + + skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_list_lock); - list_splice_init(&vsock->pkt_list, &pkts); + skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_list_lock); - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_deliver_tap_pkt(pkt); - virtio_transport_recv_pkt(&loopback_transport, pkt); + while ((skb = __skb_dequeue(&pkts))) { + virtio_transport_deliver_tap_pkt(skb); + virtio_transport_recv_pkt(&loopback_transport, skb); } } @@ -148,7 +133,7 @@ static int __init vsock_loopback_init(void) return -ENOMEM; spin_lock_init(&vsock->pkt_list_lock); - INIT_LIST_HEAD(&vsock->pkt_list); + skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, @@ -166,19 +151,13 @@ out_wq: static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; - struct virtio_vsock_pkt *pkt; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); spin_lock_bh(&vsock->pkt_list_lock); - while (!list_empty(&vsock->pkt_list)) { - pkt = list_first_entry(&vsock->pkt_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } + virtio_vsock_skb_queue_purge(&vsock->pkt_queue); spin_unlock_bh(&vsock->pkt_list_lock); destroy_workqueue(vsock->workqueue); |